Beispiel #1
0
    def run_job(self, input_dict):
        """Build the EMR configuration from *input_dict* and launch the job.

        Returns the raw response dict from ``run_flow``; a non-200 HTTP
        status code is logged as an error but not raised.
        """
        self.construct_job(input_dict)
        job_name = '{}_{}_training_{}'.format(self.env, self.ecosystem,
                                              self.current_time)

        bootstrap_script = 's3://{bucket}/bootstrap.sh'.format(
            bucket=self.bucket_name)

        log_bucket = '{}-automated-analytics-spark-jobs'.format(self.env)
        log_uri = 's3://{bucket}/{log_file}'.format(
            bucket=log_bucket,
            log_file='{}.log'.format(job_name))

        emr_config = EMRConfig(name=job_name,
                               s3_bootstrap_uri=bootstrap_script,
                               training_repo_url=self.training_repo_url,
                               log_uri=log_uri,
                               ecosystem=self.ecosystem,
                               properties=self.properties,
                               hyper_params=self.hyper_params)

        response = self.aws_emr.run_flow(emr_config.get_config())
        logger.info("EMR job is running {}".format(response))
        http_code = response.get('ResponseMetadata', {}).get('HTTPStatusCode')
        if http_code != 200:
            logger.error(
                "EMR Job Failed with the status code {}".format(http_code),
                extra={"status": response})
        return response
Beispiel #2
0
 def from_github(self, package, url_df, api_url, api_token):
     """Fetch keyword/topic data for *package* via the Github GraphQL API.

     Returns a (possibly empty) list of keywords; failures are logged
     and yield the empty list.
     """
     keywords = []
     repo_url = self.utility.get_url(url_df, package)
     if type(repo_url) == str:
         query_params = self.utility.get_query_params(repo_url)
         logger.info("Query Parameters are: {}, {}".format(
             query_params[0], query_params[1]))
         headers = {'Authorization': 'token %s' % api_token}
         json = {
             'query':
             '{{organization(login: "******"){{name url repository(name: "{1}")\
             {{name url description repositoryTopics(first: 10){{nodes{{topic {{name}}}}}}}}}}}}'
             .format(str(query_params[0]), str(query_params[1]))
         }
         try:
             api_response = requests.post(url=api_url, json=json,
                                          headers=headers)
             keywords = list(self.clean_response(api_response.json()))
         except Exception:
             logger.error(
                 "Either Github token is not present or response is not coming."
             )
     return keywords
    def __init__(self, query_job_config=None, credential_path=None):
        """Initialize the BigqueryBuilder object.

        Credentials may arrive either as a filesystem path or as the raw
        JSON content itself (via the GOOGLE_APPLICATION_CREDENTIALS env
        var or ``credential_path``).  Raw JSON is written to a temp file
        so the bigquery client can consume it as a path.

        :param query_job_config: optional bigquery.job.QueryJobConfig;
            a default config is created when not provided.
        :param credential_path: fallback credentials (path or raw JSON)
            used when GOOGLE_APPLICATION_CREDENTIALS is not set.
        :raises ValueError: when no usable credentials are found.
        """
        self.original_credential_path = os.getenv('GOOGLE_APPLICATION_CREDENTIALS') \
                               or credential_path

        # If the value parses as JSON it is raw credential content, not a
        # filesystem path.
        try:
            json.loads(self.original_credential_path)
            json_credentials = True
        except Exception as e:
            logger.error("Not JSON credentials, reverting to local env JSON file: {}".format(e))
            json_credentials = False

        if json_credentials:
            # delete=False: the file must outlive this handle because the
            # bigquery client reads it later by path.  The context manager
            # closes the handle — the original leaked the file descriptor
            # (and its flush()/seek(0) calls were redundant).
            with tempfile.NamedTemporaryFile(mode='w+', delete=False) as tfile:
                tfile.write(self.original_credential_path)
                self.new_credential_path = tfile.name
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.new_credential_path
        else:
            self.new_credential_path = self.original_credential_path

        if isinstance(query_job_config, bigquery.job.QueryJobConfig):
            self.query_job_config = query_job_config
        else:
            self.query_job_config = bigquery.job.QueryJobConfig()

        self.client = None

        if self.new_credential_path:
            self.client = bigquery.Client(
                default_query_job_config=self.query_job_config)
        else:
            # Fixed duplicated word ("the the") in the original message.
            raise ValueError("Please provide the valid credential_path")
Beispiel #4
0
 def list_bucket_objects(self):
     """Return an iterable over every object stored in the bucket."""
     try:
         bucket = self._s3.Bucket(self.bucket_name)
         return bucket.objects.all()
     except Exception as exc:
         logger.error(
             "An Exception occurred while listing objects in bucket\n {}".format(str(exc)))
 def get_status(self, cluster_id):
     """Return the Status dict of the EMR cluster, or None on failure."""
     try:
         description = self._emr.describe_cluster(ClusterId=cluster_id)
     except ClientError:
         logger.error("Unable to get the cluster info",
                      extra={"cluster_id": cluster_id})
     else:
         return description.get('Cluster', {}).get('Status')
 def upload_file(self, src, target):
     """Copy the local file *src* to key *target* in the S3 bucket."""
     try:
         bucket = self._s3.Bucket(self.bucket_name)
         return bucket.upload_file(src, target)
     except Exception as exc:
         logger.error(
             "An Exception occurred while uploading a file \n{}".format(str(exc)))
 def list_bucket_keys(self):
     """Return the key of every object currently in the bucket."""
     try:
         return [obj.key for obj in self.list_bucket_objects()]
     except Exception as exc:
         logger.error(
             "An Exception occurred while listing bucket keys\n {}".format(str(exc)))
 def read_generic_file(self, filename):
     """Return the raw body bytes of the named object in the bucket."""
     try:
         s3_object = self._s3.Object(self.bucket_name, filename)
         return s3_object.get()['Body'].read()
     except Exception as exc:
         logger.error(
             "An Exception occurred while retrieving an object\n {}".format(str(exc)))
def load_hyper_params():
    """Parse the hyper-parameter JSON from the first CLI argument.

    Returns the decoded object, or None when no argument is present
    or it fails to decode (logged).
    """
    params = argv[1:] if len(argv) > 1 else None
    if params:
        try:
            return loads(params[0])
        except Exception:
            logger.error("Unable to decode the hyper params")
def check_field_exists(input_data, fields):
    """Return the list of *fields* absent from *input_data*.

    For dict input, additionally logs an error for each field whose
    value is falsy.  Raises ValueError for unsupported input types.
    """
    if isinstance(input_data, dict):
        for field in fields:
            if not input_data.get(field):
                logger.error(
                    "Please provide the valid value for the field {}".format(field))
    if not isinstance(input_data, (list, dict, set, frozenset)):
        raise ValueError
    return list(set(fields).difference(set(input_data)))
Beispiel #11
0
 def from_existing_df(self, df_, package):
     """Look up *package* in an existing dump dataframe.

     Returns the first matching row's name/description/keywords/
     dependencies; falls back to self.df_ when the dump is empty.
     """
     if df_.empty:
         logger.error("Node Package details Dataframe is not existed.")
         return self.df_
     wanted_columns = ['name', 'description', 'keywords', 'dependencies']
     matching = df_.loc[df_['name'] == str(package), wanted_columns]
     return matching.iloc[0]
Beispiel #12
0
 def read_pickle_file(self, filename):
     """Deserialize a pickled object fetched from the S3 bucket."""
     try:
         return pickle.loads(self.read_generic_file(filename))
     except ValueError:
         logger.error("Not a valid pickle file provided.")
     except Exception as exc:
         logger.error(
             "An Exception occurred while retrieving a pickle file \n{}".format(str(exc)))
Beispiel #13
0
 def s3_delete_object(self, object_key):
     """Delete the single object identified by *object_key* from the bucket."""
     delete_spec = {"Objects": [{'Key': object_key}]}
     try:
         return self._s3.Bucket(self.bucket_name).delete_objects(
             Delete=delete_spec)
     except Exception as exc:
         logger.error(
             "An Exception occurred while deleting object\n {}".format(str(exc)))
Beispiel #14
0
 def s3_clean_bucket(self):
     """Remove every object currently stored in the bucket."""
     try:
         self.s3_delete_objects(self.list_bucket_keys())
         logger.info("`{}` bucket has been cleaned.".format(self.bucket_name))
     except Exception as exc:
         logger.error(
             "An Exception occurred while cleaning the bucket\n {}".format(str(exc)))
    def load_matlab_multi_matrix(self, local_filename):
        """Load a '.mat' file and return its dict representation.

        :local_filename: The path of the object, relative to self.src_dir.
        :returns: A dict containing numpy matrices against the keys of the
                  multi-matrix, or None when loading fails (logged).
        """
        try:
            # join stays inside the try so a bad filename is logged,
            # not raised, exactly as before.
            full_path = os.path.join(self.src_dir, local_filename)
            return loadmat(full_path)
        except Exception as exc:
            logger.error("Unable to load mat file \n{}".format(str(exc)))
Beispiel #16
0
 def list_bucket_objects(self, prefix=None):
     """Return the bucket's objects, optionally filtered by key *prefix*."""
     try:
         collection = self._s3.Bucket(self.bucket_name).objects
         if prefix:
             return collection.filter(Prefix=prefix)
         return collection.filter()
     except Exception as exc:
         logger.error(
             "An Exception occurred while listing objects in bucket\n {}".format(str(exc)))
Beispiel #17
0
 def get_version(self, api_data):
     """Return the latest published version from npm registry API data.

     Yields '' when the data is missing or has no dist-tags entry.
     """
     if not api_data:
         logger.error("API Data is not available.")
         return ''
     try:
         return api_data['dist-tags']['latest']
     except Exception:
         logger.info("Unable to fetch latest version from API data.")
         return ''
Beispiel #18
0
 def read_json_file(self, data_in_bytes):  # pragma: no cover
     """Parse newline-delimited JSON bytes into a DataFrame.

     Falls back to self.df_ when the payload cannot be read (logged).
     """
     try:
         lines = io.StringIO(data_in_bytes.decode('utf-8')).readlines()
         records = [json.loads(line) for line in lines]
         return pd.DataFrame(records)
     except Exception:
         logger.error("Unable to read json file.")
         return self.df_
def get_file_content(url, session=None):
    """Fetch *url* via the session when it looks like a URL.

    Non-URL input is returned unchanged (treated as literal content);
    fetch failures are logged and yield ''.
    """
    if session:
        session.timeout = 10
    as_text = url if isinstance(url, str) else url.decode()
    if not pip_download._scheme_re.search(as_text):
        return url
    try:
        resp = session.get(url)
        resp.raise_for_status()
        return resp.content.decode()
    except Exception as _exc:
        logger.error('IGNORE: {}'.format(str(_exc)))
        return ''
Beispiel #20
0
 def read_yaml_file(self, filename):
     """Read a YAML object from the S3 bucket as plain dicts/lists."""
     try:
         parsed = YAML().load(self.read_generic_file(filename))
         # Round-trip through json to turn YAML node types into
         # plain python containers.
         return json.loads(json.dumps(parsed))
     except ValueError:
         logger.error("Not a valid yaml file provided.")
     except Exception as exc:
         logger.error(
             "An Exception occurred while retrieving a yaml file \n{}".format(str(exc)))
Beispiel #21
0
 def read_json_file(self, filename):
     """Read and decode a JSON object from the S3 bucket."""
     try:
         raw = self.read_generic_file(filename)
         if isinstance(raw, (bytearray, bytes)):
             # python <= 3.5 requires string to load
             raw = raw.decode('utf-8')
         return json.loads(raw)
     except ValueError:
         logger.error("Not a valid json file provided.")
     except Exception as exc:
         logger.error(
             "An Exception occurred while retrieving a json file \n{}".format(str(exc)))
Beispiel #22
0
 def construct_packages(self, content):
     """Extract dependency package names from package.json content.

     Returns a (possibly empty) list of dependency names.
     """
     if not content:
         return []
     content = content.decode() if not isinstance(content, str) else content
     try:
         decoded_json = demjson.decode(content)
     except Exception as _exc:
         logger.error("IGNORE {}".format(str(_exc)))
         decoded_json = self.handle_corrupt_packagejson(content)
     dependencies = {}
     if decoded_json and isinstance(decoded_json, dict):
         dependencies = decoded_json.get('dependencies', {})
     if isinstance(dependencies, dict):
         return list(dependencies.keys())
     return []
Beispiel #23
0
    def construct_job(self, input_dict):
        """Validate *input_dict* and prepare state for an EMR training job.

        Side effects: sets self.env, self.data_version,
        self.training_repo_url, self.hyper_params, self.bucket_name,
        self.properties, self.aws_emr and self.aws_emr_client.
        Raises ValueError on missing fields, an unreachable repo, or a
        failed EMR connection.
        """
        # Fields that must be present (and truthy) in the caller's input.
        required_fields = [
            'environment', 'data_version', 'bucket_name', 'github_repo'
        ]

        missing_fields = check_field_exists(input_dict, required_fields)

        if missing_fields:
            logger.error("Missing the parameters in input_dict",
                         extra={"missing_fields": missing_fields})
            raise ValueError(
                "Required fields are missing in the input {}".format(
                    missing_fields))

        self.env = input_dict.get('environment')
        self.data_version = input_dict.get('data_version')
        github_repo = input_dict.get('github_repo')
        # Fail fast if the training repo cannot be reached.
        if not check_url_alive(github_repo):
            logger.error(
                "Unable to find the github_repo {}".format(github_repo))
            raise ValueError(
                "Unable to find the github_repo {}".format(github_repo))
        self.training_repo_url = github_repo
        self.hyper_params = input_dict.get('hyper_params', '{}')
        # Environment variables take precedence over values in input_dict.
        aws_access_key = os.getenv("AWS_S3_ACCESS_KEY_ID") \
            or input_dict.get('aws_access_key')
        aws_secret_key = os.getenv("AWS_S3_SECRET_ACCESS_KEY")\
            or input_dict.get('aws_secret_key')
        github_token = os.getenv("GITHUB_TOKEN",
                                 input_dict.get('github_token'))
        self.bucket_name = input_dict.get('bucket_name')
        # NOTE(review): self.hyper_params defaults to '{}' (truthy), so this
        # branch always executes and re-serializes input_dict['hyper_params'];
        # json.dumps(None) yields 'null' when the key is absent — confirm
        # that is the intended payload for the EMR job.
        if self.hyper_params:
            try:
                self.hyper_params = json.dumps(input_dict.get('hyper_params'),
                                               separators=(',', ':'))
            except Exception:
                logger.error(
                    "Invalid hyper params",
                    extra={"hyper_params": input_dict.get('hyper_params')})

        # Values exported into the training job's environment.
        self.properties = {
            'AWS_S3_ACCESS_KEY_ID': aws_access_key,
            'AWS_S3_SECRET_ACCESS_KEY': aws_secret_key,
            'AWS_S3_BUCKET_NAME': self.bucket_name,
            'MODEL_VERSION': self.data_version,
            'DEPLOYMENT_PREFIX': self.env,
            'GITHUB_TOKEN': github_token
        }

        self.aws_emr = AmazonEmr(aws_access_key_id=aws_access_key,
                                 aws_secret_access_key=aws_secret_key)

        self.aws_emr_client = self.aws_emr.connect()

        if not self.aws_emr.is_connected():
            logger.error("Unable to connect to emr instance.")
            raise ValueError

        logger.info("Successfully connected to emr instance.")
Beispiel #24
0
 def s3_delete_objects(self, object_keys):
     """Delete each key in the list *object_keys* from the bucket.

     Non-list input is rejected (logged, not raised to the caller).
     """
     try:
         if not isinstance(object_keys, list):
             raise ValueError("Expected {}, got {}".format(
                 type(list()), type(object_keys)))
         delete_spec = {"Objects": [{'Key': key} for key in object_keys]}
         return self._s3.Bucket(self.bucket_name).delete_objects(
             Delete=delete_spec)
     except Exception as exc:
         logger.error(
             "An Exception occurred while deleting objects \n {}".format(str(exc)))
Beispiel #25
0
    def load_matlab_multi_matrix(self, s3_path):
        """Download a '.mat' object from S3 and return its dict form.

        :s3_path: The path of the object in the S3 bucket.
        :returns: A dict containing numpy matrices against the keys of the
                  multi-matrix (empty result is logged as an error).
        """
        target = os.path.join('/tmp', s3_path.split('/')[-1])
        self._s3.Bucket(self.bucket_name).download_file(s3_path, target)
        model_dict = loadmat(target)
        if not model_dict:
            logger.error("Unable to load the model for scoring")
        return model_dict
Beispiel #26
0
 def from_npm_registry(self, package):
     """Fetch name/description/keywords/dependencies from the NPM registry."""
     data_dict = self.dict_
     api_url = "https://registry.npmjs.org/" + str(package)
     # Extraction stays inside the try so any partial failure (bad JSON,
     # broken dependency data) falls back to whatever was filled so far.
     try:
         json_data = json.loads(requests.get(api_url).text)
         data_dict['name'] = json_data.get('name', '')
         data_dict['description'] = json_data.get('description', '')
         data_dict['keywords'] = json_data.get('keywords', [])
         data_dict['dependencies'] = self.get_dependencies(json_data)
     except Exception:
         logger.error("Can't fetch the keywords from NPM Registry")
     return data_dict
Beispiel #27
0
 def make_kwd_dependencies_df(self, data_df, unique_packages):
     """Build the (keywords, dependencies) dataframe pair for the packages.

     Either frame falls back to self.df_ when its column cannot be
     selected (logged).
     """
     selections = []
     for column, message in (('keywords', "Keyword is not present."),
                             ('dependencies', "Dependencies are not present. ")):
         frame = self.df_
         try:
             frame = data_df.loc[data_df['name'].isin(unique_packages),
                                 ['name', column]]
         except Exception:
             logger.error(message)
         selections.append(frame)
     return selections[0], selections[1]
Beispiel #28
0
 def save_on_s3(self, folder_path):
     """Upload the contents of *folder_path* to S3 under the version prefix."""
     try:
         if not os.path.exists(folder_path):
             logger.error("Folder path doesn't exist.")
             return
         prefix = self.version_name
         if 'intermediate-model' in folder_path:
             prefix = self.version_name + '/intermediate-model'
         self.s3_client.s3_upload_folder(folder_path=folder_path,
                                         prefix=prefix)
         logger.info("Folders are successfully saved on S3.")
     except Exception as e:
         raise e
def get_training_file_url(user, repo, branch='master', training_file_path='training/train.py'):
    """Build and validate the raw-content URL of the training file.

    :param user: github user/org name (required).
    :param repo: github repository name (required).
    :param branch: branch to fetch from, default ``master``.
    :param training_file_path: path of the training file inside the repo.
    :returns: the validated raw-content URL.
    :raises ValueError: when user or repo is missing, or the URL is
        unreachable.
    """
    # BUG FIX: the original condition used `and`, so a call supplying only
    # ONE of user/repo slipped past validation and built a broken URL.
    # Both values are required, as the error message itself states.
    if not user or not repo:
        logger.error("Please provide the github user and repo",
                     extra={"user": user, "repo": repo})
        raise ValueError("Please provide the github user:{} and repo:{}"
                         .format(user, repo))

    file_url = urljoin(GITHUB_CONTENT_BASEURL,
                       '/'.join((user, repo, branch,
                                 training_file_path)))

    if not check_url_alive(file_url):
        logger.error("unable to reach the github training file path",
                     extra={'github_url': file_url})
        raise ValueError("Could not able to fetch training file")
    return file_url
def parse_requirements(content, session=None, *args, **kwargs):
    """Customize pip's parse_requirements.

    Yields the normalized name of every requirement parsed from *content*
    (a path/URL or raw requirements text); unparseable lines are logged
    and skipped.

    :param content: requirements source handed to ``get_file_content``.
    :param session: optional PipSession.  BUG FIX: the original default
        was ``PipSession()`` evaluated once at import time, silently
        sharing one mutable session object across every default call;
        a fresh session is now created per call.
    """
    if session is None:
        session = PipSession()
    _content = get_file_content(content, session=session)

    lines_enum = pip_req.preprocess(_content, None)

    for line_number, line in lines_enum:
        try:
            req_iter = pip_req.process_line(line, 'requirements.txt', line_number, None,
                                            None, None, session, None,
                                            use_pep517=None, constraint=None)
            for req in req_iter:
                if req.name:
                    yield normalize_name(req.name)
        except Exception as _exc:
            logger.error('IGNORE: {} T(EXC):{} T(con):{}'
                         .format(str(_exc), type(_exc), type(content)))
            logger.error('IGNORE CONTENT: {}'.format(content))