Example #1
    def construct_job(self, input_dict):
        """Submit emr job."""
        required_fields = [
            'environment', 'data_version', 'bucket_name', 'github_repo'
        ]

        missing_fields = check_field_exists(input_dict, required_fields)

        if missing_fields:
            logger.error("Missing the parameters in input_dict",
                         extra={"missing_fields": missing_fields})
            raise ValueError(
                "Required fields are missing in the input {}".format(
                    missing_fields))

        self.env = input_dict.get('environment')
        self.data_version = input_dict.get('data_version')
        github_repo = input_dict.get('github_repo')
        if not check_url_alive(github_repo):
            logger.error(
                "Unable to find the github_repo {}".format(github_repo))
            raise ValueError(
                "Unable to find the github_repo {}".format(github_repo))
        self.training_repo_url = github_repo
        self.hyper_params = input_dict.get('hyper_params', '{}')
        aws_access_key = os.getenv("AWS_S3_ACCESS_KEY_ID") \
            or input_dict.get('aws_access_key')
        aws_secret_key = os.getenv("AWS_S3_SECRET_ACCESS_KEY")\
            or input_dict.get('aws_secret_key')
        github_token = os.getenv("GITHUB_TOKEN",
                                 input_dict.get('github_token'))
        self.bucket_name = input_dict.get('bucket_name')
        hyper_params = input_dict.get('hyper_params')
        if hyper_params:
            try:
                # serialize compactly; json.dumps raises TypeError for
                # non-serializable values
                self.hyper_params = json.dumps(hyper_params,
                                               separators=(',', ':'))
            except (TypeError, ValueError):
                logger.error(
                    "Invalid hyper params",
                    extra={"hyper_params": hyper_params})

        self.properties = {
            'AWS_S3_ACCESS_KEY_ID': aws_access_key,
            'AWS_S3_SECRET_ACCESS_KEY': aws_secret_key,
            'AWS_S3_BUCKET_NAME': self.bucket_name,
            'MODEL_VERSION': self.data_version,
            'DEPLOYMENT_PREFIX': self.env,
            'GITHUB_TOKEN': github_token
        }

        self.aws_emr = AmazonEmr(aws_access_key_id=aws_access_key,
                                 aws_secret_access_key=aws_secret_key)

        self.aws_emr_client = self.aws_emr.connect()

        if not self.aws_emr.is_connected():
            logger.error("Unable to connect to emr instance.")
            raise ValueError("Unable to connect to emr instance.")

        logger.info("Successfully connected to emr instance.")
Example #2
 def connect(self):
     """Connect to the S3 database."""
     try:
         session = boto3.session.Session(
             aws_access_key_id=self._aws_access_key_id,
             aws_secret_access_key=self._aws_secret_access_key,
             region_name=self.region_name)
         # signature version is needed to connect to new regions which support only v4
         if self._local_dev:
             self._s3 = session.resource(
                 's3',
                 config=botocore.client.Config(signature_version='s3v4'),
                 use_ssl=self._use_ssl,
                 endpoint_url=self._endpoint_url)
         else:
             self._s3 = session.resource(
                 's3',
                 config=botocore.client.Config(signature_version='s3v4'),
                 use_ssl=self._use_ssl)
         logger.info("Conneting to the s3")
         return self._s3
     except Exception as exc:
         logger.error(
             "An exception occurred while establishing an AmazonS3 connection: {}"
             .format(str(exc)))
Example #3
    def run_job(self, input_dict):
        """Run the emr job."""
        self.construct_job(input_dict)
        name = '{}_{}_training_{}'.format(self.env, self.ecosystem,
                                          self.current_time)

        bootstrap_uri = 's3://{bucket}/bootstrap.sh'.format(
            bucket=self.bucket_name)

        log_file_name = '{}.log'.format(name)

        log_uri = 's3://{bucket}/{log_file}'.format(
            bucket='{}-automated-analytics-spark-jobs'.format(self.env),
            log_file=log_file_name)

        emr_config_obj = EMRConfig(name=name,
                                   s3_bootstrap_uri=bootstrap_uri,
                                   training_repo_url=self.training_repo_url,
                                   log_uri=log_uri,
                                   ecosystem=self.ecosystem,
                                   properties=self.properties,
                                   hyper_params=self.hyper_params)

        configs = emr_config_obj.get_config()
        status = self.aws_emr.run_flow(configs)
        logger.info("EMR job is running {}".format(status))
        status_code = status.get('ResponseMetadata', {}).get('HTTPStatusCode')
        if status_code != 200:
            logger.error(
                "EMR Job Failed with the status code {}".format(status_code),
                extra={"status": status})
        return status
Example #4
 def from_github(self, package, url_df, api_url, api_token):
     """Find the keywords from the Github Graph QL."""
     url_ = self.utility.get_url(url_df, package)
     keywords = list()
     if isinstance(url_, str):
         query_params = self.utility.get_query_params(url_)
         logger.info("Query Parameters are: {}, {}".format(
             query_params[0], query_params[1]))
         # 'payload' avoids shadowing the json module used elsewhere.
         payload = {
             'query':
             '{{organization(login: "{0}"){{name url repository(name: "{1}")\
             {{name url description repositoryTopics(first: 10){{nodes{{topic {{name}}}}}}}}}}}}'
             .format(str(query_params[0]), str(query_params[1]))
         }
         headers = {'Authorization': 'token %s' % api_token}
         try:
             response = requests.post(url=api_url,
                                      json=payload,
                                      headers=headers)
             keywords = list(self.clean_response(response.json()))
             return keywords
         except Exception:
             logger.error(
                 "Either the GitHub token is missing or no response was received."
             )
             return keywords
     else:
         return keywords
Example #5
 def split_training_testing_data(self):
     """Split data into training and testing."""
     data_in_bytes = self.load_user_item_data()
     data = data_in_bytes.decode("utf-8")
     data_list = data.split('\n')
     pairs_train = []
     pairs_test = []
     user_id = 0
     np.random.seed(int(time.time()))
     logger.info("Splitting data into training and testing.")
     for line in data_list:
         arr = line.strip().split()
         if not arr:
             # skip blank lines so they do not inflate the user count
             continue
         arr = np.asarray([int(x) for x in arr[1:]])
         n = len(arr)
         idx = np.random.permutation(n)
         for i in range(min(self.num_train_per_user, n)):
             pairs_train.append((user_id, arr[idx[i]]))
         if n > self.num_train_per_user:
             for i in range(self.num_train_per_user, n):
                 pairs_test.append((user_id, arr[idx[i]]))
         user_id += 1
     num_users = user_id
     pairs_train = np.asarray(pairs_train)
     pairs_test = np.asarray(pairs_test)
     num_items = np.maximum(np.max(pairs_train[:, 1]), np.max(pairs_test[:, 1])) + 1
     logger.info("Number of users and items are respectively {},"
                 " {}".format(num_users, num_items))
     return [pairs_train, pairs_test, num_users, num_items]
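
The splitter assumes each line of the dump is "<count> <item_id> ...", one user
per line. A self-contained sketch of that split on made-up data, using the same
permutation logic as above:

    import numpy as np

    # Two fabricated users in the assumed "<count> <ids...>" format.
    data_list = ["3 11 42 7", "2 5 9"]
    num_train_per_user = 2
    pairs_train, pairs_test = [], []
    for user_id, line in enumerate(data_list):
        arr = np.asarray([int(x) for x in line.split()[1:]])
        idx = np.random.permutation(len(arr))
        pairs_train += [(user_id, arr[i]) for i in idx[:num_train_per_user]]
        pairs_test += [(user_id, arr[i]) for i in idx[num_train_per_user:]]
    print(pairs_train, pairs_test)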
Example #6
    def update_s3_bucket(self, data, bucket_name, filename='collated.json'):
        """Update the S3 bucket."""
        if self.s3_client is None:
            # create the S3 client if it does not exist.
            self.s3_client = AmazonS3(
                bucket_name=bucket_name,
                aws_access_key_id=os.getenv('AWS_S3_ACCESS_KEY_ID'),
                aws_secret_access_key=os.getenv('AWS_S3_SECRET_ACCESS_KEY'))
        # connect after creating or with existing s3 client
        self.s3_client.connect()
        if not self.s3_client.is_connected():
            raise ValueError("Unable to connect to s3.")

        json_data = dict()

        if self.s3_client.object_exists(filename):
            logger.info("{} exists, updating it.".format(filename))
            json_data = self.s3_client.read_json_file(filename)
            if not json_data:
                raise ValueError(
                    "Unable to get the json data path:{}/{}".format(
                        bucket_name, filename))

        json_data.update(data)
        self.s3_client.write_json_file(filename, json_data)
        logger.info("Updated file Succefully!")
Example #7
 def load_s3(self):
     """Establish the connection with S3."""
     self.s3_object.connect()
     if self.s3_object.is_connected():
         logger.info("S3 connection established.")
         return self.s3_object
     else:
         raise Exception("Unable to establish S3 connection.")
Example #8
 def check_path(self, path):
     """Check the given datastore path."""
     logger.info("Given path is: {}".format(path))
     # create the directory if it does not already exist
     os.makedirs(path, exist_ok=True)
     return path
Example #9
    def save_numpy_matrix_temporary(self, content, filename, datastore):
        """Store numpy matrix in temporary storage."""
        path = self.check_path(datastore)
        np.savez(os.path.join(path, filename), matrix=content)
        logger.info("Numpy matrix has been stored successfully.")
Example #10
 def save_json_file_temporary(self, content, filename, datastore):
     """Store JSON file in temporary storage."""
     path = self.check_path(datastore)
     with open(os.path.join(path, filename), 'w') as f:
         json.dump(content, f)
     logger.info("JSON file has been stored successfully.")
Example #11
def get_github_repo_info(repo_url):
    """Get the github repository information."""
    logger.info("Extracting user and repo from the repository URL",
                extra={'github_url': repo_url})
    if repo_url.endswith('.git'):
        repo_url = repo_url[:-len('.git')]
    user, repo = repo_url.split('/')[-2:]
    user = user.split(':')[-1]
    return user, repo
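
A quick sanity check of the parsing above; both HTTPS and SSH-style remotes
reduce to the same (user, repo) pair (the URLs are made up):

    assert get_github_repo_info('https://github.com/org/repo.git') == ('org', 'repo')
    assert get_github_repo_info('git@github.com:org/repo') == ('org', 'repo')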
Example #12
 def from_existing_df(self, df_, package):
     """Find the keywords from existing dump."""
     if not df_.empty:
         data_lst = df_.loc[df_['name'] == str(package),
                            ['name', 'description', 'keywords', 'dependencies']]
         if not data_lst.empty:
             return data_lst.iloc[0]
     else:
         logger.info("Node package details dataframe does not exist.")
         return self.df_
Example #13
 def s3_clean_bucket(self):
     """Clean the bucket."""
     try:
         all_keys = self.list_bucket_keys()
         self.s3_delete_objects(all_keys)
         logger.info("`{}` bucket has been cleaned.".format(
             self.bucket_name))
     except Exception as exc:
         logger.error(
             "An Exception occurred while cleaning the bucket\n {}".format(
                 str(exc)))
Example #14
def check_url_alive(url, accept_codes=(401,)):
    """Validate that the github repo exists."""
    try:
        logger.info("checking url is alive", extra={"url": url})
        response = request.urlopen(url)
        if response.getcode() // 100 in (2, 3):
            return True
    except request.HTTPError as exc:
        # urlopen raises HTTPError for 4xx/5xx responses, so accepted
        # codes such as 401 (auth required) must be handled here.
        if exc.code in accept_codes:
            return True
        logger.debug("Unable to reach url", extra={"exception": str(exc)})
    except Exception as exc:
        logger.debug("Unable to reach url", extra={"exception": str(exc)})
    return False
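
A usage sketch: 2xx/3xx responses count as alive, and so does any status listed
in accept_codes, e.g. a 401 from an endpoint that demands authentication (the
URLs here are placeholders):

    if check_url_alive('https://github.com/org/some-repo'):
        print("repo reachable")
    # pass an empty tuple to treat auth-required responses as dead
    check_url_alive('https://api.github.com/user', accept_codes=())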
Example #15
    def save_manifest_file_temporary(self, content, filename, datastore):
        """Store manifest file in temporary storage."""
        path = self.check_path(datastore)
        with open(os.path.join(path, filename), 'w') as f:
            for lst in content:
                f.write("{} {}\n".format(lst[0], " ".join(str(x) for x in lst[1:])))
        logger.info("Manifest File has been stored successfully.")
Example #16
 def load_raw_data(self):
     """Load the raw data from S3 bucket."""
     NPM_raw_data_path = os.path.join(self.version_name,
                                      "data/manifest.json")
     try:
         raw_data_dict_ = self.s3_client.read_json_file(NPM_raw_data_path)
         logger.info("Size of Raw Manifest file is: {}".format(
             len(raw_data_dict_)))
         return raw_data_dict_
     except Exception:
         # re-raise so the original S3 error is not masked
         raise
Example #17
 def get_version(self, api_data):
     """Give the latest version for the package."""
     if api_data:
         try:
             latest_version = api_data['dist-tags']['latest']
             return latest_version
         except (KeyError, TypeError):
             logger.info("Unable to fetch latest version from API data.")
             return ''
     else:
         logger.error("API Data is not available.")
         return ''
Example #18
 def get_result(self, job_id=None, job_query_obj=None):
     """Get the result of the job."""
     if job_id is None:
         job_query_obj = job_query_obj or self.job_query_obj
         for row in job_query_obj.result():
             yield dict(row.items())
     else:
         job_obj = self.client.get_job(job_id)
         while job_obj.state == 'PENDING':
             job_obj = self.client.get_job(job_id)
             logger.info("Job State for Job Id:{} is {}".format(
                 job_id, job_obj.state))
             time.sleep(_POLLING_DELAY)
         yield from self.get_result(job_query_obj=job_obj)
Example #19
 def load_existing_data(self):
     """Load the node registry dump from S3 bucket."""
     NPM_clean_json_data_path = os.path.join(
         self.version_name, "data/node-package-details-with-url.json")
     try:
         logger.info("Path Existed")
         existing_data = self.s3_client.read_generic_file(
             NPM_clean_json_data_path)
         existing_df = self.utility.read_json_file(existing_data)
         logger.info("Size of Raw df with url is: {}".format(
             len(existing_df)))
         return existing_df
     except Exception:
         raise Exception("S3 connection error")
Example #20
 def load_existing_data(self):
     """Load the node registry dump from S3 bucket."""
     NPM_clean_json_data_path = os.path.join("training-utils",
                                             "node-package-details.json")
     if self.s3_client.object_exists(NPM_clean_json_data_path):
         try:
             logger.info("Reading dump data from training-utils folder.")
             existing_data = self.s3_client.read_json_file(NPM_clean_json_data_path)
             logger.info("Size of raw json: %d", len(existing_data))
             return existing_data
         except Exception:
             raise Exception("S3 connection error")
     else:
         raise ValueError("Given Path is not present.")
Example #21
 def get_dependencies(self, api_data):
     """Give the dependencies for the latest version of the package."""
     version = self.get_version(api_data)
     logger.info("Latest_version is: {}".format(version))
     versions_dict = api_data.get('versions', dict())
     latest_version_data_dict = versions_dict.get(version, dict())
     # 'dependencies' maps package names to version ranges, so the default
     # must be a dict (not a list) for .keys() to work; empty dict also
     # guarantees a list is always returned instead of falling through to None.
     latest_dependencies = latest_version_data_dict.get('dependencies', dict())
     return list(latest_dependencies.keys())
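
Both helpers walk the npm registry document shape; a fabricated response shows
the path they traverse (helper is an assumed instance of the enclosing class):

    api_data = {
        'dist-tags': {'latest': '1.2.3'},
        'versions': {'1.2.3': {'dependencies': {'lodash': '^4.17.15'}}},
    }
    assert helper.get_version(api_data) == '1.2.3'
    assert helper.get_dependencies(api_data) == ['lodash']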
Example #22
 def load_existing_data(self):
     """Load the node registry dump from S3 bucket."""
     NPM_clean_json_data_path = os.path.join("training-utils",
                                             "node-package-details-with-url.json")
     if self.s3_client.object_exists(NPM_clean_json_data_path):
         try:
             logger.info("Reading dump data from training-utils folder.")
             existing_data = self.s3_client.read_generic_file(NPM_clean_json_data_path)
             existing_df = self.utility.read_json_file(existing_data)
             logger.info("Size of Raw df with url is: {}".format(len(existing_df)))
             return existing_df
         except Exception:
             raise Exception("S3 connection error")
     else:
         raise ValueError("Given Path is not present.")
Example #23
 def handle_response(self):
     """Process and get the response of async requests."""
     results = list()
     for resp in self.responses:
         pkg_name, req_obj = resp
         if isinstance(req_obj, int):
             if req_obj == 200:
                 results.append(pkg_name)
         else:
             if req_obj.status_code == 200:
                 results.append(pkg_name)
             logger.info("Received status:{} for pkg:{}".format(
                 req_obj.status_code, pkg_name))
     return results
Example #24
 def save_on_s3(self, folder_path):
     """Store all the contents on S3."""
     if os.path.exists(folder_path):
         if 'intermediate-model' in folder_path:
             self.s3_client.s3_upload_folder(
                 folder_path=folder_path,
                 prefix=self.version_name + '/intermediate-model')
         else:
             self.s3_client.s3_upload_folder(folder_path=folder_path,
                                             prefix=self.version_name)
         logger.info("Folders are successfully saved on S3.")
     else:
         logger.error("Folder path doesn't exist.")
Example #25
 def make_user_data(self, manifest_list, unique_packages):
     """Return the user data, which is required for making test data."""
     manifest_user_data = list()
     logger.info("Length of manifest list is: {}".format(
         len(manifest_list)))
     logger.info("Length of Unique Packages are: {}".format(
         len(unique_packages)))
     if unique_packages:
         pkg_idx_map = self.create_package_map(unique_packages)
         for manifest in manifest_list:
             this_user_items = [pkg_idx_map[pkg] for pkg in manifest]
             this_user_items = [str(x) for x in this_user_items]
             length_ = len(this_user_items)
             user_items = [str(length_)] + this_user_items
             manifest_user_data.append(user_items)
     return manifest_user_data
Example #26
    def connect(self):
        """Connect to the emr instance."""
        try:
            session = boto3.session.Session(
                aws_access_key_id=self._aws_access_key_id,
                aws_secret_access_key=self._aws_secret_access_key,
                region_name=self.region_name)

            self._emr = session.client(
                'emr',
                config=botocore.client.Config(signature_version='s3v4'),
                use_ssl=self._use_ssl)
            logger.info("Connecting to the emr")
        except Exception as exc:
            logger.info(
                "An Exception occurred while establishing a AmazonEMR connection {}"
                .format(str(exc)))
Example #27
    def create_content_matrix(self, pkg_tag_map, all_packages,
                              vocabulary):  # pragma: no cover
        """Create Content Matrix."""
        tag_idx_map = self.create_vocabulary_map(vocabulary)
        content_matrix = np.zeros([len(all_packages), len(vocabulary)])
        if tag_idx_map:
            for idx, package in enumerate(all_packages):
                try:
                    package_tags = [
                        tag_idx_map[tag] for tag in pkg_tag_map[package]
                    ]
                    if idx == 0:
                        logger.info("Setting to 1: {}".format(package_tags))
                    content_matrix[idx, package_tags] = 1
                except KeyError:
                    continue

        return content_matrix
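
A small worked example of the matrix shape: with two packages and a three-tag
vocabulary, each row flags the tags attached to that package (the data is made
up, utils is an assumed instance, and this presumes create_vocabulary_map
enumerates the vocabulary in order):

    pkg_tag_map = {'pkg-a': ['web'], 'pkg-b': ['cli', 'json']}
    vocabulary = ['web', 'cli', 'json']
    matrix = utils.create_content_matrix(pkg_tag_map, ['pkg-a', 'pkg-b'], vocabulary)
    # expected:
    # [[1., 0., 0.],
    #  [0., 1., 1.]]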
Example #28
    def is_fetch_done(self, callback=lambda x: x):
        """Check whether all the requests are processed or not."""
        _flag = True
        # iterate over a copy because items are removed from the queue below
        for resp in list(self.process_queue):
            _flag = False
            others, url, req_obj = resp
            logger.info("other:{}, url:{}, req_obj:{}".format(others, url, req_obj))

            if url in self.cache:
                req_obj.cancel()
                self.process_queue.remove(resp)
                self.responses.append(self.cache[url])
            elif req_obj.done():
                req_obj.cancel()
                self.process_queue.remove(resp)
                # invoke the callback once and reuse the result
                result = (others, callback(req_obj))
                self.cache[url] = result
                self.responses.append(result)
        return _flag
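
is_fetch_done only reports whether the queue has drained, so callers poll it; a
hedged sketch of such a loop (fetcher and the callback are assumptions):

    import time

    # fetcher is an assumed instance holding process_queue/responses/cache.
    while not fetcher.is_fetch_done(callback=lambda fut: fut.result()):
        time.sleep(0.2)   # give in-flight futures time to settle
    packages = fetcher.handle_response()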
Example #29
 def preprocess_data(self):
     """Preprocesses the data and save into temporary storage."""
     package_tag_map, vocabulary, manifest_user_data, unique_packages = \
         self.preprocess_data_obj.update_pkg_tag_map()
     package_tag_map = {k: list(v) for k, v in package_tag_map.items()}
     self.obj_.save_manifest_file_temporary(manifest_user_data,
                                            'manifest_user_data.dat',
                                            TEMPORARY_DATA_PATH)
     package_id_map = self.utils.create_package_map(unique_packages)
     id_package_map = dict(
         zip(range(len(unique_packages)), list(unique_packages)))
     user_train_data, item_train_data, user_test_data, item_test_data = \
         self.obj_.train_test_data()
     content_matrix = self.utils.create_content_matrix(
         package_tag_map, unique_packages, vocabulary)
     self.obj_.save_json_file_temporary(package_id_map,
                                        'package_to_index_map.json',
                                        TEMPORARY_PATH)
     self.obj_.save_json_file_temporary(id_package_map,
                                        'index_to_package_map.json',
                                        TEMPORARY_PATH)
     self.obj_.save_json_file_temporary(package_tag_map,
                                        'package_tag_map.json',
                                        TEMPORARY_PATH)
     self.obj_.save_file_temporary(
         user_train_data,
         "packagedata-train-" + str(self.num_users) + "-users.dat",
         TEMPORARY_DATA_PATH)
     self.obj_.save_file_temporary(
         user_test_data,
         "packagedata-test-" + str(self.num_users) + "-users.dat",
         TEMPORARY_DATA_PATH)
     self.obj_.save_file_temporary(
         item_train_data,
         "packagedata-train-" + str(self.num_users) + "-items.dat",
         TEMPORARY_DATA_PATH)
     self.obj_.save_file_temporary(
         item_test_data,
         "packagedata-test-" + str(self.num_users) + "-items.dat",
         TEMPORARY_DATA_PATH)
     self.obj_.save_numpy_matrix_temporary(content_matrix,
                                           'content_matrix.npz',
                                           TEMPORARY_DATA_PATH)
     logger.info("All items are saved successfully in temporary location.")
Example #30
 def find_keywords(self, df_, list_):
     """Find the keywords for the given list of raw-data lists."""
     package_lst = self.utility.flatten_list(list_)
     out_lst = list()
     for i in package_lst:
         pkg_kwd_lst = self.utility.make_list_from_series(
             self.from_existing_df(df_, i))
         if not pkg_kwd_lst or not isinstance(pkg_kwd_lst[2], list):
             logger.info("Finding from the NPM repository.")
             pkg_kwd_dict = self.from_npm_registry(i)
             pkg_kwd_lst = list(pkg_kwd_dict.values())
             if len(pkg_kwd_lst[2]) == 0:
                 logger.info("Trying to fetch from Github")
                 api_url = 'https://api.github.com/graphql'
                 api_token = self.get_data.github_token
                 pkg_kwd_lst[2] = self.from_github(i, df_, api_url, api_token)
         out_lst.append(pkg_kwd_lst)
     return pd.DataFrame(out_lst, columns=['name', 'description', 'keywords', 'dependencies'])