class PreprocessData:
    """This class defines the preprocessing functions."""

    def __init__(self, data_obj, df_=pd.DataFrame()):
        """Create an instance for PreprocessData."""
        self.get_data_obj = data_obj
        self.utility_obj = Utility()
        self.get_keywords_obj = GetKeywords(data_obj)
        self.df_ = df_
        self.existing_data = self.get_data_obj.load_existing_data()
        self.pkg_kwd_df = self.fetch_package_keywords()

    def add_dependencies_resolved_column(self, df_, dependencies_list):
        """Return a binary value for the dependencies-resolved column."""
        dependencies = [dep.lower() for dep in dependencies_list]
        pkg_with_tags = df_.loc[df_['name'].isin(dependencies)]
        if len(pkg_with_tags) == 0:
            return 0
        # 1 only when every dependency is present in the keyword dataframe.
        elif len(set(dependencies) - set(pkg_with_tags['name'])) == 0:
            return 1
        else:
            return 0

    def check_resolved_dependencies(self, df_):
        """Add an 'all_deps_resolved' column and assign the binary value."""
        df_['all_deps_resolved'] = [
            self.add_dependencies_resolved_column(self.pkg_kwd_df, i)
            for i in df_['dependencies']
        ]
        df_ = df_.loc[df_['all_deps_resolved'] == 0]
        return df_

    def fetch_package_keywords(self):
        """Fetch the keywords for the raw data's package list."""
        raw_data = self.get_data_obj.load_raw_data()
        manifest_data = raw_data.get('package_dict', {})
        all_manifest = manifest_data.get('user_input_stack', []) + \
            manifest_data.get('bigquery_data', [])
        try:
            package_keyword_df = self.get_keywords_obj.find_keywords(
                self.existing_data, all_manifest)
            return package_keyword_df
        except Exception:
            raise ValueError("Unable to fetch keywords.")

    def make_necessary_df(self, limit_manifest, limit_keywords):
        """Create two dataframes, for dependencies and keywords respectively."""
        filtered_pkg_kwd_df = self.df_
        manifest_df = self.df_
        if 'dependencies' in self.pkg_kwd_df.columns:
            manifest_df = self.utility_obj.make_manifest_df(
                self.pkg_kwd_df, limit_manifest)
        else:
            raise KeyError("Dependencies are not present.")
        if 'keywords' in self.pkg_kwd_df.columns:
            filtered_pkg_kwd_df = self.utility_obj.make_filtered_pkg_kwd_df(
                self.pkg_kwd_df, limit_keywords)
        else:
            raise KeyError("Keywords are not present.")
        return [manifest_df, filtered_pkg_kwd_df]

    def extract_unique_packages(self):
        """Return all unique packages from the filtered package-keyword dataframe."""
        filtered_pkg_kwd_df = self.make_necessary_df(5, 0)[1]
        data_with_dep_check = self.check_resolved_dependencies(filtered_pkg_kwd_df)
        unique_packages, manifest_user_data = \
            self.utility_obj.extract_package_manifest_lst(data_with_dep_check)
        manifest_user_data = self.utility_obj.make_user_data(
            manifest_user_data, unique_packages)
        return unique_packages, manifest_user_data

    def create_df_and_dictionaries(self):
        """Create all the necessary dataframes and dictionaries."""
        self.unique_packages, self.manifest_user_data = self.extract_unique_packages()
        self.keyword_df, self.dependencies_df = \
            self.utility_obj.make_kwd_dependencies_df(
                self.pkg_kwd_df, self.unique_packages)
        self.package_tag_map, self.vocabulary = \
            self.utility_obj.create_pkg_tag_map(self.keyword_df)
        self.package_dep_map, self.first_level_deps = \
            self.utility_obj.create_pkg_dep_map(self.dependencies_df)

    def create_extended_pkg_tag_map(self):
        """Create the package-tag map for all first-level dependencies."""
        self.create_df_and_dictionaries()
        self.extended_ptm = dict()
        keywords_df_deps = self.pkg_kwd_df.loc[
            self.pkg_kwd_df['name'].isin(self.first_level_deps),
            ['name', 'keywords']]
        for k, g in keywords_df_deps.groupby("name"):
            try:
                # Merge a dependency's own keywords with its mapped tags.
                self.extended_ptm[k] = self.utility_obj.clean_set(
                    self.package_dep_map.get(k, set()).union(
                        set(g["keywords"].tolist()[0])))
            except Exception:
                pass
        return self.extended_ptm, self.manifest_user_data, self.unique_packages

    def update_pkg_tag_map(self):
        """Update the existing package-tag map."""
        extended_ptm, manifest_user_data, unique_packages = \
            self.create_extended_pkg_tag_map()
        for package_name in self.package_tag_map.keys():
            more_keywords = set()
            for dependency in self.package_dep_map[package_name]:
                more_keywords = more_keywords.union(
                    set(extended_ptm.get(dependency, [])))
            self.package_tag_map[package_name] = \
                self.package_tag_map.get(package_name).union(more_keywords)
            self.vocabulary = self.vocabulary.union(more_keywords)
        return self.package_tag_map, self.vocabulary, manifest_user_data, unique_packages
class GetData:
    """This class defines the S3 connections, viz. fetching and storing data."""

    def __init__(self, aws_access_key_id, aws_secret_access_key,
                 num_train_per_user, aws_bucket_name, model_version):
        """Create an instance of GetData."""
        # Environment variables take precedence over the constructor arguments.
        self.aws_access_key_id = os.environ.get('AWS_S3_ACCESS_KEY_ID',
                                                aws_access_key_id)
        self.aws_secret_access_key = os.environ.get('AWS_S3_SECRET_ACCESS_KEY',
                                                    aws_secret_access_key)
        self.github_token = os.environ.get('GITHUB_TOKEN', '')
        self.bucket_name = aws_bucket_name
        self.version_name = model_version
        self.s3_object = AmazonS3(bucket_name=self.bucket_name,
                                  aws_access_key_id=self.aws_access_key_id,
                                  aws_secret_access_key=self.aws_secret_access_key)
        self.num_train_per_user = num_train_per_user
        self.s3_client = self.load_s3()
        self.utility = Utility()

    def load_s3(self):
        """Establish the connection with S3."""
        self.s3_object.connect()
        if self.s3_object.is_connected():
            logger.info("S3 connection established.")
            return self.s3_object
        raise Exception("Unable to establish the S3 connection.")

    def load_raw_data(self):
        """Load the raw data from the S3 bucket."""
        NPM_raw_data_path = os.path.join(self.version_name, "data/manifest.json")
        logger.info("Reading raw data from {}".format(self.version_name))
        if self.s3_client.object_exists(NPM_raw_data_path):
            try:
                raw_data_dict_ = self.s3_client.read_json_file(NPM_raw_data_path)
                logger.info("Size of raw manifest file is: {}".format(
                    len(raw_data_dict_)))
                return raw_data_dict_
            except Exception:
                raise Exception("Unable to read the raw manifest file.")
        raise ValueError("Given path is not present.")

    def load_existing_data(self):
        """Load the node registry dump from the S3 bucket."""
        NPM_clean_json_data_path = os.path.join(
            "training-utils", "node-package-details-with-url.json")
        if self.s3_client.object_exists(NPM_clean_json_data_path):
            try:
                logger.info("Reading dump data from the training-utils folder.")
                existing_data = self.s3_client.read_generic_file(
                    NPM_clean_json_data_path)
                existing_df = self.utility.read_json_file(existing_data)
                logger.info("Size of raw dataframe with URL is: {}".format(
                    len(existing_df)))
                return existing_df
            except Exception:
                raise Exception("S3 connection error.")
        raise ValueError("Given path is not present.")

    def load_user_item_data(self):
        """Load the user-item manifest file from temporary storage."""
        NPM_manifest_user_data_path = os.path.join(TEMPORARY_PATH,
                                                   "manifest_user_data.dat")
        try:
            with open(NPM_manifest_user_data_path, 'rb') as f:
                return f.read()
        except Exception:
            raise Exception("Could not read the manifest file.")

    def create_package_train_user_data(self):
        """Create package train user data."""
        self.package_train_user_data = list()
        for user_id in range(self.num_users):
            this_user_items = self.pairs_train[self.pairs_train[:, 0] == user_id, 1]
            items_str = " ".join(str(x) for x in this_user_items)
            self.package_train_user_data.append([len(this_user_items), items_str])
        return self.package_train_user_data

    def create_package_train_item_data(self):
        """Create package train item data."""
        self.package_train_item_data = list()
        for item_id in range(self.num_items):
            this_item_users = self.pairs_train[self.pairs_train[:, 1] == item_id, 0]
            users_str = " ".join(str(x) for x in this_item_users)
            self.package_train_item_data.append([len(this_item_users), users_str])
        return self.package_train_item_data

    def create_package_test_user_data(self):
        """Create package test user data."""
        self.package_test_user_data = list()
        for user_id in range(self.num_users):
            this_user_items = self.pairs_test[self.pairs_test[:, 0] == user_id, 1]
            items_str = " ".join(str(x) for x in this_user_items)
            self.package_test_user_data.append([len(this_user_items), items_str])
        return self.package_test_user_data

    def create_package_test_item_data(self):
        """Create package test item data."""
        self.package_test_item_data = list()
        for item_id in range(self.num_items):
            this_item_users = self.pairs_test[self.pairs_test[:, 1] == item_id, 0]
            users_str = " ".join(str(x) for x in this_item_users)
            self.package_test_item_data.append([len(this_item_users), users_str])
        return self.package_test_item_data

    def train_test_data(self):
        """Create the training and testing data for PMF."""
        data_list = self.split_training_testing_data()
        self.pairs_train = data_list[0]
        self.pairs_test = data_list[1]
        self.num_users = data_list[2]
        self.num_items = data_list[3]
        packagedata_train_users = self.create_package_train_user_data()
        packagedata_train_items = self.create_package_train_item_data()
        packagedata_test_users = self.create_package_test_user_data()
        packagedata_test_items = self.create_package_test_item_data()
        return packagedata_train_users, packagedata_train_items, \
            packagedata_test_users, packagedata_test_items

    def split_training_testing_data(self):
        """Split the data into training and testing sets."""
        data_in_bytes = self.load_user_item_data()
        data = data_in_bytes.decode("utf-8")
        data_list = data.split('\n')
        pairs_train = []
        pairs_test = []
        user_id = 0
        np.random.seed(int(time.time()))
        logger.info("Splitting data into training and testing.")
        for line in data_list:
            arr = line.strip().split()
            # The first token is the item count; the rest are item ids.
            arr = np.asarray([int(x) for x in arr[1:]])
            n = len(arr)
            idx = np.random.permutation(n)
            for i in range(min(self.num_train_per_user, n)):
                pairs_train.append((user_id, arr[idx[i]]))
            if n > self.num_train_per_user:
                for i in range(self.num_train_per_user, n):
                    pairs_test.append((user_id, arr[idx[i]]))
            user_id += 1
        num_users = user_id
        pairs_train = np.asarray(pairs_train)
        pairs_test = np.asarray(pairs_test)
        num_items = np.maximum(np.max(pairs_train[:, 1]),
                               np.max(pairs_test[:, 1])) + 1
        logger.info("Number of users and items are respectively {},"
                    " {}".format(num_users, num_items))
        return [pairs_train, pairs_test, num_users, num_items]

    def check_path(self, path):
        """Check the given datastore path, creating it if necessary."""
        logger.info("Given path is: {}".format(path))
        if not os.path.exists(path):
            os.makedirs(path)
        return path

    def save_file_temporary(self, content, filename, datastore):
        """Store a data file in temporary storage."""
        path = self.check_path(datastore)
        try:
            with open(os.path.join(path, filename), 'w') as f:
                for lst in content:
                    ele_str = " ".join(str(x) for x in lst[1:])
                    f.write("{} {}\n".format(lst[0], ele_str))
            logger.info("File has been stored successfully.")
        except Exception as e:
            raise e

    def save_manifest_file_temporary(self, content, filename, datastore):
        """Store the manifest file in temporary storage."""
        path = self.check_path(datastore)
        try:
            with open(os.path.join(path, filename), 'w') as f:
                for lst in content:
                    f.write("{} {}\n".format(
                        lst[0], " ".join(str(x) for x in lst[1:])))
            logger.info("Manifest file has been stored successfully.")
        except Exception as e:
            raise e

    def save_numpy_matrix_temporary(self, content, filename, datastore):
        """Store a numpy matrix in temporary storage."""
        path = self.check_path(datastore)
        try:
            np.savez(os.path.join(path, filename), matrix=content)
            logger.info("Numpy matrix has been stored successfully.")
        except Exception as e:
            raise e

    def save_json_file_temporary(self, content, filename, datastore):
        """Store a JSON file in temporary storage."""
        path = self.check_path(datastore)
        try:
            with open(os.path.join(path, filename), 'w') as f:
                json.dump(content, f)
            logger.info("JSON file has been stored successfully.")
        except Exception as e:
            raise e

    def save_on_s3(self, folder_path):
        """Store all the contents on S3."""
        try:
            if os.path.exists(folder_path):
                if 'intermediate-model' in folder_path:
                    self.s3_client.s3_upload_folder(
                        folder_path=folder_path,
                        prefix=self.version_name + '/intermediate-model')
                else:
                    self.s3_client.s3_upload_folder(folder_path=folder_path,
                                                    prefix=self.version_name)
                logger.info("Folders are successfully saved on S3.")
            else:
                logger.error("Folder path doesn't exist.")
        except Exception as e:
            raise e
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details. You should have received a
copy of the GNU General Public License along with this program. If
not, see <http://www.gnu.org/licenses/>.
"""
import json
import unittest
from training.datastore.utils import Utility
import pandas as pd

utils_obj = Utility()

with open('tests/test_data/npm/dev/2019-01-03/data/'
          'test-node-package-details-with-url.json') as f:
    test_data_df = pd.DataFrame(json.load(f))


class TestUtility(unittest.TestCase):
    """This class tests the Utility class."""

    def test_flatten_list(self):
        """Test the flatten_list function."""
        test_list_ = [[1, 2], [3, 4]]
        test_flatten_list_output = utils_obj.flatten_list(test_list_)
        assert type(test_flatten_list_output[0]) != list

    def test_make_list_from_series(self):
        """Test the make_list_from_series function."""
        # Minimal check, assuming make_list_from_series converts a dataframe
        # row (a pandas Series) into a plain Python list.
        test_series_ = test_data_df.iloc[0]
        test_output = utils_obj.make_list_from_series(test_series_)
        assert isinstance(test_output, list)
class GetKeywords:
    """This class finds keywords for a package from the dump, the NPM registry, or GitHub."""

    def __init__(self, df_=pd.DataFrame(), dict_=dict(), local_data_store=False):
        """Create an instance for GetKeywords."""
        self.df_ = df_
        self.dict_ = dict_
        self.get_data = GetData(local_data_store=local_data_store)
        self.utility = Utility()

    def from_existing_df(self, df_, package):
        """Find the keywords from the existing dump."""
        if not df_.empty:
            data_lst = df_.loc[
                df_['name'] == str(package),
                ['name', 'description', 'keywords', 'dependencies']]
            # Guard against packages that are absent from the dump.
            if not data_lst.empty:
                return data_lst.iloc[0]
            return self.df_
        logger.error("Node package details dataframe does not exist.")
        return self.df_

    def from_npm_registry(self, package):
        """Find the keywords from the NPM registry (through its API)."""
        data_dict = self.dict_
        api_url = "https://registry.npmjs.org/" + str(package)
        try:
            api_data = requests.get(api_url).text
            json_data = json.loads(api_data)
            data_dict['name'] = json_data.get('name', '')
            data_dict['description'] = json_data.get('description', '')
            data_dict['keywords'] = json_data.get('keywords', [])
            data_dict['dependencies'] = self.get_dependencies(json_data)
        except Exception:
            logger.error("Can't fetch the keywords from the NPM registry.")
        return data_dict

    def get_version(self, api_data):
        """Give the latest version for the package."""
        if api_data:
            try:
                return api_data['dist-tags']['latest']
            except Exception:
                logger.info("Unable to fetch the latest version from API data.")
                return ''
        logger.error("API data is not available.")
        return ''

    def get_dependencies(self, api_data):
        """Give the dependencies for the latest version of the package."""
        version = self.get_version(api_data)
        logger.info("Latest version is: {}".format(version))
        versions_dict = api_data.get('versions', dict())
        try:
            if versions_dict:
                latest_version_data_dict = versions_dict.get(version, dict())
                if latest_version_data_dict:
                    latest_dependencies = latest_version_data_dict.get(
                        'dependencies', dict())
                    return list(latest_dependencies.keys())
        except Exception:
            pass
        return list()

    def clean_response(self, response_json):
        """Clean the API response JSON."""
        topic_lst = response_json['data']['organization']['repository'][
            'repositoryTopics']['nodes']
        topic_name_lst = [dict(i.get('topic')).get('name') for i in topic_lst]
        return list(topic_name_lst)

    def from_github(self, package, url_df, api_url, api_token):
        """Find the keywords through the GitHub GraphQL API."""
        url_ = self.utility.get_url(url_df, package)
        logger.info("Repo URL is {}".format(url_))
        keywords = list()
        query_params = self.utility.get_query_params(url_)
        logger.info("Query parameters are: {}, {}".format(
            query_params[0], query_params[1]))
        # The organization login and repository name come from the query params.
        payload = {
            'query': '{{organization(login: "{0}"){{name url repository(name: "{1}")'
                     '{{name url description repositoryTopics(first: 10)'
                     '{{nodes{{topic {{name}}}}}}}}}}}}'.format(
                         str(query_params[0]), str(query_params[1]))
        }
        headers = {'Authorization': 'token %s' % api_token}
        try:
            response = requests.post(url=api_url, json=payload, headers=headers)
            keywords = list(self.clean_response(response.json()))
        except Exception:
            logger.error("Unable to fetch the keywords from GitHub.")
        return keywords

    def find_keywords(self, df_, list_):
        """Find the keywords for the given list of lists of raw data."""
        package_lst = self.utility.flatten_list(list_)
        out_lst = list()
        for i in package_lst:
            pkg_kwd_lst = self.utility.make_list_from_series(
                self.from_existing_df(df_, i))
            if not pkg_kwd_lst or not isinstance(pkg_kwd_lst[2], list):
                logger.info("Finding from the NPM registry.")
                pkg_kwd_dict = self.from_npm_registry(i)
                pkg_kwd_lst = list(pkg_kwd_dict.values())
            if len(pkg_kwd_lst[2]) == 0:
                logger.info("Trying to fetch from GitHub.")
                api_url = 'https://api.github.com/graphql'
                api_token = self.get_data.github_token
                pkg_kwd_lst[2] = self.from_github(i, df_, api_url, api_token)
            out_lst.append(pkg_kwd_lst)
        return pd.DataFrame(
            out_lst, columns=['name', 'description', 'keywords', 'dependencies'])
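# Usage sketch (illustrative): the keyword lookup cascades through three
# sources: the existing dump, then the NPM registry, and finally GitHub
# repository topics. The package names below are arbitrary examples.
keywords_finder = GetKeywords(local_data_store=True)
existing_df = keywords_finder.get_data.load_existing_data()
pkg_kwd_df = keywords_finder.find_keywords(existing_df, [["express", "lodash"]])
print(pkg_kwd_df[['name', 'keywords']])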
class GetData:
    """This class defines the S3 connections, viz. fetching and storing data."""

    def __init__(self, aws_access_key_id=None, aws_secret_access_key=None,
                 aws_bucket_name='cvae-insights', local_data_store=False,
                 deployment_prefix='dev', model_version='2019-01-03'):
        """Create an instance of GetData."""
        # Environment variables take precedence over the constructor arguments.
        self.aws_access_key_id = os.environ.get('AWS_S3_ACCESS_KEY_ID',
                                                aws_access_key_id or '')
        self.aws_secret_access_key = os.environ.get('AWS_S3_SECRET_ACCESS_KEY',
                                                    aws_secret_access_key or '')
        self.github_token = os.environ.get('GITHUB_TOKEN', '')
        self.bucket_name = aws_bucket_name
        self.deployment_prefix = deployment_prefix
        self.version_name = model_version
        if local_data_store:
            # Use the on-disk test fixtures instead of S3.
            self.s3_client = LocalDataStore('tests/test_data')
        else:
            self.s3_object = AmazonS3(
                bucket_name=self.bucket_name,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key)
            self.s3_client = self.load_s3()
        self.utility = Utility()

    def load_s3(self):
        """Establish the connection with S3."""
        self.s3_object.connect()
        if self.s3_object.is_connected():
            logger.info("S3 connection established.")
            return self.s3_object
        raise Exception("Unable to establish the S3 connection.")

    def load_raw_data(self):
        """Load the raw data from the S3 bucket."""
        NPM_raw_data_path = os.path.join(self.version_name, "data/manifest.json")
        try:
            raw_data_dict_ = self.s3_client.read_json_file(NPM_raw_data_path)
            logger.info("Size of raw manifest file is: {}".format(
                len(raw_data_dict_)))
            return raw_data_dict_
        except Exception:
            raise Exception("Unable to read the raw manifest file.")

    def load_existing_data(self):
        """Load the node registry dump from the S3 bucket."""
        NPM_clean_json_data_path = os.path.join(
            self.version_name, "data/node-package-details-with-url.json")
        try:
            logger.info("Reading the node package details dump.")
            existing_data = self.s3_client.read_generic_file(
                NPM_clean_json_data_path)
            existing_df = self.utility.read_json_file(existing_data)
            logger.info("Size of raw dataframe with URL is: {}".format(
                len(existing_df)))
            return existing_df
        except Exception:
            raise Exception("S3 connection error.")
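# Usage sketch (illustrative): local_data_store=True swaps S3 for the on-disk
# test fixtures, which is what the unit tests rely on; the default arguments
# otherwise connect to the 'cvae-insights' bucket using the AWS_S3_* env vars.
local_store = GetData(local_data_store=True)    # reads from tests/test_data
raw_data = local_store.load_raw_data()          # <model_version>/data/manifest.json
s3_store = GetData(model_version='2019-01-03')  # requires S3 credentials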
class GetPreprocessData:
    """This class processes the raw data into input data for the models."""

    def __init__(self, aws_access_key_id='', aws_secret_access_key='',
                 aws_bucket_name='cvae-insights', model_version='',
                 num_train_per_user=5):
        """Create an instance for GetPreprocessData."""
        self.obj_ = GetData(aws_access_key_id=aws_access_key_id,
                            aws_secret_access_key=aws_secret_access_key,
                            aws_bucket_name=aws_bucket_name,
                            model_version=model_version,
                            num_train_per_user=num_train_per_user)
        self.keyword_obj_ = GetKeywords(self.obj_)
        self.preprocess_data_obj = PreprocessData(data_obj=self.obj_)
        self.utils = Utility()
        self.num_users = num_train_per_user

    def preprocess_data(self):
        """Preprocess the data and save it into temporary storage."""
        package_tag_map, vocabulary, manifest_user_data, unique_packages = \
            self.preprocess_data_obj.update_pkg_tag_map()
        package_tag_map = {k: list(v) for k, v in package_tag_map.items()}
        self.obj_.save_manifest_file_temporary(manifest_user_data,
                                               'manifest_user_data.dat',
                                               TEMPORARY_DATA_PATH)
        package_id_map = self.utils.create_package_map(unique_packages)
        id_package_map = dict(
            zip(range(len(unique_packages)), list(unique_packages)))
        user_train_data, item_train_data, user_test_data, item_test_data = \
            self.obj_.train_test_data()
        content_matrix = self.utils.create_content_matrix(
            package_tag_map, unique_packages, vocabulary)
        self.obj_.save_json_file_temporary(package_id_map,
                                           'package_to_index_map.json',
                                           TEMPORARY_PATH)
        self.obj_.save_json_file_temporary(id_package_map,
                                           'index_to_package_map.json',
                                           TEMPORARY_PATH)
        self.obj_.save_json_file_temporary(package_tag_map,
                                           'package_tag_map.json',
                                           TEMPORARY_PATH)
        self.obj_.save_file_temporary(
            user_train_data,
            "packagedata-train-" + str(self.num_users) + "-users.dat",
            TEMPORARY_DATA_PATH)
        self.obj_.save_file_temporary(
            user_test_data,
            "packagedata-test-" + str(self.num_users) + "-users.dat",
            TEMPORARY_DATA_PATH)
        self.obj_.save_file_temporary(
            item_train_data,
            "packagedata-train-" + str(self.num_users) + "-items.dat",
            TEMPORARY_DATA_PATH)
        self.obj_.save_file_temporary(
            item_test_data,
            "packagedata-test-" + str(self.num_users) + "-items.dat",
            TEMPORARY_DATA_PATH)
        self.obj_.save_numpy_matrix_temporary(content_matrix,
                                              'content_matrix.npz',
                                              TEMPORARY_DATA_PATH)
        logger.info("All items are saved successfully in the temporary location.")
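# End-to-end sketch (illustrative): running the whole preprocessing pipeline.
# Credentials fall back to the AWS_S3_* environment variables; the version
# string is an example value.
preprocessor = GetPreprocessData(aws_bucket_name='cvae-insights',
                                 model_version='2019-01-03',
                                 num_train_per_user=5)
preprocessor.preprocess_data()   # writes the .dat/.json/.npz artifacts to
                                 # TEMPORARY_PATH / TEMPORARY_DATA_PATH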