def __load(self, use_cache):
    """ Load the data as embeddings """
    cache_path = path_lib.get_relative_file_path(
        'runtime', 'input_cache', f'company_embeddings_{VERSION}.pkl')
    if use_cache and os.path.isfile(cache_path):
        return path_lib.read_cache(cache_path)

    print(f'\nloading data from {self.__competitor_path} ...')
    with open(self.__competitor_path, 'rb') as f:
        tmp = json.load(f)
    d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']

    data = []

    print('loading sentence bert to generate embeddings ...')
    from sentence_transformers import SentenceTransformer
    self.__sentence_bert = SentenceTransformer(
        'bert-large-nli-stsb-mean-tokens')

    # converting the raw data to features that we need
    for linkedin_name, linkedin_val in d_linkedin_name_2_linkedin_val.items():
        feature = self.__choose_features(linkedin_val)
        data.append([feature, linkedin_name])

    print('writing cache ...')
    path_lib.cache(cache_path, data)
    print('finish loading')
    return data
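__choose_features is called above but not defined in this excerpt. Below is a minimal sketch of what it might look like, assuming it concatenates a couple of text fields from the LinkedIn record and encodes them with the sentence-BERT model loaded above; the field names 'tagline' and 'description' are assumptions, not taken from the source.

def __choose_features(self, linkedin_val):
    # illustrative sketch only; the real feature selection is not shown here,
    # and the field names 'tagline' and 'description' are assumptions
    text = ' '.join([
        linkedin_val.get('tagline', ''),
        linkedin_val.get('description', ''),
    ])
    # encode the concatenated text into a single fixed-size embedding
    return self.__sentence_bert.encode([text])[0]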
import numpy as np
from scipy.spatial.distance import cdist

from lib import path_lib
from config.path import VERSION

# load embeddings
pkl_path = path_lib.get_relative_file_path(
    'runtime', 'input_cache', f'company_embeddings_{VERSION}.pkl')
company_embeddings = path_lib.read_cache(pkl_path)

X, names = list(zip(*company_embeddings))
X = np.array(X)
names = np.array(names)

# format embeddings
d_linkedin_name_2_embeddings = {}
for i, embedding in enumerate(X):
    d_linkedin_name_2_embeddings[names[i]] = embedding

# save results
path_lib.cache(
    path_lib.get_relative_file_path(
        'runtime', 'processed_input',
        f'd_linkedin_name_2_embeddings_{VERSION}.pkl'),
    d_linkedin_name_2_embeddings)

# load the data
json_path = path_lib.get_relative_file_path(
    'runtime', f'competitor_linkedin_dict_format_{VERSION}.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
d_min_linkedin_name_max_linkedin_name = tmp[
    'd_min_linkedin_name_max_linkedin_name']
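The script imports cdist but never uses it in the portion shown, which suggests a later embedding-distance lookup. Below is a minimal sketch of such a lookup over the X matrix and names array built above, assuming cosine distance; the helper name nearest_companies is hypothetical and not part of the original script.

# hypothetical helper, not part of the original script: find the companies
# whose embeddings are closest to a query company by cosine distance
def nearest_companies(query_name, top_k=5):
    query = d_linkedin_name_2_embeddings[query_name].reshape(1, -1)
    distances = cdist(query, X, metric='cosine')[0]
    ranked = np.argsort(distances)
    # skip the query company itself, then keep the top_k closest names
    return [(names[i], float(distances[i])) for i in ranked[:top_k + 1]
            if names[i] != query_name][:top_k]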
def __load(self, use_cache):
    """ Load the data as embeddings """
    cache_path = path_lib.get_relative_file_path(
        'runtime', 'input_cache',
        f'neg_rate_{self.__negative_rate}_start_{self.__start_ratio}_end_{self.__end_ratio}.pkl')
    if use_cache and os.path.isfile(cache_path):
        return path_lib.read_cache(cache_path)

    print(f'\nloading data from {self.__competitor_path} ...')
    with open(self.__competitor_path, 'rb') as f:
        tmp = json.load(f)
    d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
    d_min_linkedin_name_max_linkedin_name = tmp[
        'd_min_linkedin_name_max_linkedin_name']

    self.__d_linkedin_name_2_embedding = path_lib.load_pkl(
        self.__embedding_path)

    print('splitting dataset ...')
    name_pairs = list(d_min_linkedin_name_max_linkedin_name.keys())
    name_pairs.sort()

    total_pairs = len(name_pairs)
    start_index = int(total_pairs * self.__start_ratio)
    end_index = int(total_pairs * self.__end_ratio)
    name_pairs = name_pairs[start_index:end_index]

    names = list(d_linkedin_name_2_linkedin_val.keys())

    print('generating the positive and negative competitor relationships ...')
    data = []
    for min_name_max_name in name_pairs:
        name_1, name_2 = min_name_max_name.split('____')

        # get features
        feature_1 = self.__choose_features(
            name_1, d_linkedin_name_2_linkedin_val[name_1])
        feature_2 = self.__choose_features(
            name_2, d_linkedin_name_2_linkedin_val[name_2])

        # add positive competitor relationship
        data.append([feature_1, feature_2, 1, name_1, name_2])

        # add negative competitor relationships
        for i in range(int(self.__negative_rate * 2)):
            if random.randint(0, 1) == 0:
                # replace name_2 with a randomly chosen non-competitor of name_1
                name_2_neg = self.__random_choose(
                    names, name_1, d_min_linkedin_name_max_linkedin_name)
                feature_2_neg = self.__choose_features(
                    name_2_neg, d_linkedin_name_2_linkedin_val[name_2_neg])
                data.append(
                    [feature_1, feature_2_neg, 0, name_1, name_2_neg])
            else:
                # replace name_1 with a randomly chosen non-competitor of name_2
                name_1_neg = self.__random_choose(
                    names, name_2, d_min_linkedin_name_max_linkedin_name)
                feature_1_neg = self.__choose_features(
                    name_1_neg, d_linkedin_name_2_linkedin_val[name_1_neg])
                data.append(
                    [feature_1_neg, feature_2, 0, name_1_neg, name_2])

    print('shuffling the data ...')
    random.shuffle(data)

    print('writing cache ...')
    path_lib.cache(cache_path, data)
    print('finish loading')
    return data
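Both loader variants depend on __random_choose, which is not defined in this excerpt. Below is a minimal sketch under the assumption that it rejection-samples a company that does not form a known competitor pair with the given name; the 'min____max' key format follows the split('____') usage above.

def __random_choose(self, names, name, d_min_linkedin_name_max_linkedin_name):
    # illustrative sketch: keep drawing random companies until we find one
    # that is not `name` itself and is not a known competitor of `name`
    while True:
        candidate = random.choice(names)
        if candidate == name:
            continue
        # known positive pairs are keyed as 'min_name____max_name'
        pair_key = '____'.join(sorted([name, candidate]))
        if pair_key not in d_min_linkedin_name_max_linkedin_name:
            return candidate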
def __load(self, use_cache):
    """ Load the data as embeddings """
    cache_path = path_lib.get_relative_file_path(
        'runtime', 'input_cache', f'neg_rate_{self.__negative_rate}_v2.pkl')
    if use_cache and os.path.isfile(cache_path):
        return path_lib.read_cache(cache_path)

    print(f'\nloading data from {self.__competitor_path} ...')
    tmp = path_lib.load_json(self.__competitor_path)
    d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
    d_min_linkedin_name_max_linkedin_name = tmp[
        'd_min_linkedin_name_max_linkedin_name']

    names = list(d_linkedin_name_2_linkedin_val.keys())

    print('generating the positive and negative competitor relationships ...')
    data = []

    print('loading sentence bert to generate embeddings ...')
    from sentence_transformers import SentenceTransformer
    self.__sentence_bert = SentenceTransformer(
        'bert-large-nli-stsb-mean-tokens')

    for min_name_max_name in d_min_linkedin_name_max_linkedin_name.keys():
        name_1, name_2 = min_name_max_name.split('____')

        # get features
        feature_1 = self.__choose_features(
            d_linkedin_name_2_linkedin_val[name_1])
        feature_2 = self.__choose_features(
            d_linkedin_name_2_linkedin_val[name_2])

        # add positive competitor relationship
        data.append([feature_1, feature_2, 1, name_1, name_2])

        # add negative competitor relationships
        for i in range(self.__negative_rate):
            # randomly choose a non-competitor for each side of the pair
            name_2_neg = self.__random_choose(
                names, name_1, d_min_linkedin_name_max_linkedin_name)
            feature_2_neg = self.__choose_features(
                d_linkedin_name_2_linkedin_val[name_2_neg])

            name_1_neg = self.__random_choose(
                names, name_2, d_min_linkedin_name_max_linkedin_name)
            feature_1_neg = self.__choose_features(
                d_linkedin_name_2_linkedin_val[name_1_neg])

            data.append([feature_1, feature_2_neg, 0, name_1, name_2_neg])
            data.append([feature_1_neg, feature_2, 0, name_1_neg, name_2])

    print('shuffling the data ...')
    random.shuffle(data)

    print('writing cache ...')
    path_lib.cache(cache_path, data)
    print('finish loading')
    return data
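Each record generated above has the shape [feature_1, feature_2, label, name_1, name_2]. Below is a minimal sketch of how a caller might unpack the cached list into training arrays; the class name Loader and the public load() wrapper around the private __load are assumptions, since neither appears in this excerpt.

import numpy as np

# hypothetical caller; the actual class name and public API are not shown
loader = Loader()
data = loader.load(use_cache=True)  # assumed public wrapper around __load

# split the record fields into parallel arrays for a pairwise classifier
features_1, features_2, labels, names_1, names_2 = zip(*data)
X1 = np.array(features_1)
X2 = np.array(features_2)
y = np.array(labels)
print(X1.shape, X2.shape, y.shape)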