def __init__(self, negative_rate=1, start_ratio=0.0, end_ratio=0.81, use_cache=True):
    self.__competitor_path = path_lib.get_relative_file_path(
        'runtime', f'competitor_linkedin_dict_format_{VERSION}.json')
    self.__embedding_path = path_lib.get_relative_file_path(
        'runtime', 'processed_input', f'd_linkedin_name_2_embeddings_{VERSION}.pkl')
    self.__negative_rate = negative_rate
    self.__start_ratio = start_ratio
    self.__end_ratio = end_ratio
    self.__data = self.__load(use_cache)
def new_line(output=False):
    with open(path_lib.get_relative_file_path('log', MODEL, f'{VARIANT}.log'), 'ab') as f:
        f.write('\n'.encode('utf-8'))
    if output:
        print('')
def __load(self, use_cache):
    """ Load the data as embeddings """
    cache_path = path_lib.get_relative_file_path(
        'runtime', 'input_cache', f'company_embeddings_{VERSION}.pkl')
    if use_cache and os.path.isfile(cache_path):
        return path_lib.read_cache(cache_path)

    print(f'\nloading data from {self.__competitor_path} ...')
    with open(self.__competitor_path, 'rb') as f:
        tmp = json.load(f)
    d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']

    data = []

    print('loading sentence bert to generate embeddings ...')
    from sentence_transformers import SentenceTransformer
    self.__sentence_bert = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

    # convert the raw data to the features that we need
    for linkedin_name, linkedin_val in d_linkedin_name_2_linkedin_val.items():
        feature = self.__choose_features(linkedin_val)
        data.append([feature, linkedin_name])

    print('writing cache ...')
    path_lib.cache(cache_path, data)

    print('finished loading')
    return data
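# Usage sketch for the cache written above: path_lib.read_cache returns the same
# list of [feature, linkedin_name] rows that __load cached, which other scripts
# in this section unpack with zip (see the similarity scripts below).
from lib import path_lib
from config.path import VERSION

rows = path_lib.read_cache(path_lib.get_relative_file_path(
    'runtime', 'input_cache', f'company_embeddings_{VERSION}.pkl'))
features, names = zip(*rows)
print(f'loaded {len(names)} companies')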
def new_paragraph(output=False):
    string = '\n\n----------------------------------------------\n'
    with open(path_lib.get_relative_file_path('log', MODEL, f'{VARIANT}.log'), 'ab') as f:
        f.write(string.encode('utf-8'))
    if output:
        print(string)
def statistic():
    print('\ncomputing statistics on shared competitors among competitor pairs ... ')

    # record statistics
    shared_competitor_counts = []
    # to avoid counting the same pair twice
    d_min_name_max_name_2_has_statistic = {}

    length = len(d_name_2_competitors)
    _i = 0
    for _name_1, competitors in d_name_2_competitors.items():
        if _i % 2 == 0:
            progress = float(_i + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')
        _i += 1

        for _j, _name_2 in enumerate(list(competitors)):
            # skip pairs that have already been counted
            key = f'{min(_name_1, _name_2)}____{max(_name_1, _name_2)}'
            if key in d_min_name_max_name_2_has_statistic:
                continue
            d_min_name_max_name_2_has_statistic[key] = True

            if _name_2 not in d_name_2_competitors:
                shared_competitor_counts.append(0)
                continue

            shared_num = len(competitors.intersection(d_name_2_competitors[_name_2]))
            shared_competitor_counts.append(shared_num)

    logs.new_line()
    logs.add('statistics', 'total count of competitor companies',
             f'{len(d_name_2_competitors)}', output=True)
    logs.add('statistics', 'mean of shared competitors',
             f'among competitors: {np.mean(shared_competitor_counts)}', output=True)
    logs.add('statistics', 'std of shared competitors',
             f'among competitors: {np.std(shared_competitor_counts)}', output=True)
    logs.add('statistics', 'max of shared competitors',
             f'among competitors: {np.max(shared_competitor_counts)}', output=True)
    logs.add('statistics', 'min of shared competitors',
             f'among competitors: {np.min(shared_competitor_counts)}', output=True)

    bins = list(range(0, 53, 1))
    plt.figure(figsize=(18, 8))
    plt.hist(shared_competitor_counts, bins=bins, edgecolor='white')
    plt.title('histogram for count of shared competitors among competitors', fontsize=22)
    plt.xlabel('count of shared competitors for each similar company pair', fontsize=16)
    plt.ylabel('count of company pairs', fontsize=16)
    plt.xticks(bins)
    plt.savefig(path_lib.get_relative_file_path(
        'runtime', 'analysis', 'figures',
        'hist_for_shared_competitor_among_competitors.png'), dpi=300)
    plt.show()
    plt.close()
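# A minimal illustration of the canonical pair key used above: ordering the two
# names lexicographically makes (a, b) and (b, a) collapse onto the same key, so
# each undirected pair is counted exactly once. `pair_key` is a helper named here
# for illustration only; the project inlines this expression.
def pair_key(name_1, name_2):
    return f'{min(name_1, name_2)}____{max(name_1, name_2)}'

assert pair_key('acme', 'zenith') == pair_key('zenith', 'acme') == 'acme____zenith'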
def add(_id, function, message, _level=LEVEL_MSG, output=False):
    # construct the log message
    _time = str(time.strftime('%Y-%m-%d %H:%M:%S'))
    string = f'{_id} : {_level} : {_time} : {function} : {message}\n'

    # write the log
    with open(path_lib.get_relative_file_path('log', MODEL, f'{VARIANT}.log'), 'ab') as f:
        f.write(string.encode('utf-8'))

    # show it on the console
    if output:
        print(string.strip())
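# Usage sketch, mirroring how the analysis scripts in this section call this module:
#   logs.add('statistics', 'total count of companies', '12345', output=True)
#   logs.new_line()
# Each call appends one line of the form
#   statistics : <LEVEL_MSG> : 2021-01-01 12:00:00 : total count of companies : 12345
# to log/{MODEL}/{VARIANT}.log (timestamp and the rendering of LEVEL_MSG are illustrative).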
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from lib import path_lib

# load the similarity data
similarity_csv_path = path_lib.get_relative_file_path(
    'runtime', 'result_csv', 'linkedin_similarity.csv')
similarity_csv = pd.read_csv(similarity_csv_path)
similarities = list(similarity_csv['similarity'])
competitor_similarities = list(
    similarity_csv[similarity_csv['is_competitor'] == 'competitor']['similarity'])

print(f'\n\ncount of all company pairs: {len(similarities)}')
print(f'mean similarity for all company pairs: {np.mean(similarities)}')
print(f'std similarity for all company pairs: {np.std(similarities)}')
print(f'max similarity for all company pairs: {np.max(similarities)}')
print(f'min similarity for all company pairs: {np.min(similarities)}')

print(f'\ncount of competitor relationships: {len(competitor_similarities)}')
print(f'mean similarity for competitors: {np.mean(competitor_similarities)}')
print(f'std similarity for competitors: {np.std(competitor_similarities)}')
print(f'max similarity for competitors: {np.max(competitor_similarities)}')
print(f'min similarity for competitors: {np.min(competitor_similarities)}')

plt.figure(figsize=(14, 8))
plt.subplot(211)
plt.hist(similarities, bins=30, edgecolor='white')
plt.title('histogram for similarity of all company pairs', fontsize=22)
plt.xlabel('similarity', fontsize=16)
plt.ylabel('count of company pairs', fontsize=16)
plt.xticks(list(np.linspace(0.1, 1., 10)), fontsize=14)
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from lib import path_lib

# load the json data
json_path = path_lib.get_relative_file_path(
    'runtime', 'competitor_linkedin_dict_format_v4.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
d_min_linkedin_name_max_linkedin_name = tmp['d_min_linkedin_name_max_linkedin_name']

# collect every name that appears in a competitor pair
d_name = {}
for min_name_max_name in d_min_linkedin_name_max_linkedin_name.keys():
    name_1, name_2 = min_name_max_name.split('____')
    d_name[name_1] = True
    d_name[name_2] = True

# load the similarity data and keep only pairs involving a known competitor
similarity_csv_path = path_lib.get_relative_file_path(
    'runtime', 'result_csv', 'linkedin_similarity.csv')
similarity_csv = pd.read_csv(similarity_csv_path)
similarity_data = np.array(similarity_csv)
similarity_data = list(
    filter(lambda x: x[0] in d_name or x[2] in d_name, similarity_data))
similarities = list(map(lambda x: x[-2], similarity_data))
competitor_similarities = list(
    similarity_csv[similarity_csv['is_competitor'] == 'competitor']['similarity'])
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from lib import path_lib
from config.path import VERSION

print('\nloading the embeddings ... ')

# load embeddings
pkl_path = path_lib.get_relative_file_path(
    'runtime', 'input_cache', f'company_embeddings_{VERSION}.pkl')
company_embeddings = path_lib.read_cache(pkl_path)
X, names = list(zip(*company_embeddings))
X = np.array(X)
names = np.array(names)

print('\ncalculating the cosine distance ... ')

# calculate the pairwise cosine distances and map them to similarities
distances = cdist(X, X, 'cosine')
similarities = 1 - np.tanh(distances)

print('\nloading the competitor data ... ')

# load linkedin data
json_path = path_lib.get_relative_file_path(
    'runtime', f'competitor_linkedin_dict_format_{VERSION}.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
d_min_linkedin_name_max_linkedin_name = tmp['d_min_linkedin_name_max_linkedin_name']
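# A worked example of the distance-to-similarity mapping above: cosine distance
# lies in [0, 2], and 1 - tanh(d) maps it monotonically from 1.0 at d = 0 down
# to about 0.04 at d = 2, so identical vectors score 1 and opposite vectors near 0.
import numpy as np

for d in (0.0, 0.5, 1.0, 2.0):
    print(f'distance {d:.1f} -> similarity {1 - np.tanh(d):.3f}')
# distance 0.0 -> similarity 1.000
# distance 0.5 -> similarity 0.538
# distance 1.0 -> similarity 0.238
# distance 2.0 -> similarity 0.036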
import numpy as np
from scipy.spatial.distance import cdist
from matplotlib import pyplot as plt
from lib import path_lib
from lib import logs

print('\nloading the competitor data ... ')

# load the json data
json_path = path_lib.get_relative_file_path(
    'runtime', 'competitor_linkedin_dict_format_v4.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
d_min_linkedin_name_max_linkedin_name = tmp['d_min_linkedin_name_max_linkedin_name']

print('\nformatting the competitor data structure ... ')

# expand each canonical "min____max" pair into a symmetric name -> competitors map
d_name_2_competitors = {}
name_pairs = list(d_min_linkedin_name_max_linkedin_name.keys())
for min_name_max_name in name_pairs:
    name_1, name_2 = min_name_max_name.split('____')
    if name_1 not in d_name_2_competitors:
        d_name_2_competitors[name_1] = set()
    d_name_2_competitors[name_1].add(name_2)
    if name_2 not in d_name_2_competitors:
        d_name_2_competitors[name_2] = set()
    d_name_2_competitors[name_2].add(name_1)

print('\nloading the embeddings ... ')
def __init__(self, use_cache=True):
    self.__competitor_path = path_lib.get_relative_file_path(
        'runtime', f'competitor_linkedin_dict_format_{VERSION}.json')
    self.__data = self.__load(use_cache)
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from lib import path_lib
from lib import format_name

# load the json data
json_path = path_lib.get_relative_file_path(
    'runtime', 'competitor_linkedin_dict_format_v4.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
d_min_linkedin_name_max_linkedin_name = tmp['d_min_linkedin_name_max_linkedin_name']

# get all the competitor names
names = list(map(lambda x: x.split('____'),
                 d_min_linkedin_name_max_linkedin_name.keys()))
names_1, names_2 = list(zip(*names))
names = list(set(names_1 + names_2))

path_lib.write_json(
    path_lib.get_relative_file_path('runtime', 'competitor_names.json'), names)

public_csv = pd.read_csv(
    path_lib.get_relative_file_path('QualityControl_2016_result.csv'))
public_data = np.array(public_csv)
columns = list(public_csv.columns)

print('\nexamples of the QualityControl_2016_result')
for i, v in enumerate(columns):
    print(f'{i} : {v} : {public_data[0][i]} : {public_data[1][i]}')
def __load(self, use_cache):
    """ Load the data as embeddings """
    cache_path = path_lib.get_relative_file_path(
        'runtime', 'input_cache', f'neg_rate_{self.__negative_rate}_v2.pkl')
    if use_cache and os.path.isfile(cache_path):
        return path_lib.read_cache(cache_path)

    print(f'\nloading data from {self.__competitor_path} ...')
    tmp = path_lib.load_json(self.__competitor_path)
    d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
    d_min_linkedin_name_max_linkedin_name = tmp['d_min_linkedin_name_max_linkedin_name']
    names = list(d_linkedin_name_2_linkedin_val.keys())

    print('generating the positive and negative competitor relationships ... ')
    data = []

    print('loading sentence bert to generate embeddings ...')
    from sentence_transformers import SentenceTransformer
    self.__sentence_bert = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

    for min_name_max_name in d_min_linkedin_name_max_linkedin_name.keys():
        name_1, name_2 = min_name_max_name.split('____')

        # get features
        feature_1 = self.__choose_features(d_linkedin_name_2_linkedin_val[name_1])
        feature_2 = self.__choose_features(d_linkedin_name_2_linkedin_val[name_2])

        # add a positive competitor relationship
        data.append([feature_1, feature_2, 1, name_1, name_2])

        # add negative competitor relationships
        for i in range(self.__negative_rate):
            # randomly choose a company that is not a known competitor of name_1
            name_2_neg = self.__random_choose(
                names, name_1, d_min_linkedin_name_max_linkedin_name)
            feature_2_neg = self.__choose_features(
                d_linkedin_name_2_linkedin_val[name_2_neg])

            # randomly choose a company that is not a known competitor of name_2
            name_1_neg = self.__random_choose(
                names, name_2, d_min_linkedin_name_max_linkedin_name)
            feature_1_neg = self.__choose_features(
                d_linkedin_name_2_linkedin_val[name_1_neg])

            data.append([feature_1, feature_2_neg, 0, name_1, name_2_neg])
            data.append([feature_1_neg, feature_2, 0, name_1_neg, name_2])

    print('shuffling the data ...')
    random.shuffle(data)

    print('writing cache ...')
    path_lib.cache(cache_path, data)

    print('finished loading')
    return data
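# `__random_choose` is referenced above but not defined in this section. A minimal
# sketch of what such a helper could do (an assumption, not the project's actual
# implementation): rejection-sample candidates until one is neither the anchor
# itself nor one of its known competitors.
import random

def random_choose(names, anchor_name, d_min_name_max_name):
    # d_min_name_max_name holds every known competitor pair under its canonical key
    while True:
        candidate = random.choice(names)
        key = f'{min(anchor_name, candidate)}____{max(anchor_name, candidate)}'
        if candidate != anchor_name and key not in d_min_name_max_name:
            return candidate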
def statistic(_top_k_similar):
    _top_k_idx = top_k_idx[:, -_top_k_similar:]
    # reverse the columns so that the most similar company comes first
    _top_k_idx = _top_k_idx[:, ::-1]

    print(f'\ncomputing statistics on shared competitors for the top {_top_k_similar} '
          f'similar companies of all Linkedin companies ... ')

    # record statistics
    shared_competitor_counts = []
    # to avoid counting the same pair twice
    d_min_name_max_name_2_has_statistic = {}

    length = len(names)
    for _i, _name_1 in enumerate(names):
        if _i % 2 == 0:
            progress = float(_i + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')

        similar_names = names[_top_k_idx[_i]]
        for _j, _name_2 in enumerate(similar_names):
            # skip pairs that have already been counted
            key = f'{min(_name_1, _name_2)}____{max(_name_1, _name_2)}'
            if key in d_min_name_max_name_2_has_statistic:
                continue
            d_min_name_max_name_2_has_statistic[key] = True

            if _name_1 not in d_name_2_competitors or _name_2 not in d_name_2_competitors:
                shared_competitor_counts.append(0)
                continue

            competitor_set_1 = d_name_2_competitors[_name_1]
            competitor_set_2 = d_name_2_competitors[_name_2]
            shared_num = len(competitor_set_1.intersection(competitor_set_2))
            shared_competitor_counts.append(shared_num)

    logs.new_line()
    logs.add('statistics', 'total count of companies', f'{len(names)}', output=True)
    logs.add('statistics', 'mean of shared competitors',
             f'among top {_top_k_similar} similar companies: {np.mean(shared_competitor_counts)}',
             output=True)
    logs.add('statistics', 'std of shared competitors',
             f'among top {_top_k_similar} similar companies: {np.std(shared_competitor_counts)}',
             output=True)
    logs.add('statistics', 'max of shared competitors',
             f'among top {_top_k_similar} similar companies: {np.max(shared_competitor_counts)}',
             output=True)
    logs.add('statistics', 'min of shared competitors',
             f'among top {_top_k_similar} similar companies: {np.min(shared_competitor_counts)}',
             output=True)

    # drop the zero spike so the rest of the distribution is visible
    num_0 = len(list(filter(lambda x: x == 0, shared_competitor_counts)))
    shared_competitor_counts = list(filter(lambda x: x > 0, shared_competitor_counts))

    plt.figure(figsize=(14, 8))
    plt.hist(shared_competitor_counts, bins=[0.1, 1, 2, 3, 4, 5, 10, 20, 40],
             edgecolor='white')
    plt.title(
        f'histogram for count of shared competitors among top {_top_k_similar} '
        f'similar companies of all Linkedin companies\n'
        f'(the spike of {num_0} zero-shared-competitor pairs is removed)',
        fontsize=22)
    plt.xlabel('count of shared competitors for each similar company pair', fontsize=16)
    plt.ylabel('count of company pairs', fontsize=16)
    plt.xticks([0, 1, 2, 3, 4, 5, 10, 20, 40])
    plt.savefig(
        path_lib.get_relative_file_path(
            'runtime', 'analysis', 'figures',
            f'hist_for_shared_competitor_among_top_{_top_k_similar}_similar_companies.png'),
        dpi=300)
    plt.show()
    plt.close()
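# `top_k_idx` and `names` are defined outside this fragment. A plausible sketch,
# consistent with the function above slicing the *last* columns as the most
# similar (this is an assumption about the surrounding script, not code shown in it):
#   top_k_idx = np.argsort(similarities, axis=1)  # ascending similarity per row
#   statistic(10)                                 # analyze the top-10 neighbors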
def __load(self, use_cache):
    """ Load the data as embeddings """
    cache_path = path_lib.get_relative_file_path(
        'runtime', 'input_cache',
        f'neg_rate_{self.__negative_rate}_start_{self.__start_ratio}_end_{self.__end_ratio}.pkl')
    if use_cache and os.path.isfile(cache_path):
        return path_lib.read_cache(cache_path)

    print(f'\nloading data from {self.__competitor_path} ...')
    with open(self.__competitor_path, 'rb') as f:
        tmp = json.load(f)
    d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
    d_min_linkedin_name_max_linkedin_name = tmp['d_min_linkedin_name_max_linkedin_name']

    self.__d_linkedin_name_2_embedding = path_lib.load_pkl(self.__embedding_path)

    print('splitting dataset ...')
    name_pairs = list(d_min_linkedin_name_max_linkedin_name.keys())
    name_pairs.sort()

    total_pairs = len(name_pairs)
    start_index = int(total_pairs * self.__start_ratio)
    end_index = int(total_pairs * self.__end_ratio)
    name_pairs = name_pairs[start_index:end_index]

    names = list(d_linkedin_name_2_linkedin_val.keys())

    print('generating the positive and negative competitor relationships ... ')
    data = []

    for min_name_max_name in name_pairs:
        name_1, name_2 = min_name_max_name.split('____')

        # get features
        feature_1 = self.__choose_features(name_1, d_linkedin_name_2_linkedin_val[name_1])
        feature_2 = self.__choose_features(name_2, d_linkedin_name_2_linkedin_val[name_2])

        # add a positive competitor relationship
        data.append([feature_1, feature_2, 1, name_1, name_2])

        # add negative competitor relationships
        for i in range(int(self.__negative_rate * 2)):
            if random.randint(0, 1) == 0:
                # randomly choose a company that is not a known competitor of name_1
                name_2_neg = self.__random_choose(
                    names, name_1, d_min_linkedin_name_max_linkedin_name)
                feature_2_neg = self.__choose_features(
                    name_2_neg, d_linkedin_name_2_linkedin_val[name_2_neg])
                data.append([feature_1, feature_2_neg, 0, name_1, name_2_neg])
            else:
                # randomly choose a company that is not a known competitor of name_2
                name_1_neg = self.__random_choose(
                    names, name_2, d_min_linkedin_name_max_linkedin_name)
                feature_1_neg = self.__choose_features(
                    name_1_neg, d_linkedin_name_2_linkedin_val[name_1_neg])
                data.append([feature_1_neg, feature_2, 0, name_1_neg, name_2])

    print('shuffling the data ...')
    random.shuffle(data)

    print('writing cache ...')
    path_lib.cache(cache_path, data)

    print('finished loading')
    return data
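# Usage sketch for the ratio-based split configured in the __init__ shown earlier.
# `CompetitorDataLoader` is a placeholder name (the real class name is not shown
# in this section). Because name_pairs is sorted before slicing, the two ratio
# windows select disjoint sets of competitor pairs:
#   train_set = CompetitorDataLoader(negative_rate=1, start_ratio=0.0, end_ratio=0.81)
#   test_set = CompetitorDataLoader(negative_rate=1, start_ratio=0.81, end_ratio=1.0)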
import numpy as np
from scipy.spatial.distance import cdist
from lib import path_lib
from config.path import VERSION

# load embeddings
pkl_path = path_lib.get_relative_file_path(
    'runtime', 'input_cache', f'company_embeddings_{VERSION}.pkl')
company_embeddings = path_lib.read_cache(pkl_path)
X, names = list(zip(*company_embeddings))
X = np.array(X)
names = np.array(names)

# format the embeddings as a name -> embedding dict
d_linkedin_name_2_embeddings = {}
for i, embedding in enumerate(X):
    d_linkedin_name_2_embeddings[names[i]] = embedding

# save the results
path_lib.cache(
    path_lib.get_relative_file_path(
        'runtime', 'processed_input', f'd_linkedin_name_2_embeddings_{VERSION}.pkl'),
    d_linkedin_name_2_embeddings)

# load the data
json_path = path_lib.get_relative_file_path(
    'runtime', f'competitor_linkedin_dict_format_{VERSION}.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
d_min_linkedin_name_max_linkedin_name = tmp['d_min_linkedin_name_max_linkedin_name']
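# Usage sketch: the dict cached above is what the data loader reads via
# self.__d_linkedin_name_2_embedding = path_lib.load_pkl(self.__embedding_path)
# in the __load method shown earlier. A single company's embedding can then be
# fetched by name ('some company' is a made-up key for illustration):
#   d = path_lib.load_pkl(path_lib.get_relative_file_path(
#       'runtime', 'processed_input', f'd_linkedin_name_2_embeddings_{VERSION}.pkl'))
#   embedding = d['some company']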
        find_string = 'test_evaluation : recall: '
        if find_string in line:
            _recall.append(line[line.index(find_string) + len(find_string):])

        find_string = 'test_evaluation : f1: '
        if find_string in line:
            _f1.append(line[line.index(find_string) + len(find_string):])

    return _top_k, _acc, _precision, _recall, _f1


data = []
log_dir = path_lib.create_dir_in_root('log', 'fixed_mean_sent_emb_similarity')
for file_name in os.listdir(log_dir):
    file_path = os.path.join(log_dir, file_name)
    with open(file_path, 'rb') as f:
        content = f.readlines()
    content = list(map(lambda x: x.decode('utf-8').strip(), content))
    data += list(zip(*parse_log(content)))

data = list(map(list, data))
data.sort()

df = pd.DataFrame(data, columns=['top_k', 'acc', 'precision', 'recall', 'f1'])
df.to_csv(path_lib.get_relative_file_path('runtime', 'result_csv', 'test_similarity.csv'),
          index=False)

print('\ndone')