def evaluate(self, prefix, X1, X2, Y, record_details=False, names_1=None, names_2=None):
    """Evaluate the trained model on the given split and log the metrics.

    Args:
        prefix: label for the log section (e.g. 'train', 'val', 'test').
        X1, X2: the two input branches fed to the model as a pair.
        Y: ground-truth labels aligned with X1/X2.
        record_details: when True, additionally log one JSON record per sample
            with the prediction, ground truth, and the two company names.
        names_1, names_2: per-sample identifiers; must be non-None and aligned
            with Y when record_details is True.
    """
    ret = self.model.test((X1, X2), Y)
    logs.new_line(True)
    for indicator, score in ret.items():
        logs.add(self.M.name, f'{prefix}_evaluation', f'{indicator}: {score}', logs.LEVEL_RET, True)

    if record_details:
        # BUG FIX: predict on the data actually being evaluated, not on the
        # cached test split. The old code always predicted on
        # (self.__test_X1, self.__test_X2) and then compared those predictions
        # against the passed-in Y / names, misaligning every sample whenever
        # this method was called with train or validation data.
        predict_y = self.model.predict_label((X1, X2))
        logs.new_line(True)
        for i, v in enumerate(predict_y):
            logs.add(
                self.M.name, 'test_samples',
                json.dumps({
                    'ret': "success" if v == Y[i] else "fail",
                    'predict': int(v),
                    'ground_truth': int(Y[i]),
                    'name_1': names_1[i],
                    'name_2': names_2[i],
                }),
                logs.LEVEL_DETAIL, True)
def __init__(self):
    """Load train/val/test splits via three range-based Loaders and log the shapes.

    The split boundaries are fractions of the full data set taken from
    self.M.data_params; each split gets its own negative-sampling rate.
    """
    params = self.M.data_params
    train_end = params['train_ratio']
    val_end = train_end + params['val_ratio']

    # One loader per split, each covering a contiguous fraction of the data.
    loader_train = Loader(params['neg_rate_train'], 0, train_end, use_cache=False)
    loader_val = Loader(params['neg_rate_val'], train_end, val_end, use_cache=False)
    loader_test = Loader(params['neg_rate_test'], val_end, 1.0, use_cache=False)

    (self.__train_X1, self.__train_X2, self.__train_Y,
     self.__train_names_1, self.__train_names_2) = loader_train.all()
    (self.__val_X1, self.__val_X2, self.__val_Y,
     self.__val_names_1, self.__val_names_2) = loader_val.all()
    (self.__test_X1, self.__test_X2, self.__test_Y,
     self.__test_names_1, self.__test_names_2) = loader_test.all()

    logs.new_paragraph(True)
    shape_info = {
        'train_x': self.__train_X1.shape,
        'train_y': self.__train_Y.shape,
        'val_x': self.__val_X1.shape,
        'val_y': self.__val_Y.shape,
        'test_x': self.__test_X1.shape,
        'test_y': self.__test_Y.shape,
    }
    logs.add(self.M.name, 'data_shape', json.dumps(shape_info), logs.LEVEL_DATA, True)
def statistic():
    """Count shared competitors for each known competitor pair and plot a histogram.

    For every (company, competitor) pair in d_name_2_competitors, counts how many
    competitors the two companies have in common, logs summary statistics, and
    saves a histogram figure under runtime/analysis/figures.
    """
    print(f'\nstatistic the shared competitors for competitors ... ')

    shared_competitor_counts = []
    # symmetric "min____max" keys already counted, so each pair is counted once
    seen_pairs = {}

    total = len(d_name_2_competitors)
    idx = 0
    for company, competitors in d_name_2_competitors.items():
        if idx % 2 == 0:
            print('\rprogress: %.2f%% ' % (float(idx + 1) / total * 100.), end='')
        idx += 1

        for other in list(competitors):
            pair_key = f'{min(company, other)}____{max(company, other)}'
            if pair_key in seen_pairs:
                continue
            seen_pairs[pair_key] = True

            other_competitors = d_name_2_competitors.get(other)
            if other_competitors is None:
                # the counterpart has no competitor list of its own
                shared_competitor_counts.append(0)
            else:
                shared_competitor_counts.append(len(competitors & other_competitors))

    logs.new_line()
    logs.add('statistics', 'total count of competitors companies',
             f'{len(d_name_2_competitors)}', output=True)
    for stat_name, stat_fn in (('mean', np.mean), ('std', np.std),
                               ('max', np.max), ('min', np.min)):
        logs.add('statistics', f'{stat_name} of shared competitors',
                 f'among competitors: {stat_fn(shared_competitor_counts)}', output=True)

    bins = list(range(0, 53, 1))
    plt.figure(figsize=(18, 8))
    plt.hist(shared_competitor_counts, bins=bins, edgecolor='white')
    plt.title('histogram for count of shared competitors among competitors', fontsize=22)
    plt.xlabel('count of shared competitors for each similar company pair', fontsize=16)
    plt.ylabel('count of company pairs', fontsize=16)
    plt.xticks(bins)
    fig_path = path_lib.get_relative_file_path(
        'runtime', 'analysis', 'figures',
        'hist_for_shared_competitor_among_competitors.png')
    plt.savefig(fig_path, dpi=300)
    plt.show()
    plt.close()
def train(self, use_cache=True):
    """Instantiate the model class self.M and train it, logging the wall-clock time.

    Args:
        use_cache: forwarded to the model's train routine.
    """
    print('\nBuilding model ({}) ...'.format(self.M.TIME))
    self.model = self.M()

    print('\nTraining model ...')
    started_at = time.time()
    self.model.train(self.__X, self.__names, use_cache)
    elapsed = time.time() - started_at
    print('\nFinish training')

    logs.add(self.M.name, 'training_time', f'{elapsed}')
def train(self):
    """Instantiate the model class self.M and fit it on the two-branch train split,
    validating on the val split; logs the wall-clock training time."""
    print('\nBuilding model ({}) ...'.format(self.M.TIME))
    self.model = self.M()

    print('\nTraining model ...')
    started_at = time.time()
    train_inputs = (self.__train_X1, self.__train_X2)
    val_inputs = (self.__val_X1, self.__val_X2)
    self.model.train(train_inputs, self.__train_Y, val_inputs, self.__val_Y)
    elapsed = time.time() - started_at
    print('\nFinish training')

    logs.add(self.M.name, 'training_time', f'{elapsed}')
def __init__(self):
    """Load all samples once, split them into train/val/test, and log the shapes.

    Split ratios and the negative-sampling rate come from self.M.data_params.
    """
    params = self.M.data_params
    o_loader = Loader(negative_rate=params['neg_rate'], use_cache=True)
    train_split, val_split, test_split = o_loader.train_val_test(
        params['train_ratio'], params['val_ratio'])

    (self.__train_X1, self.__train_X2, self.__train_Y,
     self.__train_names_1, self.__train_names_2) = train_split
    (self.__val_X1, self.__val_X2, self.__val_Y,
     self.__val_names_1, self.__val_names_2) = val_split
    (self.__test_X1, self.__test_X2, self.__test_Y,
     self.__test_names_1, self.__test_names_2) = test_split

    logs.new_paragraph(True)
    shape_info = {
        'train_x': self.__train_X1.shape,
        'train_y': self.__train_Y.shape,
        'val_x': self.__val_X1.shape,
        'val_y': self.__val_Y.shape,
        'test_x': self.__test_X1.shape,
        'test_y': self.__test_Y.shape,
    }
    logs.add(self.M.name, 'data_shape', json.dumps(shape_info), logs.LEVEL_DATA, True)
def __log(self):
    """Write the run configuration (hyper-parameter dicts and output paths) to the log."""
    logs.new_line()
    # (key, value, log level) — params are serialized as JSON, paths logged raw.
    entries = (
        ('data_params', json.dumps(self.data_params), logs.LEVEL_PARAM),
        ('train_params', json.dumps(self.train_params), logs.LEVEL_PARAM),
        ('model_params', json.dumps(self.model_params), logs.LEVEL_PARAM),
        ('monitor_params', json.dumps(self.monitor_params), logs.LEVEL_PARAM),
        ('model_dir', self.model_dir, logs.LEVEL_PATH),
        ('tensorboard_dir', self.tb_dir, logs.LEVEL_PATH),
    )
    for key, value, level in entries:
        logs.add(self.name, key, value, level, True)
def statistic(_top_k_similar):
    """Count shared competitors among each company's top-k most similar companies.

    For every company, looks at its _top_k_similar nearest neighbours (per the
    closure's top_k_idx), counts shared competitors for each unique pair, logs
    summary statistics, and saves a histogram (zero-count spike removed).
    """
    # NOTE(review): [::-1] reverses the ROW order of the index matrix, while the
    # preceding slice selects columns; verify the intent was not [:, ::-1].
    # Behavior preserved as-is.
    _top_k_idx = top_k_idx[:, -_top_k_similar:]
    _top_k_idx = _top_k_idx[::-1]
    print(f'\nstatistic the shared competitors for top {_top_k_similar} similar companies of all Linkedin companies ... ')

    shared_competitor_counts = []
    # symmetric "min____max" keys already counted, so each pair is counted once
    seen_pairs = {}

    total = len(names)
    for _i, _name_1 in enumerate(names):
        if _i % 2 == 0:
            print('\rprogress: %.2f%% ' % (float(_i + 1) / total * 100.), end='')

        for _name_2 in names[_top_k_idx[_i]]:
            key = f'{min(_name_1, _name_2)}____{max(_name_1, _name_2)}'
            if key in seen_pairs:
                continue
            seen_pairs[key] = True

            if _name_1 in d_name_2_competitors and _name_2 in d_name_2_competitors:
                shared = d_name_2_competitors[_name_1] & d_name_2_competitors[_name_2]
                shared_competitor_counts.append(len(shared))
            else:
                # at least one side has no competitor list of its own
                shared_competitor_counts.append(0)

    logs.new_line()
    logs.add('statistics', 'total count of companies', f'{len(names)}', output=True)
    for stat_name, stat_fn in (('mean', np.mean), ('std', np.std),
                               ('max', np.max), ('min', np.min)):
        logs.add('statistics', f'{stat_name} of shared competitors',
                 f'among top {_top_k_similar} similar companies: {stat_fn(shared_competitor_counts)}',
                 output=True)

    # drop the dominating zero bucket from the plot but report its size in the title
    num_0 = shared_competitor_counts.count(0)
    shared_competitor_counts = [c for c in shared_competitor_counts if c > 0]

    plt.figure(figsize=(14, 8))
    plt.hist(shared_competitor_counts, bins=[0.1, 1, 2, 3, 4, 5, 10, 20, 40], edgecolor='white')
    plt.title(
        f'histogram for count of shared competitors among top {_top_k_similar} similar companies of all Linkedin companies\n(spike for ({num_0} zero shared competitors) is removed)',
        fontsize=22)
    plt.xlabel('count of shared competitors for each similar company pair', fontsize=16)
    plt.ylabel('count of company pairs', fontsize=16)
    plt.xticks([0, 1, 2, 3, 4, 5, 10, 20, 40])
    fig_path = path_lib.get_relative_file_path(
        'runtime', 'analysis', 'figures',
        f'hist_for_shared_competitor_among_top_{_top_k_similar}_similar_companies.png')
    plt.savefig(fig_path, dpi=300)
    plt.show()
    plt.close()