def __init__(self, path_source_language, path_target_language,
             train_translation_dict_path, number_tokens=5000):
    """Proc and Proc-B method class.

    Args:
        path_source_language (path): Path to source language embeddings.
        path_target_language (path): Path to target language embeddings.
        train_translation_dict_path (path): Path to train translation dictionary.
        number_tokens (int): Number of tokens per language.
    """
    # Build embeddings
    self.source_embedding_word, self.source_embedding_matrix = load_embedding(
        path_source_language, number_tokens)
    self.target_embedding_word, self.target_embedding_matrix = load_embedding(
        path_target_language, number_tokens)

    # Build train translation dictionary
    self.train_translation_source, self.train_translation_target = load_translation_dict(
        train_translation_dict_path)

    # Build word-to-index maps
    self.src_word2ind = {
        word: i for i, word in enumerate(self.source_embedding_word)
    }
    self.trg_word2ind = {
        word: i for i, word in enumerate(self.target_embedding_word)
    }

    # Build index-to-word maps
    self.src_ind2word = {
        i: word for i, word in enumerate(self.source_embedding_word)
    }
    self.trg_ind2word = {
        i: word for i, word in enumerate(self.target_embedding_word)
    }

    # Normalize embeddings
    self.norm_src_embedding_matrix = normalize_matrix(
        self.source_embedding_matrix)
    self.norm_trg_embedding_matrix = normalize_matrix(
        self.target_embedding_matrix)
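# The helper below is only a minimal sketch of the `normalize_matrix` call used above,
# assuming it performs row-wise L2 normalization of an embedding matrix (the usual
# preprocessing before cosine-similarity retrieval); the project's actual helper may differ.
import numpy as np

def normalize_matrix_sketch(matrix):
    """Return a copy of `matrix` with each row scaled to unit L2 norm."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid division by zero for all-zero rows
    return matrix / norms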
def test_simulated_gene_data(self):
    """Test DE on a simulated gene expression matrix (with no biological variance)."""
    np.random.seed(0)
    sim_mat, cell_type, sim_de = simulate_matrix()

    # get scale
    scale = np.array(sim_mat.sum(axis=0)).squeeze()
    depth = (scale + 1) / np.median(scale)
    cov = [np.log(depth)]

    # precompute distribution params
    ntfmatrix = normalize_matrix(sim_mat, scale)
    alpha = atac_de.empirical_dispersion(ntfmatrix)
    # sseq_params = cr_de.compute_sseq_params(sim_mat)
    # alpha = sseq_params['phi_g']

    de_res = atac_de.NBGLM_differential_expression(
        sim_mat,
        np.flatnonzero(cell_type == 0),
        np.flatnonzero(cell_type == 1),
        model='nb',
        test_params={
            'cov': cov,
            'alpha': alpha
        },
        verbose=False)

    sensitivity, ppv = evaluate_de_res(de_res, sim_de)
    assert sensitivity >= 0.94
    assert ppv >= 0.94
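# A minimal sketch of the `normalize_matrix(sim_mat, scale)` step above, assuming it
# rescales each barcode (column) of the count matrix by its sequencing-depth scale so
# that dispersion can be estimated on depth-normalized counts; the real helper may differ.
import numpy as np
import scipy.sparse as sp

def normalize_by_depth_sketch(matrix, scale):
    """Divide each column of a count matrix by its per-barcode scale factor."""
    scale = np.asarray(scale, dtype=float).copy()
    scale[scale == 0] = 1.0  # guard against empty barcodes
    if sp.issparse(matrix):
        return matrix.dot(sp.diags(1.0 / scale))
    return matrix / scale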
def __init__(self, inputs, load_from=None, rand_init_params=None,
             gensim_w2v=None, dic=None):
    '''rand_init_params: (rng, (voc_dim, emb_dim))
    '''
    self.inputs = inputs

    if load_from is not None:
        W_values = pickle.load(load_from)
    elif rand_init_params is not None:
        rng, (voc_dim, emb_dim) = rand_init_params
        W_values = rand_matrix(rng, 1, (voc_dim, emb_dim))

        if gensim_w2v is not None and dic is not None:
            assert gensim_w2v.vector_size == emb_dim

            n_sub = 0
            for idx, word in dic._idx2word.items():
                if word in gensim_w2v.wv:
                    W_values[idx] = gensim_w2v.wv[word]
                    n_sub += 1
            print('Substituted words by word2vec: %d/%d' % (n_sub, voc_dim))

        W_values = normalize_matrix(W_values)
    else:
        raise Exception('Invalid initial inputs!')

    self.W = theano.shared(value=W_values, name='emb_W', borrow=True)

    self.params = [self.W]
    self.outputs = self.W[inputs]
def test_normalize_matrix_standard(self):
    """
    Implements unit tests related to the normalization of a matrix
    to the standard form (zero mean, unit standard deviation).
    """
    matrix = numpy.array([[1.0, 2.0, 3.0, 4.0],
                          [3.0, 4.0, 5.0, 6.0],
                          [5.0, 6.0, 7.0, 8.0]])

    matrix_array = numpy.ravel(matrix)
    normalized_matrix_array = preprocessing.scale(matrix_array)
    normalized_matrix = normalized_matrix_array.reshape(3, 4)

    new_matrix = utils.normalize_matrix(matrix, 0, 1)
    self.assertSequenceEqual(new_matrix.tolist(), normalized_matrix.tolist())
def test_normalize_matrix_generic(self):
    """
    Implements unit tests related to the normalization of a matrix
    given a certain mean and standard deviation.
    """
    matrix = numpy.array([[1.0, 2.0, 3.0, 4.0],
                          [3.0, 4.0, 5.0, 6.0],
                          [5.0, 6.0, 7.0, 8.0]])

    new_matrix = utils.normalize_matrix(matrix, 2, 4)

    new_mean = numpy.mean(numpy.ravel(new_matrix))
    new_std = numpy.std(numpy.ravel(new_matrix))

    self.assertAlmostEqual(new_mean, 2)
    self.assertAlmostEqual(new_std, 4)
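# A minimal sketch of the `utils.normalize_matrix(matrix, mean, std)` helper the two tests
# above exercise, assuming it standardizes the matrix entries and then rescales them to the
# requested mean and standard deviation; the project's implementation may differ in detail.
import numpy

def normalize_matrix_sketch(matrix, mean, std):
    """Standardize `matrix` to zero mean / unit std, then shift and scale to (mean, std)."""
    values = numpy.ravel(matrix).astype(float)
    standardized = (values - values.mean()) / values.std()
    return (standardized * std + mean).reshape(matrix.shape)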
def word2vec():
    """Computes Word2vec embeddings, retrieving the corpus from the positive and
    negative tweet files.

    # Configs
    :dataset_version - choose preprocessing
    :emb_dataset - choose full or small dataset
    :embedding_dim - size of embeddings
    :emb_context_window - context window size
    :emb_word_min_count - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('WORD2VEC')

    if (reuse_computed
            and os.path.isfile(embeddings_dir + selected_embeddings_file + '.npy')
            and os.path.isfile(vocab_dir + vocab_file + '.pkl')):
        if verbose > 0:
            print('Reusing word2vec vocab:', vocab_file)
            print('Reusing word2vec embeddings:', selected_embeddings_file)
            print_header_str('DONE')
            print()
        return

    dataset = []
    for fn in [tweet_dir + emb_train_tweets_pos,
               tweet_dir + emb_train_tweets_neg,
               tweet_dir + emb_test_tweets]:
        with open(fn) as f:
            for line in f:
                tokens = line.strip().split()
                dataset.append(tokens)

    model = Word2Vec(dataset,
                     size=embedding_dim,
                     window=emb_context_window,
                     min_count=emb_word_min_count,
                     workers=6,
                     iter=embedding_epochs,
                     sg=1,
                     compute_loss=True)

    X = model.wv.vectors
    if embedding_norm:
        X = normalize_matrix(X)
    np.save(embeddings_dir + selected_embeddings_file, X)

    vocab = dict()
    for idx, line in enumerate(model.wv.vocab):
        vocab[line.strip()] = idx

    with open(vocab_dir + vocab_file + '.pkl', 'wb') as f:
        pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

    if verbose > 0:
        print('Vocabulary size:', len(vocab))
        print('Training loss:', model.get_latest_training_loss())
        print_header_str('DONE')
        print()
def get_topology(drones, mean, std, nrows, ncolumns):
    """
    Computes a matrix topology given the locations of the drones and the
    mean and standard deviation of the elements in the matrix.
    """
    topology = np.zeros((nrows, ncolumns))
    for drone in drones:
        topology[int(drone[0])][int(drone[1])] = 1
    if settings.DISTANCE_ENCODING:
        topology = utils.sparse_to_distance(topology)
    if settings.NORMALIZE_DATA:
        topology = utils.normalize_matrix(topology, mean, std)
    return topology
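# Hypothetical usage of get_topology: place two drones on a 4x4 grid and normalize the
# resulting matrix to some scenario statistics. The drone coordinates and statistics below
# are made up purely for illustration and are not part of the original project.
example_drones = [(0, 1), (2, 3)]
example_topology = get_topology(example_drones, mean=0.0, std=1.0,
                                nrows=4, ncolumns=4)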
def save_model_embeddings(model, opts):
    """Save model embeddings."""
    dict_dir = opts.dict_dir
    embeddings = get_model_embeddings(model)

    if opts.normalize_embeddings:
        tf.logging.info("Normalize embeddings.")
        embeddings = utils.normalize_matrix(embeddings)
        embeddings[model_keys.PADDING_ID] = 0.0

    tf.logging.debug('save embeddings = \n{}'.format(embeddings))
    save_embeddings_path = os.path.join(dict_dir, model_keys.SAVE_EMBEDDINGS_NAME)
    np.save(save_embeddings_path, embeddings)
def augment_dictionary(self, growth_rate, limit):
    """Augment the dictionary based on the Proc-B method.

    Args:
        growth_rate (int): Growth rate of the augmented dictionary.
        limit (int): Size limit of the augmented dictionary.

    Returns:

    """
    # Find NN from projected source to (original) target embedding
    neighbors_projected_src_trg = find_nearest_neighbor(
        normalize_matrix(self.proj_embedding_source_target),
        self.norm_trg_embedding_matrix,
        use_batch=True)

    # Find NN from projected target embedding to (original) source embedding
    neighbors_projected_trg_src = find_nearest_neighbor(
        normalize_matrix(self.proj_embedding_target_source),
        self.norm_src_embedding_matrix,
        use_batch=True)

    # Find matches
    matching = check_if_neighbors_match(neighbors_projected_src_trg,
                                        neighbors_projected_trg_src)

    # Make sure the dictionary does not grow too fast
    rank_pairs = [[key, value] for key, value in matching.items()]
    cnt = min(int(growth_rate * len(self.train_translation_source)), limit)
    if cnt < len(rank_pairs):
        rank_pairs = rank_pairs[:cnt]

    # Update original dictionary
    self.train_translation_source = [
        self.src_ind2word[source_index]
        for source_index in [pair[0] for pair in rank_pairs]
    ]
    self.train_translation_target = [
        self.trg_ind2word[target_index]
        for target_index in [pair[1] for pair in rank_pairs]
    ]
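# A minimal sketch of the mutual-matching step above, assuming `check_if_neighbors_match`
# keeps only word pairs that are nearest neighbours of each other in both translation
# directions (the core idea of Proc-B dictionary bootstrapping); the project helper and its
# exact data layout may differ.
def check_if_neighbors_match_sketch(neighbors_src_trg, neighbors_trg_src):
    """Return {source_index: target_index} for mutual nearest-neighbour pairs."""
    return {
        src_idx: trg_idx
        for src_idx, trg_idx in enumerate(neighbors_src_trg)
        if neighbors_trg_src[trg_idx] == src_idx
    }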
def create_source_target_embedding(self, test_translation_dict_path, use_layer=11):
    """Create embeddings for each word in the given dictionary (single words).

    Args:
        test_translation_dict_path: Path to dictionary.
        use_layer: Layer to take embeddings from.

    Returns:

    """
    # Load dictionary
    source_word_translation, target_word_translation = load_translation_dict(
        test_translation_dict_path)

    for source_index, source_word in tq.tqdm(
            enumerate(source_word_translation),
            total=len(source_word_translation)):
        # Word to index map
        self.src_word2ind[source_word] = source_index
        self.src_ind2word[source_index] = source_word

        embedding_each_term, _, _ = self.create_embedding_for_each_term(
            source_word, use_layer=use_layer)
        self.proj_embedding_source_target.append(
            embedding_each_term.squeeze())
        del embedding_each_term
        torch.cuda.empty_cache()

    for target_index, target_word in tq.tqdm(
            enumerate(target_word_translation),
            total=len(target_word_translation)):
        # Word to index map
        self.trg_word2ind[target_word] = target_index
        self.trg_ind2word[target_index] = target_word

        embedding_each_term, _, _ = self.create_embedding_for_each_term(
            target_word, use_layer=use_layer)
        self.target_embedding_matrix.append(embedding_each_term.squeeze())
        del embedding_each_term
        torch.cuda.empty_cache()

    self.proj_embedding_source_target = np.array(
        self.proj_embedding_source_target)
    self.target_embedding_matrix = np.array(self.target_embedding_matrix)
    self.norm_trg_embedding_matrix = normalize_matrix(
        self.target_embedding_matrix)
def save_model_nce_params(model, opts):
    """Save model nce weights and biases variables."""
    dict_dir = opts.dict_dir
    nce_weights, nce_biases = get_model_nce_weights_and_biases(model)

    if opts.normalize_nce_weights:
        tf.logging.info("Normalize nce weights.")
        nce_weights = utils.normalize_matrix(nce_weights)

    tf.logging.debug('save nce_weights = \n{}'.format(nce_weights))
    tf.logging.debug('save nce_biases = \n{}'.format(nce_biases))

    save_weights_path = os.path.join(dict_dir, model_keys.SAVE_NCE_WEIGHTS_NAME)
    save_biases_path = os.path.join(dict_dir, model_keys.SAVE_NCE_BIASES_NAME)
    np.save(save_weights_path, nce_weights)
    np.save(save_biases_path, nce_biases)
def main(args, outs):
    """Run this for each method x clustering key combination from split."""
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        return

    # Load the peak-BC matrix and a clustering and perform DE
    peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_peak_bc_matrix)
    clustering_h5 = args.clustering_summary['h5'][args.method]
    clustering = SingleGenomeAnalysis.load_clustering_from_h5(clustering_h5, args.clustering_key)

    mask = clustering.clusters == args.cluster
    clustering.clusters[mask] = 1
    clustering.clusters[np.logical_not(mask)] = 2

    # find depth using peak matrix and normalize
    scale = np.array(peak_matrix.m.sum(axis=0)).squeeze()
    depth = (scale + 1) / np.median(scale)
    cov_peak = [np.log(depth)]
    diffexp_peak = nb2_diffexp.run_differential_expression(
        peak_matrix.m, clustering.clusters, model='poisson', impute_rest=True,
        test_params={'cov': cov_peak}, verbose=True)

    # find empirical estimates of alpha
    tf_matrix = None
    diffexp_tf = None
    # do DE on tf-BC matrix
    if args.filtered_tf_bc_matrix is not None:
        tf_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_tf_bc_matrix)
        ntfmatrix = normalize_matrix(tf_matrix.m, scale)
        alpha_tf = nb2_diffexp.empirical_dispersion(ntfmatrix)
        barcode_GC = get_barcode_gc(args.reference_path, args.peaks, peak_matrix)
        cov_tf = [barcode_GC, np.log(depth)]
        diffexp_tf = nb2_diffexp.run_differential_expression(
            tf_matrix.m, clustering.clusters, model='nb', impute_rest=True,
            test_params={'cov': cov_tf, 'alpha': alpha_tf}, verbose=True)

    # vstack
    diffexp = diffexp_peak if tf_matrix is None else cr_diffexp.DIFFERENTIAL_EXPRESSION(
        np.vstack([diffexp_peak.data, diffexp_tf.data]))

    # write out temp file
    np.savetxt(outs.tmp_diffexp, diffexp.data, delimiter=',')

    outs.enrichment_analysis = None
    outs.enrichment_analysis_summary = None
def build_input_structure(scenarios, results, list_scenarios, list_topologies):
    """
    Function that constructs a structure that should be given as an input
    to our machine learning model.
    """
    model_struct_orig = []
    model_prediction = []

    # First the topologies without any transformation
    index_scenario = 0
    for scenario_id in list_scenarios:
        scenario_begin_index = (scenario_id - 1) * settings.SCENARIO_TOPOLOGIES_NO
        scenario_matrix = datarate_matrix(scenarios[scenario_id - 1])
        if settings.NORMALIZE_DATA:
            mean, std = utils.stats_matrix(scenario_matrix)
        for topology_id in list_topologies[index_scenario]:
            index_results = scenario_begin_index + topology_id - 1
            topology_matrix, qualities_list = drones_matrix(results.loc[index_results])
            if settings.DISTANCE_ENCODING:
                topology_matrix = utils.sparse_to_distance(topology_matrix)
            if settings.NORMALIZE_DATA:
                topology_matrix = utils.normalize_matrix(topology_matrix, mean, std)
            model_struct_orig.append([scenario_matrix, topology_matrix])
            model_prediction.append(qualities_list)
        index_scenario += 1

    if settings.USE_TRANSFORMATIONS:
        # Then the topologies with a 90 degree rotation
        model_struct_rot1, model_pred_rot1 = build_input_structure_transformation(
            model_struct_orig, model_prediction, list_scenarios, list_topologies,
            utils.rotate, 90)
        # Then the topologies with a 180 degree rotation
        model_struct_rot2, model_pred_rot2 = build_input_structure_transformation(
            model_struct_orig, model_prediction, list_scenarios, list_topologies,
            utils.rotate, 180)
        # Then the topologies with a 270 degree rotation
        model_struct_rot3, model_pred_rot3 = build_input_structure_transformation(
            model_struct_orig, model_prediction, list_scenarios, list_topologies,
            utils.rotate, 270)
        # Then the topologies with a symmetry over the 0 degree axis
        model_struct_sym1, model_pred_sym1 = build_input_structure_transformation(
            model_struct_orig, model_prediction, list_scenarios, list_topologies,
            utils.symmetric, 0)
        # Then the topologies with a symmetry over the 45 degree axis
        model_struct_sym2, model_pred_sym2 = build_input_structure_transformation(
            model_struct_orig, model_prediction, list_scenarios, list_topologies,
            utils.symmetric, 45)
        # Then the topologies with a symmetry over the 90 degree axis
        model_struct_sym3, model_pred_sym3 = build_input_structure_transformation(
            model_struct_orig, model_prediction, list_scenarios, list_topologies,
            utils.symmetric, 90)
        # Then the topologies with a symmetry over the 135 degree axis
        model_struct_sym4, model_pred_sym4 = build_input_structure_transformation(
            model_struct_orig, model_prediction, list_scenarios, list_topologies,
            utils.symmetric, 135)

        return (np.array(model_struct_orig + model_struct_rot1 + model_struct_rot2 +
                         model_struct_rot3 + model_struct_sym1 + model_struct_sym2 +
                         model_struct_sym3 + model_struct_sym4),
                (model_prediction + model_pred_rot1 + model_pred_rot2 + model_pred_rot3 +
                 model_pred_sym1 + model_pred_sym2 + model_pred_sym3 + model_pred_sym4))
    else:
        return np.array(model_struct_orig), model_prediction
def normalize_matrix(self):
    utils.normalize_matrix(self.matrix)
def auto_overlap(prex=None, graph_name=None, emb_method_name1=None,
                 emb_method_name2=None, binNum=None):
    time_start = time.time()
    print('----------------------------------------------------------')
    print("dataset: " + graph_name + '\n' + "baselines:" + emb_method_name1 + "," + emb_method_name2)

    results_base_dir = 'D:\hybridrec//results//'
    all_file_dir = 'D:\hybridrec\dataset\split_train_test//' + prex
    results_dir = 'D:\hybridrec/results//' + prex
    graph_results_dir = results_dir + graph_name + '//'
    path_scores_method1 = results_base_dir + prex + graph_name + "//" + graph_name + "_" + emb_method_name1 + "_scores.mat"
    path_scores_method2 = results_base_dir + prex + graph_name + "//" + graph_name + "_" + emb_method_name2 + "_scores.mat"

    if not (os.path.exists(path_scores_method1) and os.path.exists(path_scores_method2)):
        print("dataset: " + graph_name + '----' + "baselines:" + emb_method_name1 + "," +
              emb_method_name2 + ': scores not fully computed')
    if os.path.exists(path_scores_method1) and os.path.exists(path_scores_method2):
        # Get the normalized scores
        scores_matrix_one_dict = (loadmat(path_scores_method1))
        scores_matrix_two_dict = (loadmat(path_scores_method2))
        scores_matrix_one = scores_matrix_one_dict['scores']
        scores_matrix_two = scores_matrix_two_dict['scores']
        if emb_method_name1 not in all_embedding_methods:
            scores_matrix_one = csr_matrix(np.triu(scores_matrix_one.A, k=1))  # k=1 excludes the diagonal
        if emb_method_name2 not in all_embedding_methods:
            scores_matrix_two = csr_matrix(np.triu(scores_matrix_two.A, k=1))
        scores_matrix_one_norm = normalize_matrix(
            csr_matrix1=csr_matrix(scores_matrix_one))  # dropping the csr_matrix() wrapper around the argument would ...
        scores_matrix_two_norm = normalize_matrix(
            csr_matrix1=csr_matrix(scores_matrix_two))

        # Get train_binary and test_binary
        graph_train_path = get_trainset_path(base_dir=all_file_dir,
                                             graph_name=graph_name,
                                             connected_pattern='undirected',
                                             from_zeros_one='0')
        graph_test_path = get_testset_path(base_dir=all_file_dir, graph_name=graph_name)
        G = read_graph(weighted=0, input=graph_train_path, directed=0)
        train_binary = csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
        train_binary = csr_matrix(np.triu(train_binary.A, k=1))
        test_binary = get_test_matrix_binary(graph_test_path=graph_test_path,
                                             N=train_binary.shape[0])

        # Read the raw (unnormalized) plus scores
        plus_scores_name = 'plus_' + graph_name + '_' + emb_method_name1 + '_' + emb_method_name2 + '_scores.mat'
        plus_scores_path = graph_results_dir + plus_scores_name
        scores_matrix_plus_dict = (loadmat(plus_scores_path))
        scores_matrix_plus = scores_matrix_plus_dict['scores']
        # Read the raw (unnormalized) multiply scores
        multiply_scores_name = 'multiply_' + graph_name + '_' + emb_method_name1 + '_' + emb_method_name2 + '_scores.mat'
        multiply_scores_path = graph_results_dir + multiply_scores_name
        scores_matrix_multiply_dict = (loadmat(multiply_scores_path))
        scores_matrix_multiply = scores_matrix_multiply_dict['scores']
        # Read the raw (unnormalized) MLP scores
        mlp_scores_name = 'mlp_' + graph_name + '_' + emb_method_name1 + '_' + emb_method_name2 + '_scores.mat'
        mlp_scores_path = graph_results_dir + mlp_scores_name
        scores_matrix_mlp_dict = (loadmat(mlp_scores_path))
        scores_matrix_mlp = scores_matrix_mlp_dict['scores']

        # Normalize the hybrid scores
        scores_matrix_plus_norm = normalize_matrix(csr_matrix1=scores_matrix_plus)
        scores_matrix_multiply_norm = normalize_matrix(csr_matrix1=scores_matrix_multiply)
        scores_matrix_mlp_norm = normalize_matrix(csr_matrix1=scores_matrix_mlp)

        # Compute the rasterization grids of plus, multiply, mlp and PNR
        mlp_path = results_base_dir + prex + graph_name + "//" + "mlp_" + graph_name + "_" + emb_method_name1 + "_" + emb_method_name2 + "_50_count.mat"
        mlp_dict = (loadmat(mlp_path))
        mlp_raster_grids = mlp_dict["count"]
        multiply_path = results_base_dir + prex + graph_name + "//" + "multiply_" + graph_name + "_" + emb_method_name1 + "_" + emb_method_name2 + "_50_count.mat"
        multiply_dict = (loadmat(multiply_path))
        multiply_raster_grids = multiply_dict["count"]
        plus_path = results_base_dir + prex + graph_name + "//" + "plus_" + graph_name + "_" + emb_method_name1 + "_" + emb_method_name2 + "_50_count.mat"
        plus_dict = (loadmat(plus_path))
        plus_raster_grids = plus_dict["count"]
        # plus_raster_grids = rasterization_grids(binNum=binNum, train_binary=train_binary, scores_matrix_DNN=scores_matrix_plus_norm, scores_matrix_one_norm=scores_matrix_one_norm, scores_matrix_two_norm=scores_matrix_two_norm)
        # multiply_raster_grids = rasterization_grids(binNum=binNum, train_binary=train_binary, scores_matrix_DNN=scores_matrix_multiply_norm, scores_matrix_one_norm=scores_matrix_one_norm, scores_matrix_two_norm=scores_matrix_two_norm)
        # mlp_raster_grids = rasterization_grids(binNum=binNum, train_binary=train_binary, scores_matrix_DNN=scores_matrix_mlp_norm, scores_matrix_one_norm=scores_matrix_one_norm, scores_matrix_two_norm=scores_matrix_two_norm)
        PNR_path = results_base_dir + prex + graph_name + "//" + "PNR2_" + graph_name + "_" + emb_method_name1 + "_" + emb_method_name2 + "_50_count.mat"
        PNR_dict = (loadmat(PNR_path))
        PNR_raster_grids = PNR_dict["count"]

        exist_binary = csr_matrix(np.triu(train_binary.A, k=1))  # k=1 excludes the diagonal
        nonexist_binary = csr_matrix(
            np.triu(np.ones(exist_binary.shape), k=1) - exist_binary.A)

        # Get the nonexist_scores_list of plus
        nonexist_scores_plus_list = transfer_scores_PNR(
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm,
            train_binary=train_binary,
            PNR=plus_raster_grids,
            interval=float((1.0 - 0.0) / binNum),
            binNum=binNum)
        # Get the nonexist_scores_list of multiply
        nonexist_scores_multiply_list = transfer_scores_PNR(
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm,
            train_binary=train_binary,
            PNR=multiply_raster_grids,
            interval=float((1.0 - 0.0) / binNum),
            binNum=binNum)
        # Get the nonexist_scores_list of mlp
        nonexist_scores_mlp_list = transfer_scores_PNR(
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm,
            train_binary=train_binary,
            PNR=mlp_raster_grids,
            interval=float((1.0 - 0.0) / binNum),
            binNum=binNum)
        # Get the nonexist_scores_list of PNR
        nonexist_scores_PNR_list = transfer_scores_PNR(
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm,
            train_binary=train_binary,
            PNR=PNR_raster_grids,
            interval=float((1.0 - 0.0) / binNum),
            binNum=binNum)

        # Get the thresholds
        E_test = np.sum(test_binary.A)
        thresold_plus = get_list_thresold(nonexist_scores_plus_list, L=E_test)
        thresold_multiply = get_list_thresold(nonexist_scores_multiply_list, L=E_test)
        thresold_mlp = get_list_thresold(nonexist_scores_mlp_list, L=E_test)
        thresold_PNR = get_list_thresold(nonexist_scores_PNR_list, L=E_test)
        # the trick here: L = 1/2 |E_test|!!!!!!!!!!!
        # thresold_plus = int(thresold_plus*0.5)
        # thresold_multiply = int(thresold_multiply * 0.5)
        # thresold_mlp = int(thresold_mlp * 0.5)
        # thresold_PNR = int(thresold_PNR * 0.5)

        # Modify the grids
        plus_raster_grids = plus_raster_grids.A
        multiply_raster_grids = multiply_raster_grids.A
        mlp_raster_grids = mlp_raster_grids.A
        PNR_raster_grids = PNR_raster_grids.A
        # np.where(plus_raster_grids > thresold_plus, plus_raster_grids, 0)
        # np.where(multiply_raster_grids > thresold_multiply, multiply_raster_grids, 0)
        # np.where(mlp_raster_grids > thresold_mlp, mlp_raster_grids, 0)
        # np.where(PNR_raster_grids > thresold_PNR, PNR_raster_grids, 0)
        plus_raster_grids[plus_raster_grids <= thresold_plus] = 0.0
        multiply_raster_grids[multiply_raster_grids <= thresold_multiply] = 0.0
        mlp_raster_grids[mlp_raster_grids <= thresold_mlp] = 0.0
        PNR_raster_grids[PNR_raster_grids <= thresold_PNR] = 0.0
        plus_raster_grids[plus_raster_grids >= thresold_plus] = 1.0
        multiply_raster_grids[multiply_raster_grids >= thresold_multiply] = 1.0
        mlp_raster_grids[mlp_raster_grids >= thresold_mlp] = 1.0
        PNR_raster_grids[PNR_raster_grids >= thresold_PNR] = 1.0

        # Plotting
        # colors = ['OrangeRed', 'darkseagreen', 'dodgerblue', 'blueviolet']
        colors = ['Red', 'green', 'blue', 'purple']
        result = np.float32(PNR_raster_grids)
        result = cv2.GaussianBlur(result, (5, 5), 0)  # (5, 5) is a 5x5 Gaussian kernel; the standard deviation is 0
        title = graph_name + '-PNR-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf_overlap(result=result, title=title, color=colors[0])

        result = np.float32(plus_raster_grids)
        result = cv2.GaussianBlur(result, (5, 5), 0)
        title = graph_name + '-plus-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf_overlap(result=result, title=title, color=colors[1])

        result = np.float32(multiply_raster_grids)
        result = cv2.GaussianBlur(result, (5, 5), 0)
        title = graph_name + '-multiply-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf_overlap(result=result, title=title, color=colors[2])

        result = np.float32(mlp_raster_grids)
        result = cv2.GaussianBlur(result, (5, 5), 0)
        title = graph_name + '-mlp-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf_overlap(result=result, title=title, color=colors[3])

        # # Compute the plus rasterization grids
        # plus_raster_grids = rasterization_grids(binNum=plus_binNum, train_binary=train_binary, scores_matrix_DNN=scores_matrix_plus_norm, scores_matrix_one_norm=scores_matrix_one_norm, scores_matrix_two_norm=scores_matrix_two_norm)
        # # plus_raster_grids = np.log10(plus_raster_grids)  # produces -inf and raises an error
        # plus_raster_grids = normalize_matrix_full(csr_matrix1=csr_matrix(plus_raster_grids))
        # plus_raster_grids = better_show_grids(csr_matrix1=plus_raster_grids)
        #
        # source = np.float32(plus_raster_grids.A)
        # result = cv2.GaussianBlur(source, (5, 5), 0)
        # title = graph_name + '-' + 'plus' + '-' + emb_method_name1 + '-' + emb_method_name2
        # plot_contourf(result=result, title=title, binNum=10)
        #

    time_end = time.time()
    print("It takes : " + str((time_end - time_start) / 60.0) + " mins.")
    pass
def clew_induction(path_source_language, path_target_language,
                   train_translation_dict_path, train_translation_dict_1k_path,
                   test_translation_dict_path, new_test_translation_path,
                   name_translation, number_tokens=100000, save_embedding=False):
    """Induce Cross-Lingual Word Embeddings (Proc, Proc-B, VecMap) and evaluate them on the BLI task.

    Args:
        path_source_language (path): Path to source embedding.
        path_target_language (path): Path to target embedding.
        train_translation_dict_path (path): Path to 5k training translation dictionary.
        train_translation_dict_1k_path (path): Path to 1k training translation dictionary.
        test_translation_dict_path (path): Path to test translation dictionary.
        new_test_translation_path (path): Path to the cut test translation dictionary.
        name_translation (str): Name of the saved files.
        number_tokens (int): Number of tokens used for monolingual word embeddings.
        save_embedding (boolean): Whether to save the created CLWE.

    Returns:

    """
    print("\nFirst, we cut the test dictionaries to the monolingual vocabularies:")
    cut_dictionary_to_vocabulary(path_source_language,
                                 path_target_language,
                                 test_translation_dict_path,
                                 new_test_translation_path,
                                 number_tokens=number_tokens)
    test_translation_dict_path = new_test_translation_path

    # PROC - 5K dictionary
    print("--------------------------------")
    print("\nCreate procrustes model with 5000 translation pairs")
    proc_algorithm = Projection_based_clwe(path_source_language,
                                           path_target_language,
                                           train_translation_dict_path,
                                           number_tokens=number_tokens)
    proc_algorithm.proc(source_to_target=True)
    Evaluator(proc_algorithm, test_translation_dict_path).evaluation_on_BLI()
    if save_embedding:
        save_clew(proc_algorithm, name_translation + "_proc_5k")
    del proc_algorithm

    # PROC - 1K dictionary
    print("--------------------------------")
    print("\nCreate procrustes model with 1000 translation pairs")
    proc_algorithm = Projection_based_clwe(path_source_language,
                                           path_target_language,
                                           train_translation_dict_1k_path,
                                           number_tokens=number_tokens)
    proc_algorithm.proc(source_to_target=True)
    Evaluator(proc_algorithm, test_translation_dict_path).evaluation_on_BLI()
    if save_embedding:
        save_clew(proc_algorithm, name_translation + "_proc_1k")
    del proc_algorithm

    # PROC-B - 1K dictionary
    print("--------------------------------")
    print("\nCreate procrustes bootstrapping model with 1000 translation pairs")
    proc_b_algorithm = Projection_based_clwe(path_source_language,
                                             path_target_language,
                                             train_translation_dict_1k_path,
                                             number_tokens=number_tokens)
    proc_b_algorithm.proc_bootstrapping(growth_rate=1.5, limit=10000)
    Evaluator(proc_b_algorithm, test_translation_dict_path).evaluation_on_BLI()
    if save_embedding:
        save_clew(proc_b_algorithm, name_translation + "_proc_b_1k")
    del proc_b_algorithm

    # Unsupervised VecMap
    print("--------------------------------")
    print("\nCreate VecMap model")
    vec_map = VecMap(path_source_language, path_target_language,
                     number_tokens=100000)
    # Please use GPU if available and install cupy
    use_gpu = True
    vec_map.build_seed_dictionary(use_gpu)
    vec_map.training_loop(use_gpu)
    Evaluator(vec_map, test_translation_dict_path).evaluation_on_BLI()
    if save_embedding:
        vec_map.proj_embedding_source_target = normalize_matrix(
            vec_map.proj_embedding_source_target)
        vec_map.target_embedding_matrix = vec_map.norm_trg_embedding_matrix
        save_clew(vec_map, name_translation + "_vecmap")
    del vec_map

    # Text Encoder First Layer
    print("--------------------------------")
    print("\nCreate Text Encoder First Layer model")
    xlm_r = TextEncoders("xlm-r")
    xlm_r.create_source_target_embedding(test_translation_dict_path, use_layer=1)
    Evaluator(xlm_r, test_translation_dict_path).evaluation_on_BLI()
    del xlm_r

    # Text Encoder Last Layer
    print("--------------------------------")
    print("\nCreate Text Encoder Last Layer model")
    xlm_r_last_layer = TextEncoders("xlm-r")
    xlm_r_last_layer.create_source_target_embedding(test_translation_dict_path,
                                                    use_layer=12)
    Evaluator(xlm_r_last_layer, test_translation_dict_path).evaluation_on_BLI()
    del xlm_r_last_layer
def glove():
    """Computes GloVe embeddings given a vocabulary and a corresponding
    cooccurrence matrix.

    # Configs
    :dataset_version - choose preprocessing
    :emb_dataset - choose full or small dataset
    :embedding_dim - size of embeddings
    :emb_context_window - context window size
    :emb_word_min_count - minimum word count for a word to appear in vocab
    :glove_polarization - polarization factor for embedding initialization (with rel. freq)
    """
    if verbose > 0:
        print_header_str('EMBEDDINGS')

    if reuse_computed and os.path.isfile(embeddings_dir + selected_embeddings_file + '.npy'):
        if verbose > 0:
            print('Reusing embeddings:', selected_embeddings_file)
            print_header_str('DONE')
            print()
        return

    if verbose > 0:
        print("Loading cooccurrence matrix.")

    with open(vocab_dir + cooc_file + '.pkl', 'rb') as f:
        cooc = pickle.load(f)
    nmax = 100

    if verbose > 0:
        print("\tUsing nmax =", nmax, ", with cooc.max() =", cooc.max(), end='\n\n')
        print("Initializing embeddings with U~[-.5,.5] distribution: ",
              (cooc.shape[0], embedding_dim + 1),
              (cooc.shape[1], embedding_dim + 1),
              flush=True, end='\n\n')

    xs = np.random.uniform(size=(cooc.shape[0], embedding_dim + 1)) - .5
    ys = np.random.uniform(size=(cooc.shape[1], embedding_dim + 1)) - .5
    xs /= (embedding_dim + 1)
    ys /= (embedding_dim + 1)

    # Bias term is incorporated in word embedding
    xs[:, embedding_dim] = 1
    ys[:, embedding_dim - 1] = 1

    if glove_polarization > 0:
        if verbose > 0:
            print('Adding polarization to random initial embeddings. Factor:',
                  glove_polarization, end='\n\n')

        ### Get bias for positive and negative words ###
        vocab_pos = pickle.load(open(tweet_dir + emb_polar_vocab.format('pos'), 'rb'))
        vocab_neg = pickle.load(open(tweet_dir + emb_polar_vocab.format('neg'), 'rb'))

        polarization = sentiment_polarization(vocab_pos, vocab_neg)

        vocab = pickle.load(open(vocab_dir + vocab_file + '.pkl', 'rb'))

        ############### Add polarization ################
        split = (embedding_dim - 1) // 2
        for word, id in vocab.items():
            if word in polarization:
                polar = polarization[word]
            else:
                polar = .5
            xs[id, :split] += glove_polarization * polar / (embedding_dim + 1)
            xs[id, split:embedding_dim - 1] -= glove_polarization * (1 - polar) / (embedding_dim + 1)
            ys[id, :split] += glove_polarization * polar / (embedding_dim + 1)
            ys[id, split:embedding_dim - 1] -= glove_polarization * (1 - polar) / (embedding_dim + 1)
        #################################################

    eta = 0.05
    alpha = 3 / 4

    prev_loss = 0.0
    data = [(i, j, n) for i, j, n in zip(cooc.row, cooc.col, cooc.data)]

    for ix, jy, n in data:
        w = min(1., (n / nmax) ** alpha)
        x, y = xs[ix], ys[jy]
        increase_mul = 2 * eta * w * (log(n) - np.dot(x, y))
        x_upd = xs[ix] + increase_mul * y
        y_upd = ys[jy] + increase_mul * x
        prev_loss += w * (log(n) - np.dot(x_upd, y_upd)) ** 2

    for epoch in range(embedding_epochs):
        loss = 0.0
        random.shuffle(data)

        if verbose == 1:
            print_progress_bar(0, len(data),
                               prefix='Epoch {:2d}/{:2d}:'.format(epoch + 1, embedding_epochs),
                               suffix='- loss difference {:8.2f}'.format(loss - prev_loss))

        counter, missed_updates = 0, 0
        for ix, jy, n in data:
            counter += 1
            w = min(1., (n / nmax) ** alpha)
            x, y = xs[ix], ys[jy]
            increase_mul = 2 * eta * w * (log(n) - np.dot(x, y))
            x_upd = xs[ix] + increase_mul * y
            y_upd = ys[jy] + increase_mul * x
            loss_delta = w * (log(n) - np.dot(x_upd, y_upd)) ** 2

            # Undo the current update
            if (np.isnan(x_upd).any() or np.isinf(x_upd).any()
                    or np.isnan(y_upd).any() or np.isinf(y_upd).any()
                    or np.isnan(loss + loss_delta) or np.isinf(loss + loss_delta)):
                missed_updates += 1
                loss += w * (log(n) - np.dot(xs[ix], ys[jy])) ** 2
                if (counter % 5000 == 0 or counter == len(data)) and verbose == 1:
                    print_progress_bar(counter, len(data),
                                       prefix='Epoch {:2d}/{:2d}:'.format(epoch + 1, embedding_epochs),
                                       suffix='- loss difference {:8.2f}'.format(loss - prev_loss))
                continue

            xs[ix] = x_upd
            ys[jy] = y_upd

            # Reset bias
            xs[ix, embedding_dim] = 1
            ys[jy, embedding_dim - 1] = 1

            loss += loss_delta

            if (counter % 50000 == 0 or counter == len(data)) and verbose == 1:
                print_progress_bar(counter, len(data),
                                   prefix='Epoch {:2d}/{:2d}:'.format(epoch + 1, embedding_epochs),
                                   suffix='- loss difference {:8.2f}'.format(loss - prev_loss))

        ### BOLD DRIVER LEARNING RATE ###
        if prev_loss > loss or epoch == 0:
            eta += 0.01 * eta
        else:
            eta /= 2
        prev_loss = loss
        #################################

        if verbose > 0:
            print("Epoch {:2d} loss : {:10.2f}".format(epoch + 1, loss))
            print('Missed {:4d} updates due to overflow prevention'.format(missed_updates))
            print('Current learning rate: {:1.3f}'.format(eta), end='\n\n', flush=True)

        if (epoch + 1) % 10 == 0 and epoch + 1 != embedding_epochs:
            X = xs[:, :embedding_dim]
            if embedding_norm:
                X = normalize_matrix(X)
            np.save(embeddings_dir + glove_embedding_file_suffix(epoch + 1), X)

    # Note: the bias for xs is in position embedding_dim-1
    X = xs[:, :embedding_dim]
    if embedding_norm:
        X = normalize_matrix(X)
    np.save(embeddings_dir + glove_embedding_file_suffix(embedding_epochs), X)

    if verbose > 0:
        print_header_str('DONE')
        print()
def auto_PNR(prex=None, graph_name=None, emb_method_name1=None, emb_method_name2=None):
    print('----------------------------------------------------------')
    time_start = time.time()

    # Initialize the paths of the training and test sets
    # prex = 'preprocessing_code2//'  # change here
    all_file_dir = 'D:\hybridrec\dataset\split_train_test//' + prex
    binNum = 50  # change here
    emb_method_name1 = emb_method_name1.lower()  # change here
    emb_method_name2 = emb_method_name2.lower()  # change here
    print("dataset: " + graph_name + '\n' + "baselines:" + emb_method_name1 + "," + emb_method_name2)

    conf_method1 = None
    conf_method2 = None
    if emb_method_name1 in all_embedding_methods:
        config_path_method1 = 'conf/' + emb_method_name1 + '.properties'
        config_method1 = configparser.ConfigParser()
        config_method1.read(config_path_method1)
        conf_method1 = dict(config_method1.items("hyperparameters"))
    if emb_method_name2 in all_embedding_methods:
        config_path_method2 = 'conf/' + emb_method_name2 + '.properties'
        config_method2 = configparser.ConfigParser()
        config_method2.read(config_path_method2)
        conf_method2 = dict(config_method2.items("hyperparameters"))

    # Initialize the embedding and scores paths
    results_dir = 'D:\hybridrec/results//' + prex
    graph_results_dir = results_dir + graph_name + '//'

    # Compute embeddings for method 1
    if not ((emb_method_name1 == 'arope') or (emb_method_name1 == 'graph2gauss')
            or (is_heuristic_method(emb_method_name1) == True)):
        graph_train_path = get_trainset_path(
            base_dir=all_file_dir,
            graph_name=graph_name,
            connected_pattern=get_connp(emb_method_name1),
            from_zeros_one=get_from_zeros_one(emb_method_name1))
        graph_results_path = graph_results_dir + graph_name + '_' + emb_method_name1 + '.emb'
        if not os.path.isfile(graph_results_path):
            run_emb_method(input=graph_train_path,
                           output=graph_results_path,
                           emb_method_name=emb_method_name1)

    # Compute embeddings for method 2
    if not ((emb_method_name2 == 'arope') or (emb_method_name2 == 'graph2gauss')
            or (is_heuristic_method(emb_method_name2) == True)):
        graph_train_path = get_trainset_path(
            base_dir=all_file_dir,
            graph_name=graph_name,
            connected_pattern=get_connp(emb_method_name2),
            from_zeros_one=get_from_zeros_one(emb_method_name2))
        graph_results_path = graph_results_dir + graph_name + '_' + emb_method_name2 + '.emb'
        if not os.path.isfile(graph_results_path):
            run_emb_method(input=graph_train_path,
                           output=graph_results_path,
                           emb_method_name=emb_method_name2)

    # Compute scores for method 1
    if conf_method1 != None:
        embedding_size_method1 = int(conf_method1['embedding_size'])
    if emb_method_name1 == 'splitter':
        scores_matrix_one = inner_product_scores_splitter(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name1,
            col_start=0,
            col_end=embedding_size_method1 + 1,
            skiprows=1,
            delimiter=',')
    elif (emb_method_name1 == 'attentionwalk') or (emb_method_name1 == 'grarep'):
        scores_matrix_one = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name1,
            col_start=0,
            col_end=embedding_size_method1 + 1,
            skiprows=1,
            delimiter=',')
    elif (emb_method_name1 == 'drne') or (emb_method_name1 == 'prune'):
        scores_matrix_one = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name1,
            col_start=0,
            col_end=embedding_size_method1,
            skiprows=0,
            delimiter=' ')  # some embedding_size_method values need +1 and some do not
    elif (emb_method_name1 == 'arope'):
        scores_matrix_one = inner_product_scores_arope(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir)
    elif (emb_method_name1 == 'graph2gauss'):
        scores_matrix_one = energy_kl_scores_graph2gauss(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir)
    elif is_heuristic_method(emb_method_name1):
        scores_matrix_one = heuristic_scores(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir,
            heuristic_method=emb_method_name1)
    else:
        scores_matrix_one = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name1,
            col_start=0,
            col_end=embedding_size_method1 + 1,
            skiprows=1,
            delimiter=' ')

    # Compute scores for method 2
    if conf_method2 != None:
        embedding_size_method2 = int(conf_method2['embedding_size'])
    if emb_method_name2 == 'splitter':
        scores_matrix_two = inner_product_scores_splitter(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name2,
            col_start=0,
            col_end=embedding_size_method2 + 1,
            skiprows=1,
            delimiter=',')
    elif (emb_method_name2 == 'attentionwalk') or (emb_method_name2 == 'grarep'):
        scores_matrix_two = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name2,
            col_start=0,
            col_end=embedding_size_method2 + 1,
            skiprows=1,
            delimiter=',')
    elif (emb_method_name2 == 'drne') or (emb_method_name2 == 'prune'):
        scores_matrix_two = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name2,
            col_start=0,
            col_end=embedding_size_method2,
            skiprows=0,
            delimiter=' ')
    elif (emb_method_name2 == 'arope'):
        scores_matrix_two = inner_product_scores_arope(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir)
    elif (emb_method_name2 == 'graph2gauss'):
        scores_matrix_two = energy_kl_scores_graph2gauss(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir)
    elif is_heuristic_method(emb_method_name2):
        scores_matrix_two = heuristic_scores(
            all_file_dir=all_file_dir,
            graph_name=graph_name,
            graph_results_dir=graph_results_dir,
            heuristic_method=emb_method_name2)
    else:
        scores_matrix_two = inner_product_scores(
            graph_results_dir=graph_results_dir,
            dataset_name=graph_name,
            emb_method_name=emb_method_name2,
            col_start=0,
            col_end=embedding_size_method2 + 1,
            skiprows=1,
            delimiter=' ')

    # Take the upper triangle of the scores
    # (note: 1. earlier steps must guarantee that all scores live in the upper triangle or
    #  fill the whole matrix; 2. some of them are upper-triangular, some fill the whole matrix)
    # scores_matrix_one_full = scores_matrix_one.A
    # scores_matrix_two_full = scores_matrix_two.A
    # plot_matrix(matrix = scores_matrix_one_full)
    # plot_matrix(matrix = scores_matrix_two_full)
    scores_matrix_one = sp.csr_matrix(np.triu(scores_matrix_one.A, k=1))  # k=1 excludes the diagonal
    scores_matrix_two = sp.csr_matrix(np.triu(scores_matrix_two.A, k=1))

    # Read the binary training data
    graph_train_path = get_trainset_path(base_dir=all_file_dir,
                                         graph_name=graph_name,
                                         connected_pattern='undirected',
                                         from_zeros_one='0')
    G = read_graph(weighted=0, input=graph_train_path, directed=0)
    train_binary = sp.csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
    train_binary = sp.csr_matrix(np.triu(train_binary.A, k=1))
    # train_binary_full = train_binary.A
    # or: train_binary = sp.csr_matrix(np.array(nx.to_numpy_matrix(G)))

    # Build the binary matrices of existing and non-existing links
    exist_binary = sp.csr_matrix(np.triu(train_binary.A, k=1))  # k=1 excludes the diagonal
    nonexist_binary = sp.csr_matrix(
        np.triu(np.ones(exist_binary.shape), k=1) - exist_binary.A)

    # Normalize the scores to [0.0, 1.0]
    scores_matrix_one_norm = normalize_matrix(csr_matrix1=scores_matrix_one)
    scores_matrix_two_norm = normalize_matrix(csr_matrix1=scores_matrix_two)
    # plot_matrix(scores_matrix_one_norm.A)
    # plot_matrix(scores_matrix_two_norm.A)
    del scores_matrix_one, scores_matrix_two
    gc.collect()

    # Divide into bins
    val_max = 1.0
    val_min = 0.0
    # bin_array = sorted(divide_bin(val_max=val_max, val_min=val_min, binNum=binNum))
    interval = float((val_max - val_min) / binNum)

    # Get the scores of exist_binary and nonexist_binary
    exist_scores_one_list = (np.array(scores_matrix_one_norm[exist_binary > 0], dtype=float))[0]
    nonexist_scores_one_list = (np.array(scores_matrix_one_norm[nonexist_binary > 0], dtype=float))[0]
    exist_scores_two_list = (np.array(scores_matrix_two_norm[exist_binary > 0], dtype=float))[0]
    nonexist_scores_two_list = (np.array(scores_matrix_two_norm[nonexist_binary > 0], dtype=float))[0]
    # # Convert to sparse matrices
    # exist_scores_one_list_csr = sp.csr_matrix(exist_scores_one_list)
    # nonexist_scores_one_list_csr = sp.csr_matrix(nonexist_scores_one_list)
    # exist_scores_two_list_csr = sp.csr_matrix(exist_scores_two_list)
    # nonexist_scores_two_list_csr = sp.csr_matrix(nonexist_scores_two_list)
    # temp = scores_matrix_one_norm[exist_binary > 0][0]  # in case turning the scores into a list goes wrong

    # Initialize two binNum x binNum 2D raster grids
    exist_raster_grids = np.zeros((binNum, binNum))
    nonexist_raster_grids = np.zeros((binNum, binNum))

    # Count the existing links falling into each cell of exist_raster_grids
    exist_links_num = len(exist_scores_one_list)
    exist_row_col_zero_num = 0  # pairs whose scores are 0 in both matrices are not counted
    for i in range(exist_links_num):
        # row_index and col_index range from 0 to binNum-1
        if (exist_scores_one_list[i] == 0.0) & (exist_scores_two_list[i] == 0.0):
            exist_row_col_zero_num = exist_row_col_zero_num + 1
            continue
        row_index = int(get_row_col_index(score=exist_scores_one_list[i],
                                          interval=interval,
                                          binNum=binNum))
        col_index = int(get_row_col_index(score=exist_scores_two_list[i],
                                          interval=interval,
                                          binNum=binNum))
        exist_raster_grids[row_index, col_index] = exist_raster_grids[row_index, col_index] + 1
    print("exist_row_col_zero_num:" + str(exist_row_col_zero_num))
    print('sum exist_raster_grids:' + str(np.sum(exist_raster_grids)))

    # Count the non-existing links falling into each cell of nonexist_raster_grids
    nonexist_links_num = len(nonexist_scores_one_list)
    nonexist_row_col_zero_num = 0  # pairs whose scores are 0 in both matrices are not counted
    for i in range(nonexist_links_num):
        # row_index and col_index range from 0 to binNum-1
        if (nonexist_scores_one_list[i] <= 0.0) & (nonexist_scores_two_list[i] <= 0.0):
            nonexist_row_col_zero_num = nonexist_row_col_zero_num + 1
            continue
        row_index = int(get_row_col_index(score=nonexist_scores_one_list[i],
                                          interval=interval,
                                          binNum=binNum))
        col_index = int(get_row_col_index(score=nonexist_scores_two_list[i],
                                          interval=interval,
                                          binNum=binNum))
        nonexist_raster_grids[row_index, col_index] = nonexist_raster_grids[row_index, col_index] + 1
    print("nonexist_row_col_zero_num:" + str(nonexist_row_col_zero_num))
    print('sum nonexist_raster_grids:' + str(np.sum(nonexist_raster_grids)))

    # Compute the PNR scores
    N = train_binary.shape[0]
    print("Graph size:" + str(N) + '\n')
    L_T = np.sum(train_binary.A)
    O = N * (N - 1) / 2
    coefficient = (O - L_T) / L_T
    PNR1 = coefficient * (exist_raster_grids / (nonexist_raster_grids + 1))
    # adding 1 to the denominator avoids inf or nan; it does not affect evaluation but may look nicer
    PNR2 = (exist_raster_grids / nonexist_raster_grids)
    # set inf and nan to 0
    PNR2[np.isnan(PNR2)] = 0
    PNR2[np.isinf(PNR2)] = 0
    PNR2 = coefficient * PNR2

    # Plotting (note: the plot axes start from the top-left corner, not the bottom-left one might expect)
    # sns.heatmap(PNR1, cmap='Reds')
    # plt.savefig(graph_results_dir + emb_method_name1 + '_' + emb_method_name2 + '_' + 'bin_' + str(binNum) + "_PNR1.jpg")
    # plt.show()
    # sns.heatmap(PNR2, cmap='Reds')
    # plt.savefig(graph_results_dir + emb_method_name1 + '_' + emb_method_name2 + '_' + 'bin_' + str(binNum) + "_PNR2.jpg")
    # plt.show()
    # plt.matshow(PNR1)  # looks ugly
    # plt.show()

    # Save exist_raster_grids, nonexist_raster_grids, PNR1 and PNR2
    save_ndarray_to_mat(exist_raster_grids, 'exist_raster_grids', graph_results_dir,
                        graph_name, emb_method_name1, emb_method_name2, binNum)
    save_ndarray_to_mat(nonexist_raster_grids, 'nonexist_raster_grids', graph_results_dir,
                        graph_name, emb_method_name1, emb_method_name2, binNum)
    save_ndarray_to_mat(PNR1, 'PNR1', graph_results_dir, graph_name,
                        emb_method_name1, emb_method_name2, binNum)
    save_ndarray_to_mat(PNR2, 'PNR2', graph_results_dir, graph_name,
                        emb_method_name1, emb_method_name2, binNum)

    # Adjust the scores with PNR (only the non-existing link part is adjusted)
    nonexist_scores_PNR_list = transfer_scores_PNR(
        scores_matrix_one_norm=scores_matrix_one_norm,
        scores_matrix_two_norm=scores_matrix_two_norm,
        train_binary=train_binary,
        PNR=PNR2,
        interval=interval,
        binNum=binNum)

    # Scores of the weighted hybrid method: equal 0.5 weights, added directly
    scores_matrix_hybrid_norm = 0.5 * scores_matrix_one_norm + 0.5 * scores_matrix_two_norm
    nonexist_scores_hybrid_list = (np.array(
        scores_matrix_hybrid_norm[nonexist_binary > 0], dtype=float))[0]

    # Evaluation
    graph_test_path = get_testset_path(base_dir=all_file_dir, graph_name=graph_name)
    test_binary = get_test_matrix_binary(graph_test_path=graph_test_path, N=N)
    L_full = int(np.sum(test_binary))
    L_array = np.array([
        int(L_full / 20),
        int(L_full / 10),
        int(L_full / 5),
        int(L_full / 2),
        L_full
    ])

    del scores_matrix_one_norm, scores_matrix_two_norm, exist_scores_one_list, \
        exist_scores_two_list, scores_matrix_hybrid_norm
    gc.collect()

    AP_PNR, AUC_PNR, Precision_PNR, Recall_PNR, F1score_PNR = \
        evaluators(train_binary=train_binary, test_binary=test_binary,
                   scores_list=nonexist_scores_PNR_list, L_array=L_array)
    AP_method1, AUC_method1, Precision_method1, Recall_method1, F1score_method1 = \
        evaluators(train_binary=train_binary, test_binary=test_binary,
                   scores_list=nonexist_scores_one_list, L_array=L_array)
    AP_method2, AUC_method2, Precision_method2, Recall_method2, F1score_method2 = \
        evaluators(train_binary=train_binary, test_binary=test_binary,
                   scores_list=nonexist_scores_two_list, L_array=L_array)
    AP_weighted, AUC_weighted, Precision_weighted, Recall_weighted, F1score_weighted = \
        evaluators(train_binary=train_binary, test_binary=test_binary,
                   scores_list=nonexist_scores_hybrid_list, L_array=L_array)

    print('AP_PNR: ' + str(AP_PNR))
    print('AP_method1: ' + str(AP_method1))
    print('AP_method2: ' + str(AP_method2))
    print('AP_weighted: ' + str(AP_weighted))
    print('\n')
    print('AUC_PNR: ' + str(AUC_PNR))
    print('AUC_method1: ' + str(AUC_method1))
    print('AUC_method2: ' + str(AUC_method2))
    print('AUC_weighted: ' + str(AUC_weighted))
    print('\n')
    print('Precision_PNR: ' + str(Precision_PNR))
    print('Precision_method1: ' + str(Precision_method1))
    print('Precision_method2: ' + str(Precision_method2))
    print('Precision_weighted: ' + str(Precision_weighted))
    print('\n')
    print('Recall_PNR: ' + str(Recall_PNR))
    print('Recall_method1: ' + str(Recall_method1))
    print('Recall_method2: ' + str(Recall_method2))
    print('Recall_weighted: ' + str(Recall_weighted))
    print('\n')
    print('F1score_PNR: ' + str(F1score_PNR))
    print('F1score_method1: ' + str(F1score_method1))
    print('F1score_method2: ' + str(F1score_method2))
    print('F1score_weighted: ' + str(F1score_weighted))
    print('\n')

    write_to_excel(graph_name, emb_method_name1, emb_method_name2,
                   Precision_PNR, Precision_method1, Precision_method2, Precision_weighted,
                   Recall_PNR, Recall_method1, Recall_method2, Recall_weighted,
                   F1score_PNR, F1score_method1, F1score_method2, F1score_weighted,
                   AP_PNR, AP_method1, AP_method2, AP_weighted,
                   AUC_PNR, AUC_method1, AUC_method2, AUC_weighted)

    time_end = time.time()
    print("time span: " + str((time_end - time_start) / 60.00) + " mins")

    # Timing and quality notes:
    # facebook_combined: bin=5, 1.5 minutes
    # facebook_combined: cn & pearson / aa & cn took 3.5 minutes
    # facebook_combined: graphdistance & cn took 11 minutes
    # facebook_combined: the PNR matrix of graphdistance & cn is all zeros
    # facebook_combined: attentionwalk & prone took 7.5 minutes
    # facebook_combined: every combination involving rootedpagerank performs very poorly;
    #   arope is slightly better than PNR; SDNE and PRUNE are very poor; drne and graph2gauss
    #   are also extremely poor on their own but perform very well after PNR fusion
    # blogcatalog: aa & ja took 3 hours
    # (path-based methods such as katz and graphdistance are very slow;
    #  neighbour-based and rank-based methods are fast)
    # google, 15000 nodes: 2.5 hours
    print('--------------------------------------------------------------------------------')
    pass
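# A minimal sketch of the `get_row_col_index` helper used when rasterizing the score pairs
# above, assuming it maps a normalized score in [0, 1] to a bin index in [0, binNum - 1]
# given the bin width `interval`; the actual project helper may differ.
def get_row_col_index_sketch(score, interval, binNum):
    """Return the raster-grid bin index for a score in [0, 1]."""
    index = int(score / interval)
    return min(index, binNum - 1)  # clamp scores equal to 1.0 into the last bin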
def auto_DNN(prex=None, graph_name=None, emb_method_name1=None,
             emb_method_name2=None, model_name=None, DNN_binNum=None):
    print('----------------------------------------------------------')
    print("dataset: " + graph_name + '\n' + "baselines:" + emb_method_name1 + "," + emb_method_name2)
    results_base_dir = 'D:\hybridrec//results//'
    all_file_dir = 'D:\hybridrec\dataset\split_train_test//' + prex
    results_dir = 'D:\hybridrec/results//' + prex
    graph_results_dir = results_dir + graph_name + '//'

    # (pattern on facebook_combined: the smaller the ratio, the higher the prediction accuracy
    #  on positive and negative samples, and the less time it takes)
    ratio = 1  # the total number of negative samples is `ratio` times the number of positive samples  # change here

    path_scores_method1 = results_base_dir + prex + graph_name + "//" + graph_name + "_" + emb_method_name1 + "_scores.mat"
    path_scores_method2 = results_base_dir + prex + graph_name + "//" + graph_name + "_" + emb_method_name2 + "_scores.mat"

    # Initialize the model (change here)
    # hidden_layer_sizes=(10, 20, 10): three hidden layers with 10, 20 and 10 neurons
    if model_name == "mlp":
        model = MLPClassifier(hidden_layer_sizes=(10, 20),
                              activation='relu',
                              solver='adam',
                              max_iter=200,
                              alpha=0.01,
                              batch_size=256,
                              learning_rate='constant',
                              learning_rate_init=0.001,
                              shuffle=False,
                              random_state=2020,
                              early_stopping=True,
                              validation_fraction=0.2,
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=1e-08,
                              n_iter_no_change=10)
        pass
    if model_name == "svm":
        model = SVC(C=5, random_state=42)  # had problems
        pass
    if model_name == "lr":
        model = LogisticRegression(C=5, penalty='l1', tol=1e-6, random_state=42)  # penalty can be l1 or l2
        pass
    if model_name == "lgbm":
        model = LGBMClassifier(num_leaves=31,
                               learning_rate=0.1,
                               n_estimators=64,
                               random_state=42,
                               n_jobs=-1)
        pass
    if model_name == "xgb":
        model = XGBClassifier(max_depth=5,
                              learning_rate=0.1,
                              n_jobs=-1,
                              nthread=-1,
                              gamma=0.06,
                              min_child_weight=5,
                              subsample=1,
                              colsample_bytree=0.9,
                              reg_alpha=0,
                              reg_lambda=0.5,
                              random_state=42)
        pass
    if model_name == "ld":
        model = LinearDiscriminantAnalysis(solver='lsqr')
        pass
    if model_name == "rf":
        model = RandomForestClassifier(n_estimators=50,
                                       max_depth=20,
                                       min_samples_split=2,
                                       min_samples_leaf=5,
                                       max_features="log2",
                                       random_state=12)
        pass

    if not (os.path.exists(path_scores_method1) and os.path.exists(path_scores_method2)):
        print("dataset: " + graph_name + '----' + "baselines:" + emb_method_name1 + "," +
              emb_method_name2 + ': scores not fully computed')
    if os.path.exists(path_scores_method1) and os.path.exists(path_scores_method2):
        # Get the normalized scores
        scores_matrix_one_dict = (loadmat(path_scores_method1))
        scores_matrix_two_dict = (loadmat(path_scores_method2))
        scores_matrix_one = scores_matrix_one_dict['scores']
        scores_matrix_two = scores_matrix_two_dict['scores']
        if emb_method_name1 not in all_embedding_methods:
            scores_matrix_one = csr_matrix(np.triu(scores_matrix_one.A, k=1))  # k=1 excludes the diagonal
        if emb_method_name2 not in all_embedding_methods:
            scores_matrix_two = csr_matrix(np.triu(scores_matrix_two.A, k=1))
        scores_matrix_one_norm = normalize_matrix(csr_matrix1=csr_matrix(scores_matrix_one))
        scores_matrix_two_norm = normalize_matrix(csr_matrix1=csr_matrix(scores_matrix_two))

        # Get train_binary and test_binary
        graph_train_path = get_trainset_path(base_dir=all_file_dir,
                                             graph_name=graph_name,
                                             connected_pattern='undirected',
                                             from_zeros_one='0')
        graph_test_path = get_testset_path(base_dir=all_file_dir, graph_name=graph_name)
        G = read_graph(weighted=0, input=graph_train_path, directed=0)
        train_binary = csr_matrix(nx.convert_matrix.to_scipy_sparse_matrix(G))
        train_binary = csr_matrix(np.triu(train_binary.A, k=1))
        test_binary = get_test_matrix_binary(graph_test_path=graph_test_path,
                                             N=train_binary.shape[0])
        del scores_matrix_one, scores_matrix_two
        gc.collect()

        # Get the scores of the positive samples
        exist_binary = csr_matrix(np.triu(train_binary.A, k=1))  # k=1 excludes the diagonal
        exist_scores_one_list = (np.array(scores_matrix_one_norm[exist_binary > 0], dtype=float))[0]
        exist_scores_two_list = (np.array(scores_matrix_two_norm[exist_binary > 0], dtype=float))[0]

        # Build the training samples (positive + negative samples)
        X_train_1 = (np.array([exist_scores_one_list, exist_scores_two_list])).T
        X_train_0 = negative_samples(train_binary=train_binary,
                                     test_binary=test_binary,
                                     scores_matrix_one_norm=scores_matrix_one_norm,
                                     scores_matrix_two_norm=scores_matrix_two_norm,
                                     ratio=ratio)
        Y_train_1 = np.random.randint(1, 2, X_train_1.shape[0])
        Y_train_0 = np.random.randint(0, 1, X_train_0.shape[0])
        X_train = np.vstack((np.array(X_train_1), np.array(X_train_0)))
        Y_train = (np.hstack((np.array(Y_train_1), np.array(Y_train_0)))).T

        time_start = time.time()
        # Train the model
        model.fit(X_train, Y_train)

        # Model prediction
        preds_0 = model.predict(X_train_0)
        preds_1 = model.predict(X_train_1)
        print(np.sum(preds_0))
        print(np.sum(preds_1))
        preds_0_proba = model.predict_proba(X_train_0)
        preds_1_proba = model.predict_proba(X_train_1)

        # Model prediction over all node pairs
        scores_matrix_DNN = predicted_scores_DNN(
            model=model,
            train_binary=train_binary,
            test_binary=test_binary,
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm)
        save_DNN_hybrid_scores(scores_matrix_DNN=scores_matrix_DNN,
                               method1=emb_method_name1,
                               method2=emb_method_name2,
                               graph_results_dir=graph_results_dir,
                               dataset_name=graph_name,
                               model_name=model_name)
        scores_matrix_DNN_norm = normalize_matrix(csr_matrix1=scores_matrix_DNN)

        # Compute the DNN rasterization grids
        DNN_raster_grids = rasterization_grids(
            binNum=DNN_binNum,
            train_binary=train_binary,
            scores_matrix_DNN=scores_matrix_DNN_norm,
            scores_matrix_one_norm=scores_matrix_one_norm,
            scores_matrix_two_norm=scores_matrix_two_norm)
        # DNN_raster_grids = np.log10(DNN_raster_grids)  # produces -inf and raises an error
        DNN_raster_grids = normalize_matrix_full(csr_matrix1=csr_matrix(DNN_raster_grids))
        DNN_raster_grids = better_show_grids(csr_matrix1=DNN_raster_grids)
        save_DNN_raster_scores(rastser_grids=DNN_raster_grids,
                               method1=emb_method_name1,
                               method2=emb_method_name2,
                               graph_results_dir=graph_results_dir,
                               dataset_name=graph_name,
                               model_name=model_name,
                               DNN_binNum=DNN_binNum)

        source = np.float32(DNN_raster_grids.A)
        result = cv2.GaussianBlur(source, (5, 5), 0)
        title = graph_name + '-' + model_name + '-' + emb_method_name1 + '-' + emb_method_name2
        plot_contourf(result=result, title=title, binNum=10)

        # Read the PNR grids
        PNR_path = results_base_dir + prex + graph_name + "//" + "PNR1_" + graph_name + "_" + \
            emb_method_name1 + "_" + emb_method_name2 + "_50_count.mat"
        if is_excel_file_exist(PNR_path):
            PNR_dict = (loadmat(PNR_path))
            PNR_matrix = PNR_dict["count"]
            PNR_matrix = better_show_grids(csr_matrix1=PNR_matrix)
            source = np.float32(PNR_matrix.A)
            result = cv2.GaussianBlur(source, (5, 5), 0)  # (5, 5) is a 5x5 Gaussian kernel; the standard deviation is 0
            title = graph_name + '-PNR-' + emb_method_name1 + '-' + emb_method_name2
            plot_contourf(result=result, title=title, binNum=10)

        # Evaluate the DNN
        exist_binary = csr_matrix(np.triu(train_binary.A, k=1))  # k=1 excludes the diagonal
        nonexist_binary = csr_matrix(
            np.triu(np.ones(exist_binary.shape), k=1) - exist_binary.A)
        nonexist_scores_DNN_list = (np.array(
            scores_matrix_DNN[nonexist_binary > 0], dtype=float))[0]
        L_full = int(np.sum(test_binary))
        L_array = np.array([
            int(L_full / 20),
            int(L_full / 10),
            int(L_full / 5),
            int(L_full / 2),
            L_full
        ])
        AP_DNN, AUC_DNN, Precision_DNN, Recall_DNN, F1score_DNN = \
            evaluators(train_binary=train_binary, test_binary=test_binary,
                       scores_list=nonexist_scores_DNN_list, L_array=L_array)
        # print('AP_DNN: ' + str(AP_DNN))
        # print('\n')
        # print('AUC_DNN: ' + str(AUC_DNN))
        # print('\n')
        # print('Precision_DNN: ' + str(Precision_DNN))
        # print('\n')
        # print('Recall_DNN: ' + str(Recall_DNN))
        # print('\n')
        # print('F1score_DNN: ' + str(F1score_DNN))
        # print('\n')

        # Write precision, recall, F1-score and AP to an Excel file
        DNN_write_to_excel(DL_name=model_name,
                           dataset_name=graph_name,
                           method1=emb_method_name1,
                           method2=emb_method_name2,
                           precision_DL=Precision_DNN,
                           recall_DL=Recall_DNN,
                           F1score_DL=F1score_DNN,
                           AP_DL=AP_DNN)
        time_end = time.time()
        print("It takes : " + str((time_end - time_start) / 60.0) + " mins.")
    pass
def stanford_glove():
    """Computes GloVe embeddings with the Stanford GloVe implementation, retrieving
    the corpus from the positive and negative tweet files.

    # Configs
    :dataset_version - choose preprocessing
    :emb_dataset - choose full or small dataset
    :embedding_dim - size of embeddings
    :emb_context_window - context window size
    :emb_word_min_count - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('STANFORD GLOVE')

    if (reuse_computed
            and os.path.isfile(embeddings_dir + selected_embeddings_file + '.npy')
            and os.path.isfile(vocab_dir + vocab_file + '.pkl')):
        if verbose > 0:
            print('Reusing word2vec vocab:', vocab_file)
            print('Reusing word2vec embeddings:', selected_embeddings_file)
            print_header_str('DONE')
            print()
        return

    dataset = []

    stanford_root_dir = embeddings_dir + '../StanfordGloVe/'

    with open(stanford_root_dir + 'run.sh', 'w') as frun:
        frun.write(f"""\
#!/bin/bash
set -e

pushd {stanford_root_dir}
make
popd

# Makes programs, downloads sample data, trains a GloVe model, and then evaluates it.
# One optional argument can specify the language used for eval script: matlab, octave or [default] python

CORPUS="{tweet_dir+emb_train_tweets_pos} {tweet_dir+emb_train_tweets_neg} {tweet_dir+emb_test_tweets}"
VOCAB_FILE={stanford_root_dir+vocab_file}_cnt.txt
COOCCURRENCE_FILE={stanford_root_dir}cooccurrence.bin
COOCCURRENCE_SHUF_FILE={stanford_root_dir}cooccurrence.shuf.bin
BUILDDIR={stanford_root_dir}build
SAVE_FILE={stanford_root_dir+selected_embeddings_file}_tmp
VERBOSE=2
MEMORY=8.0
VOCAB_MIN_COUNT={emb_word_min_count}
VECTOR_SIZE={embedding_dim}
MAX_ITER={embedding_epochs}
WINDOW_SIZE={emb_context_window}
BINARY=2
NUM_THREADS=6
X_MAX=100

echo
echo "$ cat CORPUS | BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE > VOCAB_FILE"
cat $CORPUS | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE > $VOCAB_FILE
echo "$ cat CORPUS | BUILDDIR/cooccur -memory $MEMORY -vocab-file VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE > COOCCURRENCE_FILE"
cat $CORPUS | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE > $COOCCURRENCE_FILE
echo "$ BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < COOCCURRENCE_FILE > COOCCURRENCE_SHUF_FILE"
$BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE
echo "$ BUILDDIR/glove -save-file SAVE_FILE -threads $NUM_THREADS -input-file COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file VOCAB_FILE -verbose $VERBOSE"
$BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE

rm $COOCCURRENCE_FILE $COOCCURRENCE_SHUF_FILE
""")

    stanford_glove_cmd = 'chmod +x ' + stanford_root_dir + 'run.sh && '
    stanford_glove_cmd += stanford_root_dir + 'run.sh'

    run_script(stanford_glove_cmd)

    vocab_size = sum(1 for line in open(
        stanford_root_dir + selected_embeddings_file + '_tmp.txt', 'r'))

    vocab = {}
    embeddings = np.zeros((vocab_size, embedding_dim), dtype='float32')
    with open(stanford_root_dir + selected_embeddings_file + '_tmp.txt', 'r') as f:
        for i, l in enumerate(f):
            ll = l.strip().split(' ')
            word, emb = ll[0].strip(), [float(x.strip()) for x in ll[1:]]
            vocab[word] = i
            embeddings[i] = np.array(emb)

    if embedding_norm:
        embeddings = normalize_matrix(embeddings)
    np.save(embeddings_dir + selected_embeddings_file, embeddings)

    with open(vocab_dir + vocab_file + '.pkl', 'wb') as f:
        pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

    cleanup_cmd = f'rm {stanford_root_dir+vocab_file}_cnt.txt {stanford_root_dir+selected_embeddings_file}_tmp.txt ; rm -rf {stanford_root_dir}build'
    run_script(cleanup_cmd)

    if verbose > 0:
        print('Vocabulary size:', len(vocab))
        print_header_str('DONE')
        print()
def evaluation_on_BLI(self, verbose=0):
    """Start evaluation on the given test translation dictionary.

    Args:
        verbose: Set to 1 to see the top 3 predictions for the first 5 words.

    Returns:

    """
    ranking = []
    iteration = 0
    norm_proj_src_emb = normalize_matrix(
        self.CrossLingualModel.proj_embedding_source_target)

    for test_src_word, test_trg_word in zip(self.test_translation_source,
                                            self.test_translation_target):
        source_index = self.CrossLingualModel.src_word2ind[
            test_src_word] if test_src_word in self.CrossLingualModel.src_word2ind.keys() else -1
        target_index = self.CrossLingualModel.trg_word2ind[
            test_trg_word] if test_trg_word in self.CrossLingualModel.trg_word2ind.keys() else -1

        if source_index == -1 or target_index == -1:
            continue

        # Calculate cosine similarity
        norm_proj_src_word_emb = norm_proj_src_emb[[source_index]]
        similarity_cos = np.dot(
            norm_proj_src_word_emb,
            np.transpose(self.CrossLingualModel.norm_trg_embedding_matrix))

        # Find closest neighbors
        most_similar_trg_index = np.argsort(-similarity_cos[[0]])
        find_rank = np.where(most_similar_trg_index == target_index)[1][0] + 1
        ranking.append(find_rank)

        if iteration <= 5 and verbose:
            print("\nTest translation: {} -> {}".format(
                test_src_word,
                self.CrossLingualModel.trg_ind2word[target_index]))
            print("Predicted Top 3 Translations: {}, {}, {}".format(
                self.CrossLingualModel.trg_ind2word[most_similar_trg_index[0, 0]],
                self.CrossLingualModel.trg_ind2word[most_similar_trg_index[0, 1]],
                self.CrossLingualModel.trg_ind2word[most_similar_trg_index[0, 2]]))
        iteration += 1

    if len(ranking) == 0:
        print("NO MATCHING FOUND!")
    else:
        print("\n\nNumber of Test Translations: {}/{}".format(
            len(ranking), len(self.test_translation_source)))
        p1 = len([p for p in ranking if p <= 1]) / len(ranking)
        p5 = len([p for p in ranking if p <= 5]) / len(ranking)
        p10 = len([p for p in ranking if p <= 10]) / len(ranking)
        print("P@1: {}".format(p1))
        print("P@5: {}".format(p5))
        print("P@10: {}".format(p10))
        mrr = sum([1.0 / p for p in ranking]) / len(ranking)
        print("\n\nMRR: {}".format(mrr))
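# A small self-contained example of the precision@k and MRR formulas used above: for
# hypothetical ranks of the correct translations [1, 3, 7, 20], P@1 counts ranks <= 1,
# P@5 counts ranks <= 5, and so on, while MRR averages the reciprocal ranks.
example_ranks = [1, 3, 7, 20]
p_at_1 = len([r for r in example_ranks if r <= 1]) / len(example_ranks)    # 0.25
p_at_5 = len([r for r in example_ranks if r <= 5]) / len(example_ranks)    # 0.5
p_at_10 = len([r for r in example_ranks if r <= 10]) / len(example_ranks)  # 0.75
mrr = sum(1.0 / r for r in example_ranks) / len(example_ranks)             # approx. 0.38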