def __init__(self, args):
    super(BiLSTM, self).__init__()
    self.args = args
    self.hidden_dim = args.hidden_dim
    self.batch_size = args.batch_size
    self.dropout = nn.Dropout(args.dropout)
    self.dropout_embed = nn.Dropout(args.dropout_embed)
    self.word_embedding = Embedding.Embedding(args.embed_num, args.embedding_dim)
    self.use_cuda = args.use_cuda
    self.use_pretrained_emb = args.use_pretrained_emb
    if self.use_pretrained_emb:
        pretrained_weight = loader.vector_loader(args.word_list)
        pretrained_weight = torch.FloatTensor(pretrained_weight)
        # print(pretrained_weight)
        self.word_embedding_static = Embedding.ConstEmbedding(pretrained_weight)
    self.lstm = nn.LSTM(args.embedding_dim * 2, args.hidden_dim,
                        bidirectional=True, dropout=args.dropout_model)
    self.hidden2label1 = nn.Linear(args.hidden_dim * 2, args.hidden_dim)
    self.hidden2label2 = nn.Linear(args.hidden_dim, args.class_num)
    self.hidden = self.init_hidden(args.batch_size)
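# A minimal sketch of the init_hidden method referenced above (it is not shown
# in the original snippet); it assumes a single-layer bidirectional nn.LSTM, so
# the initial states have shape (num_directions, batch, hidden_dim).
import torch

def init_hidden(self, batch_size):
    h0 = torch.zeros(2, batch_size, self.hidden_dim)
    c0 = torch.zeros(2, batch_size, self.hidden_dim)
    if self.use_cuda:
        h0, c0 = h0.cuda(), c0.cuda()
    return (h0, c0)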
def Get_vector_of_difference_between_clusters(data_path, result_path, step, n_features, Embedding):
    # data_path = FLAGS.data_path
    # result_path = FLAGS.result_dir
    # step = FLAGS.step
    # n_features = FLAGS.n_features_embedding
    data1 = GetData(data_path + str(2 * step) + "/")
    print(data_path + str(2 * step) + "/")
    cluster1_mean = np.zeros(n_features)
    for d in data1:
        cluster1_mean += Embedding(d)
    cluster1_mean /= len(data1)

    data2 = GetData(data_path + str(2 * step + 1) + "/")
    cluster2_mean = np.zeros(n_features)
    for d in data2:
        cluster2_mean += Embedding(d)
    cluster2_mean /= len(data2)

    if not os.path.exists(result_path):
        os.makedirs(result_path)
    np.savetxt(result_path + str(2 * step) + "_" + str(2 * step + 1) + ".txt",
               cluster2_mean - cluster1_mean)
    np.savetxt(result_path + str(2 * step) + ".txt", cluster1_mean)
    np.savetxt(result_path + str(2 * step + 1) + ".txt", cluster2_mean)
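# Equivalent, more compact way to compute one of the cluster means above
# (a sketch; it assumes Embedding(d) returns a 1-D array of length n_features):
cluster1_mean = np.mean([Embedding(d) for d in data1], axis=0)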
def datasets_features(name_dataset_train=None, name_dataset_dev=None, creation_features=True):
    ###########################################################################
    # This function creates or loads all the main features to train the model
    # (train data, dev data, vocabulary).
    #
    # Input:
    #   name_dataset_train: name of the train dataset
    #   name_dataset_dev:   name of the dev dataset
    #   creation_features:  Boolean flag for the creation of the features
    #
    # Output:
    #   train_data: training data to train the model
    #   dev_data:   dev data to test the model
    ###########################################################################
    try:
        path_train = dictionary_paths[name_dataset_train]
    except:
        path_train = None
    try:
        path_dev = dictionary_paths[name_dataset_dev]
    except:
        path_dev = None

    if creation_features:
        # Creation of the features
        prep.save_dataset(dev_test=False, dataset=path_train, limit_length=2)
        prep.save_dataset(dev_test=True, dataset=path_dev, limit_length=0)
        Vocabulary.create_vocabulary_unigram(VOCAB_UNIGRAM_SIZE)
        Vocabulary.create_vocabulary_bigrams(VOCAB_BIGRAM_SIZE)
        Embedding.embedding_matrix('vocab_unigram.npy')
        Embedding.embedding_matrix('vocab_bigram.npy', False)
    else:
        # Load the train set
        train_x = Vocabulary.open_file('Train_data.utf8', True)
        train_y = Vocabulary.open_file('Train_label.txt', False)
        # Load the dev set
        dev_x = Vocabulary.open_file('Dev_data.utf8', True)
        dev_y = Vocabulary.open_file('Dev_label.txt', False)
        # Creation of the training and dev datasets
        train_data = prepare_dataset(train_x, train_y, MAX_LENGTH)
        dev_data = prepare_dataset(dev_x, dev_y, MAX_LENGTH)
        return train_data, dev_data
def __init__(self, args, pretrained):
    super(Transfrmr_bidaf, self).__init__()
    self.embed = embed.Embedding(args, pretrained)
    # Encoder module
    self.encoder_ctxt = encode.Encoder_block(args, 2 * args.word_dim)
    self.encoder_ques = encode.Encoder_block(args, 2 * args.word_dim)
    # Attention Flow Layer
    self.att_weight_c = Linear(args.hidden_size * 2, 1, args.dropout)
    self.att_weight_q = Linear(args.hidden_size * 2, 1, args.dropout)
    self.att_weight_cq = Linear(args.hidden_size * 2, 1, args.dropout)
    self.N = args.Model_encoder_size
    self.dropout = nn.Dropout(p=args.dropout)
    # Model Encoding Layer
    self.Model_encoder = self.get_clones(
        encode.Encoder_block(args, 8 * args.word_dim), args.Model_encoder_size)
    # self.Model2start = Linear(16 * args.word_dim, 8 * args.word_dim, args.dropout)
    # self.Model2end = Linear(16 * args.word_dim, 8 * args.word_dim, args.dropout)
    # self.start_idx = Linear(16 * args.word_dim, 1, args.dropout)
    # self.end_idx = Linear(16 * args.word_dim, 1, args.dropout)
    self.start_idx = nn.Linear(16 * args.word_dim, 1)
    self.end_idx = nn.Linear(16 * args.word_dim, 1)
def AutoEncoderLatent(data_path, feature_dimension_embedding, Embedding):
    data = GetData(data_path)
    random.shuffle(data)
    latent = np.zeros([len(data), feature_dimension_embedding])
    for d in range(len(data)):
        latent[d] = Embedding(data[d])
    return data, latent
def __init__(self, args):
    super(Model, self).__init__()
    self.embed_size = args.embed_size
    self.label_size = args.label_size
    self.topic_word_num = args.topic_word_num
    self.biLSTM_hidden_size = args.biLSTM_hidden_size
    self.biLSTM_hidden_num = args.biLSTM_hidden_num
    self.save_pred_emd_path = args.save_pred_emd_path
    self.word_num = args.word_num
    self.dropout = args.dropout
    self.word_alpha = args.word_alpha
    self.topic_alpha = args.topic_alpha
    self.embeddingTopic = nn.Embedding(self.topic_word_num, self.embed_size)
    self.embeddingText = nn.Embedding(self.word_num, self.embed_size)
    if args.using_pred_emb:
        load_emb_text = Embedding.load_predtrained_emb_zero(
            self.save_pred_emd_path, self.word_alpha.string2id, padding=True)
        load_emb_topic = Embedding.load_predtrained_emb_zero(
            self.save_pred_emd_path, self.topic_alpha.string2id, padding=False)
        # self.embeddingTopic = ConstEmbedding(load_emb_topic)
        # self.embeddingText = ConstEmbedding(load_emb_text)
        # self.embeddingTopic = nn.Embedding(args.topicWordNum, self.EmbedSize, sparse=True)
        # self.embeddingText = nn.Embedding(args.wordNum, self.EmbedSize, sparse=True)
        self.embeddingTopic = nn.Embedding(self.topic_word_num, self.embed_size)
        self.embeddingText = nn.Embedding(self.word_num, self.embed_size)
        self.embeddingTopic.weight.data.copy_(load_emb_topic)
        self.embeddingText.weight.data.copy_(load_emb_text)
    self.biLSTM = nn.LSTM(self.embed_size, self.biLSTM_hidden_size,
                          dropout=self.dropout,
                          num_layers=self.biLSTM_hidden_num,
                          batch_first=True,
                          bidirectional=True)
    self.linear1 = nn.Linear(self.biLSTM_hidden_size * 4, self.biLSTM_hidden_size // 2)
    self.linear2 = nn.Linear(self.biLSTM_hidden_size // 2, self.label_size)
def GetTheClosestPoint(point, data, Embedding):
    dist = 1e+10
    res = ''
    for d in data:
        # basename = os.path.basename(d).split("_")[-embedding_size-1:-1]
        # local_dist = np.linalg.norm(point - np.array([float(f) for f in basename]))
        local_dist = np.linalg.norm(point - Embedding(d))
        if local_dist < dist:
            dist = local_dist
            res = d
    return res
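# A possible vectorized variant of the linear search above (a sketch; it
# assumes Embedding(d) returns a fixed-length 1-D NumPy array for every item):
import numpy as np

def GetTheClosestPointVectorized(point, data, Embedding):
    feats = np.stack([Embedding(d) for d in data])               # (len(data), n_features)
    idx = int(np.argmin(np.linalg.norm(feats - point, axis=1)))  # index of the smallest distance
    return data[idx]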
def __init__(self, data_path, n_bins, n_pictures_per_bin, n_features, Embedding):
    # load dataset ------------------------------------------
    self.data = glob(os.path.join(data_path, "*.jpg"))
    self.n_bins = n_bins
    self.n_features = n_features
    self.features = []
    for d in self.data:
        self.features.append(Embedding(d))
    self.features = np.array(self.features)
    self.n_pictures_per_bin = n_pictures_per_bin
    self.feature_n = 0
    # --------------------------
    self.step = 0
    self.n_pictures = self.n_bins * self.n_pictures_per_bin
    self.show_images()
def __init__(self, args):
    super(BiLSTM, self).__init__()
    self.args = args
    self.hidden_dim = args.hidden_dim
    self.batch_size = args.batch_size
    self.dropout = nn.Dropout(args.dropout)
    self.dropout_embed = nn.Dropout(args.dropout_embed)
    self.word_embedding = Embedding.Embedding(args.embed_num, args.embedding_dim,
                                              padding_idx=args.padID)
    # self.word_embedding.reset_parameters()
    self.lstm = nn.LSTM(args.embedding_dim, args.hidden_dim,
                        bidirectional=True, dropout=args.dropout_model)
    self.hidden2label1 = nn.Linear(args.hidden_dim * 2, args.hidden_dim)
    self.hidden2label2 = nn.Linear(args.hidden_dim, args.class_num)
    self.hidden = self.init_hidden(args.batch_size, args.use_cuda)
def ClusterData(data, embedding_size, Embedding, n_clusters, result_dir, step):
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    latent = np.zeros([len(data), embedding_size])
    for d in range(len(data)):
        # basename = os.path.basename(data[d]).split("_")[-embedding_size-1:-1]
        # latent[d] = np.array([float(f) for f in basename])
        latent[d] = Embedding(data[d])
    clusters = KMeans(n_clusters=n_clusters, random_state=200).fit(latent)
    n_ex = 128
    images_indexes = []
    # os.makedirs(result_dir + "/Anchor_im/")
    for c in range(n_clusters):
        cluster_images_index = [
            i for i in range(len(data)) if clusters.labels_[i] == c
        ]
        images_indexes.append(cluster_images_index)
        print('cluster = ', c, len(cluster_images_index))
        distances_to_cluster = clusters.transform(latent[cluster_images_index])
        distances_to_cluster = distances_to_cluster.T[c]
        arg_sort = np.argsort(distances_to_cluster[:n_ex])
        images_index = np.array(cluster_images_index)
        im = images_index[arg_sort]
        batch_files = [data[i] for i in im]
        batch = [
            get_image(batch_file,
                      input_height=100, input_width=150,
                      resize_height=100, resize_width=150,
                      crop=False, grayscale=False)
            for batch_file in batch_files
        ]
        imsave(np.array(batch[:144]), [12, 12],
               result_dir + '/test_' + str(step) + '_' + str(c) + "_.png")
        # subprocess.call(['cp', batch_files[0], result_dir + "/Anchor_im/" + os.path.basename(batch_files[0])])
    return images_indexes, clusters
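# Hypothetical call to ClusterData; the embedding name, cluster count and
# result directory below are assumptions mirroring the FLAGS-driven snippets
# elsewhere in this file, not values taken from the original code.
images_indexes, clusters = ClusterData(data,
                                       embedding_size=10,
                                       Embedding=Embedding(FLAGS.embedding),
                                       n_clusters=4,
                                       result_dir=FLAGS.result_dir + "/clusters/",
                                       step=0)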
def embed(y, n_dt, code, P, n_code=32):
    """Embed the watermark bits given by `code` (or random bits if code == 'random')
    into the signal y; return the watermarked signal and the hex string of the
    embedded bits."""
    y = y.flatten()
    Dc = tf.stt_my(y, n_dt=n_dt)
    if code == 'random':
        wmbits = np.random.choice(16, size=n_code)
    else:
        wmbits = pre.hex_to_code(code)
    ns = pre.code_to_hex(wmbits)
    signal_wmd = np.zeros_like(Dc)
    wt = pre.time_weight(Dc, N=n_code, L_n=P.shape[1])
    for i in range(Dc.shape[1]):
        X_i = Dc[:, i]
        X_l, X_selected, X_h = pre.dct_segment_generate(X_i, N=n_code, L_n=P.shape[1])
        Y = eb.watermark_embed(X_selected, P, wmbits, N=n_code, weight=wt[i])
        Y_i = pre.dct_reconstruct(X_l, Y, X_h)
        signal_wmd[:, i] = Y_i
    embeded = tf.istt_my(signal_wmd, length=y.shape[0])
    return embeded, ns
def CrossRecurrencePlot(x, y, m, t, e, distance, standardization=False, plot=False):
    """
    It computes and plots the (cross)recurrence plot of the uni/multivariate
    input signals x and y (in pandas DataFrame format).

    **Reference :**

    * N. Marwan, M. Carmen Romano, M. Thiel and J. Kurths. "Recurrence plots
      for the analysis of complex systems". Physics Reports 438(5), 2007.

    :param x: first input signal
    :type x: pd.DataFrame

    :param y: second input signal
    :type y: pd.DataFrame

    :param m: embedding dimension
    :type m: int

    :param t: embedding delay
    :type t: int

    :param e: threshold for recurrence
    :type e: float

    :param distance: it specifies which distance method is used. It can assume
        the following values:
        1. 'euclidean';
        2. 'maximum';
        3. 'manhattan';
        4. 'rr' (fixed distance, maximum norm).
    :type distance: str

    :param standardization: if True, data are normalized to zero mean and
        unit variance. Default: False
    :type standardization: bool

    :param plot: if True, the recurrence plot is displayed. Default: False
    :type plot: bool
    """
    ' Raise error if parameters are not in the correct type '
    if not isinstance(x, pd.DataFrame):
        raise TypeError("Requires x to be a pd.DataFrame")
    if not isinstance(y, pd.DataFrame):
        raise TypeError("Requires y to be a pd.DataFrame")
    if not isinstance(m, int):
        raise TypeError("Requires m to be an integer")
    if not isinstance(t, int):
        raise TypeError("Requires t to be an integer")
    if not isinstance(e, float):
        raise TypeError("Requires e to be a float")
    if not isinstance(distance, str):
        raise TypeError("Requires distance to be a string")
    if not isinstance(standardization, bool):
        raise TypeError("Requires standardization to be a bool")
    if not isinstance(plot, bool):
        raise TypeError("Requires plot to be a bool")

    ' Raise error if parameters do not respect input rules '
    if m <= 0:
        raise ValueError("Requires m to be positive and greater than 0")
    if t <= 0:
        raise ValueError("Requires t to be positive and greater than 0")
    if e < 0:
        raise ValueError("Requires e to be positive")
    if distance != 'euclidean' and distance != 'maximum' and distance != 'manhattan' and distance != 'rr':
        raise ValueError("Requires a valid way to compute distance")

    if standardization:
        x = Standardize.Standardize(x)
        y = Standardize.Standardize(y)

    if (m != 1) or (t != 1):
        x = Embedding.Embedding(x, m, t)
        y = Embedding.Embedding(y, m, t)

    vd = 2
    if distance == 'euclidean':
        pass
    elif distance == 'manhattan':
        vd = 1
    elif distance == 'maximum' or distance == 'rr':
        vd = np.inf

    crp_tmp = np.ones((x.shape[0], y.shape[0]))
    if distance != 'rr':
        for i in range(0, x.shape[0]):
            x_row_rep_T = pd.concat([x.iloc[i, :]] * y.shape[0], axis=1, ignore_index=True)
            x_row_rep = x_row_rep_T.transpose()
            dist = Distance.Minkowski(x_row_rep, y, vd)
            diff_threshold_norm = e - dist
            diff_threshold_norm[diff_threshold_norm >= 0] = 0
            diff_threshold_norm[diff_threshold_norm < 0] = 1
            crp_tmp[x.shape[0] - 1 - i, :] = diff_threshold_norm.T
        crp = np.fliplr(crp_tmp.T)
    else:
        dist_m = np.zeros((x.shape[0], y.shape[0]))
        for i in range(0, x.shape[0]):
            x_row_rep_T = pd.concat([x.iloc[i, :]] * y.shape[0], axis=1, ignore_index=True)
            x_row_rep = x_row_rep_T.transpose()
            dist = Distance.Minkowski(x_row_rep, y, vd)
            dist_m[i, :] = dist.T
        dist_m_f = dist_m.flatten()
        dist_m_f.sort()
        e = dist_m_f[int(np.ceil(e * len(dist_m_f)))]
        for i in range(0, x.shape[0]):
            diff_threshold_norm = e - dist_m[i, :].T
            diff_threshold_norm[diff_threshold_norm >= 0] = 0
            diff_threshold_norm[diff_threshold_norm < 0] = 1
            crp_tmp[x.shape[0] - 1 - i, :] = diff_threshold_norm.T
        crp = np.fliplr(crp_tmp.T)

    result = dict()
    result['crp'] = crp

    if plot:
        plt.ion()
        figure = plt.figure()
        ax = figure.add_subplot(111)
        ax.set_xlabel('Time (in samples)')
        ax.set_ylabel('Time (in samples)')
        ax.set_title('Cross recurrence matrix')
        ax.imshow(result['crp'], plt.cm.binary_r, origin='lower',
                  interpolation='nearest', vmin=0, vmax=1)

    return result
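# Minimal usage sketch for CrossRecurrencePlot (the random test signals are an
# assumption; Standardize, Embedding and Distance come from the same package
# as the function above):
import numpy as np
import pandas as pd

x_sig = pd.DataFrame(np.random.randn(200, 1))
y_sig = pd.DataFrame(np.random.randn(200, 1))
res = CrossRecurrencePlot(x_sig, y_sig, m=2, t=1, e=0.5,
                          distance='euclidean', standardization=True)
print(res['crp'].shape)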
if FLAGS.PCA_GIST:
    data_path = FLAGS.data_path
    n_features_embedding = FLAGS.n_features_embedding
    n_features_new = FLAGS.new_features_embedding
    result_dir = FLAGS.result_dir
    data, latent = GIST_latent(data_path, n_features_embedding)
    PCA_(data, latent, n_features_new, result_dir)

if FLAGS.PCA_AutoEncoder:
    data_path = FLAGS.data_path
    n_features_embedding = FLAGS.n_features_embedding
    n_features_new = FLAGS.new_features_embedding
    result_dir = FLAGS.result_dir
    data, latent = AutoEncoderLatent(data_path, n_features_embedding,
                                     Embedding(FLAGS.embedding))
    PCA_(data, latent, n_features_new, result_dir)

if FLAGS.My:
    data_path = FLAGS.data_path
    n_features_embedding = FLAGS.n_features_embedding
    result_dir = FLAGS.result_dir
    svm_path = FLAGS.svm
    data = My_latent(data_path, svm_path, n_features_embedding, result_dir)

if FLAGS.AutoEncoderByDirection:
    data_path = FLAGS.data_path
    n_features_embedding = FLAGS.n_features_embedding
    result_dir = FLAGS.result_dir
    direction_path = FLAGS.direction_path
    data, latent = AutoEncoderLatent(data_path, 10, Embedding(FLAGS.embedding))
def RunHybrid(TableList):
    print("I am in Hybrid of Limaye")
    # reading the Limaye tables
    os.chdir('./data/Limaye/tables_instance/')

    # contains prediction results
    mapped_Prediction_Dict = dict()

    RLU_precision = 0
    RLU_recall = 0
    RLU_F1 = 0
    RLU_TP_Total = 0
    RLU_FN_Total = 0
    RLU_FP_Total = 0
    # ---------------
    Emb_precision = 0
    Emb_recall = 0
    Emb_F1 = 0
    Emb_TP_Total = 0
    Emb_FN_Total = 0
    Emb_FP_Total = 0
    # ---------------
    HybridI_precision = 0
    HybridI_recall = 0
    HybridI_F1 = 0
    HybridI_TP_Total = 0
    HybridI_FN_Total = 0
    HybridI_FP_Total = 0
    # ---------------
    HybridII_precision = 0
    HybridII_recall = 0
    HybridII_F1 = 0
    HybridII_TP_Total = 0
    HybridII_FN_Total = 0
    HybridII_FP_Total = 0

    # change the path to have the csv filename without the path inside
    os.chdir('/home/yasamin/Codes/WebTableAnnotation/data/Limaye/tables_instance/')
    # reading all CSV files in path
    allCsvTableFiles = glob.glob('*.csv')
    # print(allCsvTableFiles)
    start_time = time.time()
    T = pd.DataFrame()
    i = 0
    for table_csv in allCsvTableFiles:
        if table_csv in TableList:
            os.chdir('/home/yasamin/Codes/WebTableAnnotation/data/Limaye/tables_instance/')
            with open(table_csv, 'r', encoding='utf-8') as csvTableFile:
                print("This is the Table file name :\n\n", table_csv)
                T = pd.read_csv(table_csv, header=None)
            # switch path to normal form
            os.chdir('/home/yasamin/Codes/WebTableAnnotation/')

            # Remove the row if there is no GT in the entity file for it.
            entity_csv = table_csv
            os.chdir('/home/yasamin/Codes/WebTableAnnotation/data/Limaye/entities_instance/')
            with open(entity_csv, 'r', encoding='utf-8') as csvEntityFile:
                print("This is the Entity file name :\n\n", entity_csv)
                E = pd.read_csv(entity_csv, header=None)
            count_row = T.shape[0]
            for i in range(0, count_row):
                if not (i in E[E.columns[-1]].values):
                    print("This i does not exists:", i)
                    T = T.drop([i])
            # switch path to normal form
            os.chdir('/home/yasamin/Codes/WebTableAnnotation/')

            # ----------------------------------------------------------------
            # uncomment for Refined Lookup
            # ----------------------------------------------------------------
            # RLU_TPrimeIsAnnotated, RLU_TPrimeAnnotation = fp.getFullPhase(T, table_csv)
            # print("Refined Lookup result:")
            # print("-"*30)
            # print(RLU_TPrimeAnnotation)
            # print(RLU_TPrimeIsAnnotated)
            # # metrics:
            # RLU_TP, RLU_FN, RLU_FP = MCal.MetricsCalcul(T, table_csv, RLU_TPrimeAnnotation, RLU_TPrimeIsAnnotated)
            # RLU_TP_Total = RLU_TP_Total + RLU_TP
            # RLU_FN_Total = RLU_FN_Total + RLU_FN
            # RLU_FP_Total = RLU_FP_Total + RLU_FP
            # RLU_precision, RLU_recall, RLU_F1 = PRF.Precision_Recall_F1(RLU_TP_Total, RLU_FN_Total, RLU_FP_Total)

            # ----------------------------------------------------------------
            # uncomment for Embedding
            # ----------------------------------------------------------------
            Emb_TPrimeIsAnnotated, Emb_TPrimeAnnotation = emb.getEmbedding(T)
            print("Embedding result:")
            print("-" * 30)
            print(Emb_TPrimeAnnotation)
            print(Emb_TPrimeIsAnnotated)
            # metrics:
            Emb_TP, Emb_FN, Emb_FP = MCal.MetricsCalcul(
                T, table_csv, Emb_TPrimeAnnotation, Emb_TPrimeIsAnnotated)
            Emb_TP_Total = Emb_TP_Total + Emb_TP
            Emb_FN_Total = Emb_FN_Total + Emb_FN
            Emb_FP_Total = Emb_FP_Total + Emb_FP
            Emb_precision, Emb_recall, Emb_F1 = PRF.Precision_Recall_F1(
                Emb_TP_Total, Emb_FN_Total, Emb_FP_Total)

            # ----------------------------------------------------------------
            # uncomment for Hybrid I (uncomment also Embedding and RLU)
            # ----------------------------------------------------------------
            # # Hybrid I
            # HybridI_TPrimeIsAnnotated = RLU_TPrimeIsAnnotated
            # HybridI_TPrimeAnnotation = RLU_TPrimeAnnotation
            # for k in HybridI_TPrimeIsAnnotated:
            #     if HybridI_TPrimeIsAnnotated[k] == True:
            #         continue
            #     elif HybridI_TPrimeIsAnnotated[k] == False:
            #         HybridI_TPrimeIsAnnotated[k] = Emb_TPrimeIsAnnotated[k]
            #         HybridI_TPrimeAnnotation[k] = Emb_TPrimeAnnotation[k]
            # print("Hybrid I result:")
            # print("-"*30)
            # print(HybridI_TPrimeAnnotation)
            # print(HybridI_TPrimeIsAnnotated)
            # # metrics:
            # HybridI_TP, HybridI_FN, HybridI_FP = MCal.MetricsCalcul(T, table_csv, HybridI_TPrimeAnnotation, HybridI_TPrimeIsAnnotated)
            # HybridI_TP_Total = HybridI_TP_Total + HybridI_TP
            # HybridI_FN_Total = HybridI_FN_Total + HybridI_FN
            # HybridI_FP_Total = HybridI_FP_Total + HybridI_FP
            # HybridI_precision, HybridI_recall, HybridI_F1 = PRF.Precision_Recall_F1(HybridI_TP_Total, HybridI_FN_Total, HybridI_FP_Total)

            # ----------------------------------------------------------------
            # uncomment for Hybrid II (uncomment also Embedding and RLU)
            # ----------------------------------------------------------------
            # # Hybrid II
            # HybridII_TPrimeIsAnnotated = Emb_TPrimeIsAnnotated
            # HybridII_TPrimeAnnotation = Emb_TPrimeAnnotation
            # for k in HybridII_TPrimeIsAnnotated:
            #     if HybridII_TPrimeIsAnnotated[k] == True:
            #         continue
            #     elif HybridII_TPrimeIsAnnotated[k] == False:
            #         HybridII_TPrimeIsAnnotated[k] = RLU_TPrimeIsAnnotated[k]
            #         HybridII_TPrimeAnnotation[k] = RLU_TPrimeAnnotation[k]
            # print("Hybrid II result:")
            # print("-"*30)
            # print(HybridII_TPrimeAnnotation)
            # print(HybridII_TPrimeIsAnnotated)
            # # metrics:
            # HybridII_TP, HybridII_FN, HybridII_FP = MCal.MetricsCalcul(T, table_csv, HybridII_TPrimeAnnotation, HybridII_TPrimeIsAnnotated)
            # HybridII_TP_Total = HybridII_TP_Total + HybridII_TP
            # HybridII_FN_Total = HybridII_FN_Total + HybridII_FN
            # HybridII_FP_Total = HybridII_FP_Total + HybridII_FP
            # HybridII_precision, HybridII_recall, HybridII_F1 = PRF.Precision_Recall_F1(HybridII_TP_Total, HybridII_FN_Total, HybridII_FP_Total)

    # ------------------------------------------------------------------------------------------
    # print("Final result of the core partition")
    # print("-"*30)
    # ----------------------------------------------------------------
    # print("Final Refined Lookup:")
    # print("Final RLU Precision : ", RLU_precision)
    # print("Final RLU Recall : ", RLU_recall)
    # print("Final RLU F1 : ", RLU_F1)
    # print("-"*30)
    # print("Final RLU FN ", RLU_FN_Total)
    # print("Final RLU FP ", RLU_FP_Total)
    # print("Final RLU TP ", RLU_TP_Total)
    # print("-"*30)
    # ----------------------------------------------------------------
    print("Embedding:")
    print("Final Embedding Precision : ", Emb_precision)
    print("Final Embedding Recall : ", Emb_recall)
    print("Final Embedding F1 : ", Emb_F1)
    print("-" * 30)
    print("Final Embedding FN ", Emb_FN_Total)
    print("Final Embedding FP ", Emb_FP_Total)
    print("Final Embedding TP ", Emb_TP_Total)
    print("-" * 30)
    # ----------------------------------------------------------------
    # print("Hybrid I:")
    # print("Final Hybrid I Precision : ", HybridI_precision)
    # print("Final Hybrid I Recall : ", HybridI_recall)
    # print("Final Hybrid I F1 : ", HybridI_F1)
    # print("-"*30)
    # print("Final Hybrid I FN ", HybridI_FN_Total)
    # print("Final Hybrid I FP ", HybridI_FP_Total)
    # print("Final Hybrid I TP ", HybridI_TP_Total)
    # print("-"*30)
    # ----------------------------------------------------------------
    # print("Hybrid II:")
    # print("Final Hybrid II Precision : ", HybridII_precision)
    # print("Final Hybrid II Recall : ", HybridII_recall)
    # print("Final Hybrid II F1 : ", HybridII_F1)
    # print("-"*30)
    # print("Final Hybrid II FN ", HybridII_FN_Total)
    # print("Final Hybrid II FP ", HybridII_FP_Total)
    # print("Final Hybrid II TP ", HybridII_TP_Total)
    # print("-"*30)
    # print("-----------------------------------------------------------------------------------")
    # print("Run time for 1 lookup phase and one embedding phase, Local Endpoint ", time.time() - start_time)
    # print("-----------------------------------------------------------------------------------")
    return
try:
    if m <= 0:
        raise ValueError("Requires m to be positive and greater than 0")
    if t <= 0:
        raise ValueError("Requires t to be positive and greater than 0")
    if e < 0:
        raise ValueError("Requires e to be positive")
    if distance != 'euclidean' and distance != 'maximum' and distance != 'manhattan':
        raise ValueError("Requires a valid way to compute distance")
except ValueError as err_msg:
    raise ValueError(err_msg)
    return

if standardization == True:
    x = Standardize.Standardize(x)
    y = Standardize.Standardize(y)

if (m != 1) or (t != 1):
    x = Embedding.Embedding(x, m, t)
    y = Embedding.Embedding(y, m, t)

vd = 2
if distance == 'euclidean':
    pass
elif distance == 'manhattan':
    vd = 1
elif distance == 'maximum':
    vd = np.inf

crp_tmp = np.ones((x.shape[0], y.shape[0]))
for i in range(0, x.shape[0]):
    x_row_rep_T = pd.concat([x.iloc[i, :]] * y.shape[0], axis=1, ignore_index=True)
def queryVec(query):
    q = Embedding.queryEncoder(query)
    q = BiGRU.queryGRU(q)
    return q
def AllCalculate(clause_weight, phrase_weight, word_weight, words, word_emb):
    prefix = "data/datapre/"
    farr1 = [
        "MSRpar2012-1.txt",          # "MSRpar2012-2.txt",
        "MSRvid2012-1.txt",          # "MSRvid2012-2.txt",
        "OnWN2012-1.txt",            # "OnWN2012-2.txt",
        "OnWN2013-1.txt",            # "OnWN2013-2.txt",
        "OnWN2014-1.txt",            # "OnWN2014-2.txt",
        "SMTeuro2012-1.txt",         # "SMTeuro2012-2.txt",
        "SMTnews2012-1.txt",         # 4   # "SMTnews2012-2.txt",
        "FNWN2013-1.txt",            # "FNWN2013-2.txt",
        "SMT2013-1.txt",             # "SMT2013-2.txt",
        "headline2013-1.txt",        # 8   # "headline2013-2.txt",
        "headline2014-1.txt",        # 8   # "headline2014-2.txt",
        "headline2015-1.txt",        # 8   # "headline2015-2.txt",
        "deft-forum2014-1.txt",      # "deft-forum2014-2.txt",
        "deft-news2014-1.txt",       # "deft-news2014-2.txt",
        "images2014-1.txt",          # "images2014-02.txt",
        "images2015-1.txt",          # 19  # "images2015-2.txt",
        "tweet-news2014-1.txt",      # 14  # "tweet-news2014-2.txt",
        "answer-forum2015-1.txt",    # "answer-forum2015-2.txt",
        "answer-student2015-1.txt",  # "answer-student2015-2.txt",
        "belief2015-1.txt",          # "belief2015-2.txt",
        "sicktest-1.txt",            # "sicktest-2.txt",
        "twitter-1.txt"              # "twitter-2.txt"
    ]
    farr2 = [
        "MSRpar2012-2.txt",          # "MSRpar2012-1.txt",
        "MSRvid2012-2.txt",          # "MSRvid2012-1.txt",
        "OnWN2012-2.txt",            # "OnWN2012-1.txt",
        "OnWN2013-2.txt",            # "OnWN2013-1.txt",
        "OnWN2014-2.txt",            # "OnWN2014-1.txt",
        "SMTeuro2012-2.txt",         # "SMTeuro2012-1.txt",
        "SMTnews2012-2.txt",         # 4   # "SMTnews2012-1.txt",
        "FNWN2013-2.txt",            # "FNWN2013-1.txt",
        "SMT2013-2.txt",             # "SMT2013-1.txt",
        "headline2013-2.txt",        # 8   # "headline2013-1.txt",
        "headline2014-2.txt",        # 8   # "headline2014-1.txt",
        "headline2015-2.txt",        # 8   # "headline2015-1.txt",
        "deft-forum2014-2.txt",      # "deft-forum2014-1.txt",
        "deft-news2014-2.txt",       # "deft-news2014-1.txt",
        "images2014-02.txt",         # "images2014-1.txt",
        "images2015-2.txt",          # 19  # "images2015-1.txt",
        "tweet-news2014-2.txt",      # 14  # "tweet-news2014-1.txt",
        "answer-forum2015-2.txt",    # "answer-forum2015-1.txt",
        "answer-student2015-2.txt",  # "answer-student2015-1.txt",
        "belief2015-2.txt",          # "belief2015-1.txt",
        "sicktest-2.txt",            # "sicktest-1.txt",
        "twitter-2.txt"              # "twitter-1.txt"
    ]
    # "JHUppdb",
    # "anno-dev",
    farr_score = [
        "MSRpar2012-score.txt",
        "MSRvid2012-score.txt",
        "OnWN2012-score.txt",
        "OnWN2013-score.txt",
        "OnWN2014-score.txt",
        "SMTeuro2012-score.txt",
        "SMTnews2012-score.txt",     # 4
        "FNWN2013-score.txt",
        "SMT2013-score.txt",
        "headline2013-score.txt",    # 8
        "headline2014-score.txt",    # 8
        "headline2015-score.txt",    # 8
        "deft-forum2014-score.txt",
        "deft-news2014-score.txt",
        "images2014-score.txt",
        "images2015-score.txt",      # 19
        "tweet-news2014-score.txt",  # 14
        "answer-forum2015-score.txt",
        "answer-student2015-score.txt",
        "belief2015-score.txt",
        "sicktest-score.txt",
        "twitter-score.txt"
    ]
    for file1, file2, score in zip(farr1, farr2, farr_score):
        sentence_file_1 = prefix + file1
        sentence_file_2 = prefix + file2
        golds = datapre.getGolds(score)
        sentence_list_1 = datapre.DataPre(sentence_file_1)
        sentence_list_2 = datapre.DataPre(sentence_file_2)
        t0 = time.time()
        emb1 = Embedding.Embedding(sentence_list_1, clause_weight,
                                   phrase_weight, word_weight, words, word_emb)
        emb2 = Embedding.Embedding(sentence_list_2, clause_weight,
                                   phrase_weight, word_weight, words, word_emb)
        t1 = time.time()
        print("embedding compute time: %f seconds" % round((t1 - t0), 2))
        scores = sim_evaluate(emb1, emb2, golds)
path_emo_lex = r"Lexicons\NRC-Hashtag-Emotion-Lexicon-v0.2\NRC-Hashtag-Emotion-Lexicon-v0.2.txt" path_sen_lex = r"Lexicons\NRC-Hashtag-Sentiment-Lexicon-v1.0\HS-unigrams.txt" # Generating the lexicons lexi = l.load_lexicon(l.datareader(path_emo_lex)) lexi += l.load_lexicon(l.datareader(path_sen_lex, Elex=False), Elex=False) print("Complete") ## # Insert paths/type of embeddings(bert, Glove, Word2vec: Skipgram) path_bert = 'book_corpus_wiki_en_cased' path_glove = r"Thesis - Embeddings\GloVe\glove.6B.300d.w2vformat.txt" path_word2vec = r"Thesis - Embeddings\Word2Vec\GoogleNews-vectors-negative300.bin" # loading the embeding methods glove = em.WordToVec("glove", path_glove) word_two_vec = em.WordToVec("word2vec", path_word2vec) bert = em.Bert("bert", path_bert) embeds = [glove, word_two_vec, bert] ## # Loading aggregation methods n_avg = n_avg all_lexi = lambda x, y: all_lexicon_avg(x, y, lexi[:-1]) esla_sentiment = lambda x, y: lexicon_avg(x, y, lexi[-1]) esla_fear = lambda x, y: lexicon_avg(x, y, lexi[1]) agg_methods = { "NormalAvg": n_avg, "HashAvg": hashtag, "All_lexi": all_lexi, "Emotion_specific": esla_fear,
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(df['Statement'].values)
X = tokenizer.texts_to_sequences(df['Statement'].values)
X = pad_sequences(X)

# In[ ]:

embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(Dropout(0.5))
model.add(LSTM(128, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))
model.add(LSTM(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))
model.add(Dense(2, activation='sigmoid', kernel_initializer='glorot_normal'))
model.compile(loss='categorical_crossentropy', optimizer='Nadam', metrics=['accuracy'])
print(model.summary())

# In[ ]:

Y = pd.get_dummies(df['Label']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)
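# A typical next step for the Keras model built above (a sketch; the batch
# size and epoch count are assumptions, not taken from the original script):
batch_size = 32
history = model.fit(X_train, Y_train,
                    epochs=5,
                    batch_size=batch_size,
                    validation_data=(X_test, Y_test),
                    verbose=2)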
# step = FLAGS.step
# n_features = FLAGS.n_features_embedding
data1 = GetData(data_path + str(2 * step) + "/")
print(data_path + str(2 * step) + "/")
cluster1_mean = np.zeros(n_features)
for d in data1:
    cluster1_mean += Embedding(d)
cluster1_mean /= len(data1)

data2 = GetData(data_path + str(2 * step + 1) + "/")
cluster2_mean = np.zeros(n_features)
for d in data2:
    cluster2_mean += Embedding(d)
cluster2_mean /= len(data2)

if not os.path.exists(result_path):
    os.makedirs(result_path)
np.savetxt(result_path + str(2 * step) + "_" + str(2 * step + 1) + ".txt",
           cluster2_mean - cluster1_mean)
np.savetxt(result_path + str(2 * step) + ".txt", cluster1_mean)
np.savetxt(result_path + str(2 * step + 1) + ".txt", cluster2_mean)

if FLAGS.recieve_vector_of_difference_between_clusters:
    n_features = FLAGS.n_features
    for f in range(n_features):
        Get_vector_of_difference_between_clusters(FLAGS.data_path, FLAGS.result_path,
                                                  f + 1, 10,
                                                  Embedding(FLAGS.embedding))
def JointRecurrencePlot(x, y, m, t, e, distance, standardization=False, plot=False):
    """
    It computes and plots the joint recurrence plot of the uni/multivariate
    input signals x and y (in pandas DataFrame format).

    **Reference :**

    * N. Marwan, M. Carmen Romano, M. Thiel and J. Kurths. "Recurrence plots
      for the analysis of complex systems". Physics Reports 438(5), 2007.

    :param x: first input signal
    :type x: pd.DataFrame

    :param y: second input signal
    :type y: pd.DataFrame

    :param m: embedding dimension
    :type m: int

    :param t: embedding delay
    :type t: int

    :param e: threshold for recurrence
    :type e: float

    :param distance: it specifies which distance method is used. It can assume
        the following values:
        1. 'euclidean';
        2. 'maximum';
        3. 'manhattan';
        4. 'rr' (fixed distance, maximum norm).
    :type distance: str

    :param standardization: if True, data are normalized to zero mean and
        unit variance. Default: False
    :type standardization: bool

    :param plot: if True, the recurrence plot is displayed. Default: False
    :type plot: bool
    """
    ' Raise error if parameters are not in the correct type '
    if not isinstance(x, pd.DataFrame):
        raise TypeError("Requires x to be a pd.DataFrame")
    if not isinstance(y, pd.DataFrame):
        raise TypeError("Requires y to be a pd.DataFrame")
    if not isinstance(m, int):
        raise TypeError("Requires m to be an integer")
    if not isinstance(t, int):
        raise TypeError("Requires t to be an integer")
    if not isinstance(e, float):
        raise TypeError("Requires e to be a float")
    if not isinstance(distance, str):
        raise TypeError("Requires distance to be a string")
    if not isinstance(standardization, bool):
        raise TypeError("Requires standardization to be a bool")
    if not isinstance(plot, bool):
        raise TypeError("Requires plot to be a bool")

    ' Raise error if parameters do not respect input rules '
    if m <= 0:
        raise ValueError("Requires m to be positive and greater than 0")
    if t <= 0:
        raise ValueError("Requires t to be positive and greater than 0")
    if e < 0:
        raise ValueError("Requires e to be positive")
    if distance != 'euclidean' and distance != 'maximum' and distance != 'manhattan' and distance != 'rr':
        raise ValueError("Requires a valid way to compute distance")

    ' Raise error if x and y do not have the same length '
    if x.shape[0] != y.shape[0]:
        raise ValueError("The two signals have different length")

    if standardization:
        x = Standardize.Standardize(x)
        y = Standardize.Standardize(y)

    if m != 1 or t != 1:
        x = Embedding.Embedding(x, m, t)
        y = Embedding.Embedding(y, m, t)

    vd = 2
    if distance == 'euclidean':
        pass
    elif distance == 'manhattan':
        vd = 1
    elif distance == 'maximum':
        vd = np.inf

    size = x.shape[0]
    crp_x_tmp = np.ones((size, size))
    crp_y_tmp = crp_x_tmp.copy()
    curRange = range(0, size)
    x = x.reset_index(drop=True)
    y = y.reset_index(drop=True)

    if distance != 'rr':
        for i in curRange:
            crp_x = _row_rep(x, size, i, vd, e, crp_x_tmp)
            crp_y = _row_rep(y, size, i, vd, e, crp_y_tmp)
    else:
        crp_x = _rr_row_rep(x, size, vd, e, crp_x_tmp, curRange)
        crp_y = _rr_row_rep(y, size, vd, e, crp_y_tmp, curRange)

    jrp = (1 - crp_x) * (1 - crp_y)
    jrp = 1 - jrp

    result = dict()
    result['jrp'] = jrp

    if plot:
        plt.ion()
        figure = plt.figure()
        ax = figure.add_subplot(111)
        ax.set_xlabel('Time (in samples)')
        ax.set_ylabel('Time (in samples)')
        ax.set_title('Joint recurrence matrix')
        ax.imshow(result['jrp'], plt.cm.binary_r, origin='lower',
                  interpolation='nearest', vmin=0, vmax=1)

    return result
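# Minimal usage sketch for JointRecurrencePlot (the random equal-length test
# signals are an assumption):
import numpy as np
import pandas as pd

x_sig = pd.DataFrame(np.random.randn(150, 1))
y_sig = pd.DataFrame(np.random.randn(150, 1))
res = JointRecurrencePlot(x_sig, y_sig, m=2, t=1, e=0.3, distance='maximum')
print(res['jrp'].shape)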
def __init__(self):
    self.emb = Embedding()
#"twitter-2.txt"] prints="" rmpc = 1 params = params.params() params.rmpc = rmpc parr=[] for file1,file2,scorefile in zip(farr1,farr2,farr_score): sentence_file_1=prefix+file1 sentence_file_2=prefix+file2 golds=datapre.getGolds(prefix+scorefile) sentence_list_1=datapre.DataPre(sentence_file_1) sentence_list_2=datapre.DataPre(sentence_file_2) emb1=Embedding.Embedding(sentence_list_1,clause_weight,phrase_weight,word_weight,words,word_emb,params) emb2=Embedding.Embedding(sentence_list_2,clause_weight,phrase_weight,word_weight,words,word_emb,params) printstr=data_io.sim_evaluate(emb1,emb2,golds) parr.append(printstr) ############################################# #print(parr) sum2012=0 sum2013=0 sum2014=0 sum2015=0 sick=0 twitter=0 n12=0 n13=0
        Y_i = pre.dct_reconstruct(X_l, Y, X_h)
        signal_wmd[:, i] = Y_i
    embeded = tf.istt_my(signal_wmd, length=y.shape[0])
    return embeded, ns


path = 'F:/audio_wm/audio/'
all_file = pre.filter_file(path, 'wav')
# all_file = ['./audio/batman-5min.wav']
n_dt = 8192
L_n = 32
n_code = 32  # number of bits divided by 4; the watermark here is 128 bits
p0 = eb.seed_generate(L_n)
P = eb.pn_code_generate(16, p0)
np.save('F:/audio_wm/data/p0.npy', p0)
for filepath in all_file:
    print("Embedding in " + filepath)
    aFullFilename = os.path.split(filepath)[-1]
    filename = aFullFilename.split('.')[0]
    audio, sr = librosa.load(filepath, sr=44100, mono=False)
    embedded = np.zeros_like(audio)
    ns = '841F-4483-3ABE-A5D0-E496-5B23-CFA1-2AD7'
    if audio.ndim > 1:
        for j in range(audio.shape[0]):
            y = audio[j, :]
            embedded[j, :], ns = embed(y, n_dt, ns, P, n_code=32)
    else:
n_images = 1000
data = GetData("../CycleGAN_shoes/Toy/shoes_boots_heels_white_black/")
data_path0 = "../CycleGAN_shoes/Toy/My_interpretability/0/"
data_path1 = "../CycleGAN_shoes/Toy/AutoEncoderByClusterDirection/0/"
data_path2 = "../CycleGAN_shoes/Toy/PCA_AutoEncoder/0/"
data_path3 = "../CycleGAN_shoes/Toy/PCA_GIST/0/"
bar = progressbar.ProgressBar(maxval=n_images,
                              widgets=[progressbar.Bar('=', '[', ']'), ' ',
                                       progressbar.Percentage()])
data_paths = [data_path0, data_path1, data_path2, data_path3]
simuliators = []
for method in range(len(data_paths)):
    simuliators.append(UserSimuliator(
        data_paths[method], 2, 5, 3, Embedding("3Features"),
        "../CycleGAN_shoes/Toy/shoes_boots_heels_white_black/",
        Embedding("LabelToyShoes"), random.choice(data)))
results = np.zeros([len(data_paths), n_iterations, 4])
bar.start()
for im in range(n_images):
    bar.update(im + 1)
    im_ = random.choice(data)
    for simuliator in range(len(simuliators)):
        simuliators[simuliator].put_chosen_picture(im_)
        for it in range(n_iterations):
            results[simuliator][it] += simuliators[simuliator].one_iteration()
bar.finish()
for simuliator in range(len(simuliators)):
def docVec(doc):
    d = Embedding.docEncoder(doc)
    d = BiGRU.documentGRU(d)
    return d
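# One plausible way to combine the two encoders above for retrieval-style
# scoring (a sketch; the cosine-similarity step is an assumption, not part of
# the original snippets):
import torch.nn.functional as F

def relevance(query, doc):
    q = queryVec(query)
    d = docVec(doc)
    return F.cosine_similarity(q, d, dim=-1)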
import math

# %%
embedding_dim = 100
offset = 2

# %%
# read dataset and dictionary
data_train = pd.read_csv('../dataset/Train.csv')
X_train = data_train['TEXT'].values
Y_train = data_train['Label'].values
Y_train = to_categorical(Y_train)
emoji_map = pd.read_csv('../dataset/Mapping.csv')
data_test = pd.read_csv('../dataset/Test.csv')
X_test = data_test['TEXT'].values

# remove special symbols and stopwords from the train set
X_rm = em.corpus_pre(X_train)

# segmentation
rm_symbols = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
tokenizer = Tokenizer(
    filters=rm_symbols,  # filters: symbols that need to be removed
    split=" ",
    lower=True           # lower: convert to lowercase
)
# The tokenizer reads the train set free of special symbols; the results are
# stored in the tokenizer handle.
tokenizer.fit_on_texts(X_rm)
l2 = math.ceil(sum([len(s.split(" ")) for s in X_rm]) / len(X_rm))
X_pd, tokenizer = mo.toknz(X_rm, l2 + offset, tokenizer)
ind_dict = tokenizer.word_index

# %%
X_seq = []
for sentence in X_rm:
subprocess.call(["cp", im1, result_dir2 + str(index) + ".jpg"]) index += 1 if FLAGS.create_real_pairs: #n_features_embedding, step, result_dir step = FLAGS.step result_dir = FLAGS.result_dir data_path1 = result_dir + str(2 * step) + "/" data_path2 = result_dir + str(2 * step + 1) + "/" result = result_dir + str(2 * step) + "_" + str(2 * step + 1) + "/" result_dir1 = result_dir + "test_AtoB_" + str(step) + "/" result_dir2 = result_dir + "test_BtoA_" + str(step) + "/" BuildPairsFromRealImages(data_path1, data_path2, result, result_dir1, result_dir2, Embedding(FLAGS.embedding_name)) if FLAGS.check_ranking_for_the_clusters: def QualityBinaryClassifier(result): #result_format: [[score, truth]] result.sort(key=lambda x: x[0]) result = np.array(result) n_ones_total = np.sum(result[:, 1]) print(result.shape, n_ones_total) binary_cls_result = 0. n_ones_current = 0. for i in range(result.shape[0]): if (result[i][1] > 0.5): n_ones_current += 1. current_quality = (