def val(epoch, validation_dataframe, embed_dict, dual_encoder, optimizer,
        loss_func, device):
    shuffle_dataframe(validation_dataframe)
    validation_correct_count = 0
    sum_loss_validation = 0.0
    dual_encoder.eval()
    for index, row in validation_dataframe.iterrows():
        if row["Label"] == 1:
            row = {
                "Context": row["Context"],
                "Utterance": row["Correct"],
                "Label": 1
            }
        else:
            row = {
                "Context": row["Context"],
                "Utterance": random.choice(list(row)[3:]),
                "Label": 0
            }
        context = get_embeddings(embed_dict, row['Context'])
        response = get_embeddings(embed_dict, row['Utterance'])
        label = float(row['Label'])
        context = autograd.Variable(torch.FloatTensor(context).view(
            len(context), 1, -1), requires_grad=False).to(device)
        response = autograd.Variable(torch.FloatTensor(response).view(
            len(response), 1, -1), requires_grad=False).to(device)
        label = autograd.Variable(
            torch.FloatTensor(torch.from_numpy(np.array(label).reshape(
                1, 1)))).to(device)
        score = dual_encoder(context, response)
        loss = loss_func(score, label)
        sum_loss_validation += loss.data
        validation_correct_count = increase_count(validation_correct_count,
                                                  torch.sigmoid(score), label)
    validation_accuracy = validation_correct_count / len(validation_dataframe)
    val_loss = sum_loss_validation / len(validation_dataframe)
    return validation_accuracy, val_loss
def setup_runtime(self, ref_samples):
    self.model.eval()
    self.metric_fc.eval()
    # create reference
    self.create_ref_dataset(ref_samples)
    self.ref_embs = get_embeddings(self.model, self.dl['ref'], self.device)
    self.logger.info('Calculated reference embeddings.')
def predict(self, test_samples, test_labels=None, return_raw=False):
    self.create_test_dataset(
        test_samples,
        ['good'] * len(test_samples) if test_labels is None else test_labels)
    test_embs = get_embeddings(self.model, self.dl['test'], self.device,
                               return_y=False)
    sample_distances = n_by_m_distances(test_embs, self.ref_embs)
    if return_raw:
        return sample_distances.min(axis=-1), sample_distances
    return sample_distances.min(axis=-1)
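# For reference: a minimal sketch of what an `n_by_m_distances` helper could
# look like. The actual implementation is not shown in this snippet, so this
# is an assumption: it computes the full n-by-m matrix of Euclidean distances
# between each test embedding and each reference embedding, and the anomaly
# score in predict() is then the distance to the nearest reference sample.
import numpy as np

def n_by_m_distances_sketch(a, b):
    """Pairwise Euclidean distances between rows of a (n, d) and b (m, d)."""
    # Broadcasting: (n, 1, d) - (1, m, d) -> (n, m, d), then norm over d.
    return np.linalg.norm(a[:, None, :] - b[None, :, :], axis=-1)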
def build_graph(config):
    word2idx, idx2word = get_vocabs(config['vocab_file'])
    embeddings = get_embeddings(word2idx, config['s2v_file'])
    weights = config.get('weights', [1 for _ in config['metrics']])
    assert len(config['metrics']) == len(weights)
    metrics = {m: {'weight': w} for m, w in zip(config['metrics'], weights)}
    if 'lm' in metrics:
        metrics['lm'].update(
            dict(forward=config['lm_save_dir'],
                 reverse=config.get('lm_rev_save_dir', None),
                 num_words=len(word2idx)))
    if 'cos' in metrics:
        idf_file = config.get('idf_file', None)
        if idf_file is not None:
            metrics['cos'].update(
                dict(idf=get_idf_vector(idf_file, word2idx),
                     embeddings=embeddings))
        else:
            metrics['cos'].update(dict(embeddings=embeddings))
    sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=sess_config)
    model_inputs, model_outputs = get_model(metrics, mode=config['mode'])
    if 'lm' in metrics:
        init_lm_checkpoints(metrics['lm'])
    sess.run(tf.global_variables_initializer())
    return sess, model_inputs, model_outputs, embeddings, word2idx, idx2word
def train(epoch, training_dataframe, embed_dict, dual_encoder, optimizer,
          loss_func, device):
    shuffle_dataframe(training_dataframe)
    sum_loss_training = 0.0
    training_correct_count = 0
    dual_encoder.train()
    for index, row in training_dataframe.iterrows():
        if row["Label"] == 1:
            row = {
                "Context": row["Context"],
                "Utterance": row["Correct"],
                "Label": 1
            }
        else:
            row = {
                "Context": row["Context"],
                "Utterance": random.choice(list(row)[3:]),
                "Label": 0
            }
        context = get_embeddings(embed_dict, row['Context'])
        response = get_embeddings(embed_dict, row['Utterance'])
        label = row['Label']
        label = np.array(label).astype(np.float32)
        context = autograd.Variable(torch.FloatTensor(context).view(
            len(context), 1, -1), requires_grad=False).to(device)
        response = autograd.Variable(torch.FloatTensor(response).view(
            len(response), 1, -1), requires_grad=False).to(device)
        label = autograd.Variable(torch.FloatTensor(
            torch.from_numpy(np.array(label).reshape(1, 1))),
            requires_grad=False).to(device)
        score = dual_encoder(context, response)
        loss = loss_func(score, label)
        sum_loss_training += loss.data
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        training_correct_count = increase_count(training_correct_count,
                                                torch.sigmoid(score), label)
    training_accuracy = training_correct_count / len(training_dataframe)
    train_loss = sum_loss_training / len(training_dataframe)
    return training_accuracy, train_loss
def update_database(in_path, image, name):
    with open(in_path, "rb") as pkl_in:
        database = pickle.load(pkl_in)
    embeddings_set, id_to_name = database
    if name:
        embeddings, _ = get_embeddings(image)
    else:
        embeddings, name = get_embeddings(image)
    if embeddings is not None:
        embeddings_set = torch.cat(
            (embeddings_set, embeddings.reshape(1, 1, -1)), dim=0)
        id_to_name[len(id_to_name)] = name
    database = [embeddings_set, id_to_name]
    with open(in_path, "wb") as pkl_out:
        pickle.dump(database, pkl_out)
def checkImages():
    try:
        # Select the relevant information from the user request.
        databaseName = request.args.get('databaseName')
        imageId = request.args.get('imageId')
        # Acquire the database images based on the request
        # (img1 and img2, or their precomputed vectors, are assumed to be
        # loaded here from the database identified above).
        face1 = get_face_image(img1)
        face2 = get_face_image(img2)
        # The embedding step can be skipped if the vectors are already
        # stored (saves roughly 1 s).
        vector1 = get_embeddings(face1)
        vector2 = get_embeddings(face2)
        # Compute the distance (the distance measure can be changed).
        distOfImages = calc_dist(vector1, vector2)
        # Derive the match score for the pair of images.
        score = get_match_score(distOfImages)
        # Return the score as the API response to the client.
        return jsonify({"Score": score})
    except Exception:
        return jsonify({'trace': traceback.format_exc()})
def create_database(in_path, out_path):
    images_list = os.listdir(in_path)
    embeddings_set = torch.rand(len(images_list), 1, 512)
    id_to_name = {}
    for i, image in enumerate(images_list):
        embeddings, name = get_embeddings(os.path.join(in_path, image))
        if embeddings is not None:
            embeddings_set[i] = embeddings
            id_to_name[i] = name
    database = [embeddings_set, id_to_name]
    with open(out_path, "wb") as pkl_out:
        pickle.dump(database, pkl_out)
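# A hedged usage sketch for the two database helpers above. The directory
# layout and file names here are assumptions for illustration only; the
# helpers expect get_embeddings() to return an (embedding, name) pair per
# image and persist [embeddings_set, id_to_name] as a pickle.
if __name__ == "__main__":
    create_database("faces/enrolled", "face_db.pkl")  # build the initial database
    update_database("face_db.pkl", "faces/new/alice.jpg", "alice")  # append one entry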
def findBest(self, utterance, options):
    """
    finds the best utterance out of all those given in options
    :param utterance: a single string
    :param options: a sequence of strings
    :return: returns one of the strings of options
    """
    self.dual_encoder.eval()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    scores = []
    context_embed = get_embeddings(self.embed_dict, utterance)
    context = autograd.Variable(torch.FloatTensor(context_embed).view(
        len(context_embed), 1, -1), requires_grad=False).to(device)
    for answer in options:
        response = get_embeddings(self.embed_dict, answer)
        response = autograd.Variable(torch.FloatTensor(response).view(
            len(response), 1, -1), requires_grad=False).to(device)
        score = self.dual_encoder(context, response)
        scores.append(score)
    pred = np.argmax(scores)  # pick the answer with the highest score
    return options[pred]
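# A minimal usage sketch for findBest, assuming `selector` is an instance of
# the class above with a trained dual encoder and its embedding dictionary
# already loaded (the name `selector` is illustrative, not from the original).
best = selector.findBest(
    "how do I reset my password?",
    ["Click 'Forgot password' on the login page.",
     "Our offices open at 9am.",
     "Try turning it off and on again."])
print(best)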
def prepare(args, config):
    word2idx, idx2word = get_vocabs(args.vocab_file)
    try:
        embeddings = get_embeddings(word2idx, args.w2v_file)
    except FileNotFoundError:
        logging.info(
            'embedding file not found. Train embeddings from scratch instead')
        embeddings = None
    with tf.variable_scope('LanguageModel'):
        model_inputs, model_outputs = get_model(config, embeddings,
                                                len(word2idx))
    sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=sess_config)
    sess.run(tf.global_variables_initializer())
    return word2idx, model_inputs, model_outputs, sess
def visualize_after_eval(self, values, test_files, test_labels, test_y_trues):
    auc, pauc, norm_threshs, norm_factor, scores, raw_scores = values
    self.logger.debug(f'# of test files: {len(test_files)}')
    self.logger.debug('distribution' + str(get_class_distribution(test_labels)))
    self.logger.info(f'AUC = {auc}')
    # get worst test info
    test_anomaly_idx = np.where(test_y_trues)[0]
    scores_anomaly = scores[test_anomaly_idx]
    worst_test_idxs = test_anomaly_idx[scores_anomaly.argsort()[:self.n_mosts]]
    worst_test_info = self.get_test_xx_most_info(worst_test_idxs, raw_scores,
                                                 self.ds['ref'], self.ds['test'])
    # visualize embeddings
    classes = sorted(list(set(test_labels)))
    classes = ['good'] + [l for l in classes if l != 'good']
    test_embs = get_embeddings(self.model, self.dl['test'], self.device,
                               return_y=False)
    visualize_embeddings(title='Class embeddings distribution',
                         embeddings=test_embs,
                         ys=[classes.index(label) for label in test_labels],
                         classes=classes)
    plt.show()
    # Best/Worst cases per class
    for cls in classes:
        test_mask = [label == cls for label in test_labels]
        test_idx = np.where(test_mask)[0]
        scores_cls = scores[test_mask]
        class_worst_test_idxs = test_idx[scores_cls.argsort()[:self.n_mosts]]
        worst_test_info = self.get_test_xx_most_info(class_worst_test_idxs,
                                                     raw_scores,
                                                     self.ds['ref'],
                                                     self.ds['test'])
        class_best_test_idxs = test_idx[scores_cls.argsort()[::-1][:self.n_mosts]]
        best_test_info = self.get_test_xx_most_info(class_best_test_idxs,
                                                    raw_scores,
                                                    self.ds['ref'],
                                                    self.ds['test'])
        if cls == 'good':
            worst_test_info, best_test_info = best_test_info, worst_test_info
        self.show_test_matching_images('Best: ' + cls, best_test_info)
        plt.show()
        self.show_test_matching_images('Worst: ' + cls, worst_test_info)
        plt.show()
def prepare_decoder(self, targets):
    """Prepares targets for transformer decoder."""
    shape = utils.shape_list(targets)
    # sequence should be [batch, seq_length]
    assert len(shape) == 2, 'Sequence tensors should be 2-dimensional'
    assert (len(self.hparams.query_shape) == 1
            ), 'query shape should be 1-dimensional'
    # Mask random positions
    if self.hparams.target_dropout:
        targets = tf.where(
            tf.random.uniform(shape) < self.hparams.target_dropout,
            tf.zeros_like(targets),
            targets,
        )
    # Shift positions
    targets = tf.expand_dims(targets, axis=-1)
    targets = utils.right_shift_blockwise_nd(targets, self.hparams.query_shape)
    targets = tf.squeeze(targets, axis=-1)
    # Add token embeddings
    targets = utils.get_embeddings(
        targets=targets,
        hidden_size=self.hparams.embedding_dims,
        vocab_size=self.vocab_size,
    )
    if self.hparams.dropout:
        targets = tf.nn.dropout(targets, 1 - self.hparams.dropout)
    targets = tf.layers.dense(targets, self.hidden_size, activation=None,
                              name='emb_dense')
    if self.hparams.add_timing_signal:
        targets += utils.get_timing_signal_1d(self.hparams.max_target_length,
                                              self.hidden_size)
    return targets
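# For intuition: with a 1-D query shape, the blockwise right shift above
# behaves roughly like the standard decoder shift -- each position only sees
# earlier tokens. A hedged NumPy sketch of a plain right shift by one
# (the blockwise variant generalizes this to block-raster order):
import numpy as np

targets = np.array([[11, 12, 13, 14]])            # [batch, seq_length]
shifted = np.pad(targets, ((0, 0), (1, 0)))[:, :-1]
print(shifted)                                     # [[ 0 11 12 13]]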
def train(FLAGS):
    """ Train our embeddings. """
    # Get data loaders
    print("==> Reading and processing the data ... ", end="")
    train_loader, test_loader, num_unique_words, \
        num_unique_documents, word_to_idx = process_data(
            data_dir=FLAGS.data_dir,
            vocab_size=FLAGS.vocab_size,
            window_size=FLAGS.window_size,
            split_ratio=FLAGS.split_ratio,
            batch_size=FLAGS.batch_size,
        )
    print("[COMPLETE]")

    # Load pretrained GloVe embeddings for our vocab
    embedding_dir = os.path.join(basedir, "../../../../embeddings/glove")
    embedding_dim = 100
    embeddings = get_embeddings(
        embedding_dir=embedding_dir,
        embedding_dim=embedding_dim,
        words=word_to_idx.keys(),
    )

    # Initialize model, criterion, loss
    print("==> Initializing model components ... ", end="")
    model = MLP(
        D_in_words=num_unique_words,
        D_in_documents=num_unique_documents,
        embedding_dim=FLAGS.embedding_dim,
        num_hidden_units=FLAGS.num_hidden_units,
        window_size=FLAGS.window_size,
        embeddings=embeddings,
    )

    # Objective
    criterion = torch.nn.CrossEntropyLoss()

    # Optimizer
    # Only get the parameters with gradients (we freeze our GloVe embeddings)
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=FLAGS.lr)
    print("[COMPLETE]")

    # Train the model
    print("==> Training the model ... [IN PROGRESS]")
    model = training_procedure(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        train_loader=train_loader,
        test_loader=test_loader,
        num_epochs=FLAGS.num_epochs,
        learning_rate=FLAGS.lr,
        decay_rate=FLAGS.decay_rate,
        max_grad_norm=FLAGS.max_grad_norm,
        log_every=FLAGS.log_every,
    )
    print("\n[COMPLETE]")

    # Save the model
    print("==> Saving the model ... [IN PROGRESS]")
    torch.save(model, os.path.join(basedir, FLAGS.data_dir, "model.pt"))
    print("\n[COMPLETE]")
def generate_emb():
    global all_data, best_embeddings, best_model, all_hyperparameters, \
        curr_hyperparameters, best_hyperparameters, all_losses, device
    all_data = preprocess.read_graphs(str(config.DATASET_DIR / "edge_list.txt"))
    # Iterate through hyperparameter type (shuffled)
    shuffled_param_type = random.sample(all_hyperparameters.keys(),
                                        len(all_hyperparameters.keys()))
    for param_type in shuffled_param_type:
        # Iterate through hyperparameter values of the specified type (shuffled)
        shuffled_param_val = random.sample(
            all_hyperparameters[param_type],
            len(all_hyperparameters[param_type]))
        for param_val in shuffled_param_val:
            # Initiate current hyperparameter
            curr_hyperparameters[param_type] = param_val
            print(curr_hyperparameters)
            log_f.write(str(curr_hyperparameters) + "\n")
            # Set up
            model = mdl.TrainNet(all_data.x.shape[1],
                                 curr_hyperparameters['hidden'],
                                 curr_hyperparameters['output'],
                                 config.CONV.lower().split("_")[0],
                                 curr_hyperparameters['dropout']).to(device)
            optimizer = torch.optim.Adam(
                model.parameters(),
                lr=curr_hyperparameters['lr'],
                weight_decay=curr_hyperparameters['wd'])
            # Train model
            model.train()
            curr_losses = []
            for epoch in range(config.EPOCHS):
                loss = train(epoch, model, optimizer)
                curr_losses.append(loss)
            all_losses[";".join(
                [str(v) for v in curr_hyperparameters.values()])] = curr_losses
        # Set up for next hyperparameter
        curr_hyperparameters[param_type] = best_hyperparameters[param_type]
    print("Best Hyperparameters: ", best_hyperparameters)
    print("Optimization finished!")
    log_f.write("Best Hyperparameters: %s \n" % best_hyperparameters)
    # Move the best model to CPU before extracting embeddings
    device = torch.device('cpu')
    best_model = best_model.to(device)
    best_embeddings = utils.get_embeddings(best_model, all_data, device)
    # Test
    test(best_model)
    # Save best embeddings
    torch.save(best_embeddings,
               config.DATASET_DIR / (config.CONV.lower() + "_embeddings.pth"))
    'RBR': 39, 'POS': 40, 'PDT': 41, 'UH': 42, 'WP': 43, 'JJ|NN': 44,
    'AFX': 45
}

dl = Data_Load('train', pipeline, dep_type2id, pos_tag2id)
matrix, word2id, id2word, X_dummy, Y, files_len_list = dl.get_X_Y(int(doc_num))
# matrix = torch.tensor(matrix, dtype=torch.long)
# X_dummy = torch.tensor(X_dummy, dtype=torch.float)
print("load embedding...")
embeddings_matrix = utils.get_embeddings(
    "gensim_glove_vectors" + embed_dim + "d.txt", word2id, int(embed_dim))
print("finally finish loading!!!")

START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = embeddings_matrix.shape[1] * matrix.shape[1] + X_dummy.shape[1]
HIDDEN_DIM = 400

labels = list(set(Y))
not_removed_label = list(set(Y))
print(labels)
tag_to_ix = dict(zip(labels, range(len(set(Y)))))
id_to_tag = dict(zip(range(len(set(Y))), labels))
tag_to_ix[START_TAG] = len(tag_to_ix)
tag_to_ix[STOP_TAG] = len(tag_to_ix)
def get_X_Y(self, max_num=10000):
    files = os.listdir(self.tag_path)
    X = []
    X_catg = []
    X_pos = []
    X_pre = []
    Y = []
    dep_type2id = {}
    pos_tag2id = {}
    i = 0
    # files = ['protocol_615.txt']
    for f in files:
        if i == max_num:
            break
        i += 1
        print(f)
        no_affix_f = f[:f.index(".txt")]
        sent_pos_list = get_sent_pos_list(self.tag_path + f)
        line_list = get_line_list(self.original_path + f)
        ngram_lemmas_syn_list = get_ngram_lemmas_syn(sent_pos_list, self.nlp)
        label_list = get_labels(self.original_path + no_affix_f + ".ann")
        doc_list = get_line_list(self.original_path + f)
        len_list = [0] * len(doc_list)
        for j in range(1, len(doc_list)):
            len_list[j] = len_list[j - 1] + len(doc_list[j - 1])
        len_list.extend([1000000])
        len_label_list = [[] for i in range(len(doc_list))]
        one_len = []
        flag = 1
        for entityidx, info, words in label_list:
            # print(info)
            info = info.split(";")[0]
            clas, start, end = info.split(" ")
            slot, num = first(len_list, lambda x: int(start) < x)
            len_label_list[slot - 1].append((words, clas))
            """if int(start) < len_list[flag]:
                one_len.append((words, clas))
            else:
                flag += 1
                len_label_list.append(one_len)
                one_len = [(words, clas)]
            if len(len_label_list) < len(doc_list):
                len_label_list.append([])
            """
        for idx, sent_list in enumerate(ngram_lemmas_syn_list):
            # Each word_feat looks like:
            # (['<START>', '<PAD>', '<PAD>', '<PAD>'], ['SpinSmart', 'NNP', 'SpinSmart'],
            #  ['Plasmid', 'plasmid_DNA', '<PAD>', '<PAD>'], ['<PAD>', '<PAD>'],
            #  ['Plasmid', 'compound']), (['SpinSmart', '<PAD>', '<PAD>', '<PAD>'],
            #  ['Plasmid', 'NNP', 'Plasmid'], ['Purification', 'refining', 'refinement',
            #  'purgation'], ['compound', 'SpinSmart'], ['Purification', 'compound'])
            for word_feat in sent_list:
                if word_feat[3][0] not in dep_type2id.keys():
                    dep_type2id[word_feat[3][0]] = len(dep_type2id)
                if word_feat[4][1] not in dep_type2id.keys():
                    dep_type2id[word_feat[4][1]] = len(dep_type2id)
                if word_feat[1][1] not in pos_tag2id.keys():
                    pos_tag2id[word_feat[1][1]] = len(pos_tag2id)
                if word_feat[1][0] == 'a':
                    Y.append('o')
                else:
                    Y.append(
                        first(len_label_list[idx],
                              lambda x: x[0].find(word_feat[1][0]) >= 0)[1][1])
                x_feat = []
                x_feat.extend(word_feat[0])
                x_feat.extend([word_feat[1][0], word_feat[1][2]])
                x_feat.extend(word_feat[2])
                x_feat.extend([word_feat[3][1]])
                x_feat.extend([word_feat[4][0]])
                x_feat.extend([word_feat[1][1]])
                x_feat.extend([word_feat[3][0]])
                x_feat.extend([word_feat[4][1]])
                X_pre.append(x_feat)
    for x_feat in X_pre:
        cat_g = []
        deprel = [0] * len(dep_type2id)
        deprel[dep_type2id[x_feat[-2]]] = 1
        govrel = [0] * len(dep_type2id)
        govrel[dep_type2id[x_feat[-1]]] = 1
        cat_g.extend(deprel)
        cat_g.extend(govrel)
        pos_feat = [0] * len(pos_tag2id)
        pos_feat[pos_tag2id[x_feat[-3]]] = 1
        x_feat = x_feat[:-3]
        X.append(x_feat)
        X_catg.append(cat_g)
        X_pos.append(pos_feat)
    X_catg = np.array(X_catg)
    X_pos = np.array(X_pos)
    vocab = get_word_vocab.Vocab(X)
    word2id = vocab.get_word2id()
    matrix = vocab.get_matrix()
    print("load embedding...")
    embeddings_matrix = get_embeddings(self.glove_path, word2id, self.emb_dim)
    print("finally finish loading!!!")
    embedding = nn.Embedding(embeddings_matrix.shape[0],
                             embeddings_matrix.shape[1])
    embedding.weight = nn.Parameter(embeddings_matrix)
    embedding.weight.requires_grad = False
    embedding_X = np.array(embedding(torch.LongTensor(matrix)))
    embedding_X = embedding_X.reshape(
        embedding_X.shape[0], embedding_X.shape[1] * embedding_X.shape[2])
    # print(embedding_X.shape)
    final_X = np.concatenate((embedding_X, X_catg, X_pos), axis=1)
    print(final_X.shape)
    Y = np.array(Y)
    print(Y.shape)
    return final_X, Y
def main():
    print("Starting x-ray fine-tuning script at %s" %
          (str(datetime.datetime.now())))

    assert args.split >= 0 and args.split <= 9, \
        "Split number can only be in [0,9]."
    split_idx = args.split

    ## Ensure output directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    #-------------------
    # Load data, setup device
    #-------------------
    all_embeddings = utils.get_embeddings("covidx", args.mask, args.model)
    all_task_labels = utils.get_task_labels("covidx")
    all_domain_labels = utils.get_domain_labels("covidx")

    #-------------------
    # Generate splits with same random numbers
    #-------------------
    np.random.seed(args.repetition)
    torch.manual_seed(args.repetition)

    train_splits = []
    val_splits = []
    test_splits = []
    kf = StratifiedKFold(n_splits=10, shuffle=True,
                         random_state=args.repetition)
    for i, (train_index, test_index) in enumerate(
            kf.split(all_embeddings, all_domain_labels)):
        train_index, val_index = train_test_split(
            train_index, test_size=0.1,
            stratify=all_domain_labels[train_index],
            random_state=args.repetition)
        train_splits.append(train_index)
        val_splits.append(val_index)
        test_splits.append(test_index)

    for i in range(len(train_splits)):
        assert len(np.unique(all_task_labels[train_splits[i]])) == 3
        assert len(np.unique(all_task_labels[val_splits[i]])) == 3
        assert len(np.unique(all_task_labels[test_splits[i]])) == 3
        assert len(np.unique(all_domain_labels[train_splits[i]])) == 5
        assert len(np.unique(all_domain_labels[val_splits[i]])) == 5
        assert len(np.unique(all_domain_labels[test_splits[i]])) == 5

    #-------------------
    # Setup datasets/dataloaders
    #-------------------
    scaler = StandardScaler()
    scaler = scaler.fit(all_embeddings[train_splits[split_idx]])
    all_embeddings = scaler.transform(all_embeddings)

    train_dataset = training_utils.EmbeddingMultiTaskDataset(
        all_embeddings[train_splits[split_idx]],
        [all_task_labels[train_splits[split_idx]],
         all_domain_labels[train_splits[split_idx]]],
    )
    val_dataset = training_utils.EmbeddingMultiTaskDataset(
        all_embeddings[val_splits[split_idx]],
        [all_task_labels[val_splits[split_idx]],
         all_domain_labels[val_splits[split_idx]]],
    )
    test_dataset = training_utils.EmbeddingMultiTaskDataset(
        all_embeddings[test_splits[split_idx]],
        [all_task_labels[test_splits[split_idx]],
         all_domain_labels[test_splits[split_idx]]]
    )

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=args.batch_size,
        num_workers=args.num_dataloader_workers,
        pin_memory=True,
    )
    train_unshuffled_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=False,
        batch_size=args.batch_size,
        num_workers=args.num_dataloader_workers,
        pin_memory=True,
    )
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        num_workers=1,
        pin_memory=True,
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=args.batch_size,
        num_workers=1,
        pin_memory=True,
    )

    #-------------------
    # Training
    #-------------------
    mlp = training_utils.MultiTaskMLP(
        utils.get_model_embedding_sizes(args.model),
        args.hidden_layer_size, [3, 5])
    mlp = mlp.to(device)
    optimizer = optim.AdamW(mlp.parameters(), lr=1e-3, amsgrad=True)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min")

    criterions = [
        nn.CrossEntropyLoss(),  # task loss
    ]
    if args.use_feature_disentanglement:
        criterions.append(nn.CrossEntropyLoss())  # domain loss

    # Losses
    training_task_losses = []
    training_domain_losses = []
    validation_task_losses = []
    validation_domain_losses = []
    # AUCs
    validation_task_aucs = []
    validation_domain_aucs = []
    validation_task_accs = []
    validation_domain_accs = []
    # Other
    model_checkpoints = []
    num_times_lr_dropped = 0

    for epoch in range(args.num_epochs):
        lr = training_utils.get_lr(optimizer)
        task_lr = args.lr_start / (
            1 + ((args.lr_start - 1) *
                 (epoch / (args.num_epochs - 1))**args.lr_exponent))

        training_losses = training_utils.fit(
            mlp, device, train_dataloader, optimizer, criterions, epoch,
            task_lr_multiplier=task_lr
        )
        validation_losses = training_utils.evaluate(
            mlp, device, val_dataloader, criterions, epoch
        )

        ## Record training/validation metrics
        training_task_losses.append(training_losses[0])
        validation_task_losses.append(validation_losses[0])
        if len(training_losses) > 1:
            training_domain_losses.append(training_losses[1])
            validation_domain_losses.append(validation_losses[1])

        ## Embed the entire training and validation sets
        train_set_embedding = training_utils.embed(
            mlp, device, train_unshuffled_dataloader)
        val_set_embedding = training_utils.embed(mlp, device, val_dataloader)

        ## Fit near optimal LR model on task labels
        task_lr_model = LogisticRegression(C=0.001, max_iter=20,
                                           random_state=args.repetition)
        task_lr_model.fit(train_set_embedding,
                          all_task_labels[train_splits[split_idx]])
        y_pred_proba = task_lr_model.predict_proba(val_set_embedding)
        validation_task_auc = roc_auc_score(
            all_task_labels[val_splits[split_idx]], y_pred_proba,
            average="macro", multi_class="ovr")
        validation_task_acc = np.mean(utils.get_per_class_accuracies(
            all_task_labels[val_splits[split_idx]],
            y_pred_proba.argmax(axis=1)))
        print("Learned val task AUC:", validation_task_auc)
        validation_task_aucs.append(validation_task_auc)
        validation_task_accs.append(validation_task_acc)

        ## Fit near optimal LR model on domain labels
        domain_lr_model = LogisticRegression(C=0.001, max_iter=20,
                                             random_state=args.repetition)
        domain_lr_model.fit(train_set_embedding,
                            all_domain_labels[train_splits[split_idx]])
        y_pred_proba = domain_lr_model.predict_proba(val_set_embedding)
        validation_domain_auc = roc_auc_score(
            all_domain_labels[val_splits[split_idx]], y_pred_proba,
            average="macro", multi_class="ovr")
        validation_domain_acc = np.mean(utils.get_per_class_accuracies(
            all_domain_labels[val_splits[split_idx]],
            y_pred_proba.argmax(axis=1)))
        print("Learned val dataset AUC:", validation_domain_auc)
        validation_domain_aucs.append(validation_domain_auc)
        validation_domain_accs.append(validation_domain_acc)

        ## Copy near optimal LR model to model
        mlp.heads[1].weight.data = torch.from_numpy(
            domain_lr_model.coef_.astype(np.float32)).to(device)
        mlp.heads[1].bias.data = torch.from_numpy(
            domain_lr_model.intercept_.astype(np.float32)).to(device)

        model_checkpoints.append(copy.deepcopy(mlp.state_dict()))

        ## Early stopping
        scheduler.step(validation_losses[0])
        if training_utils.get_lr(optimizer) < lr:
            num_times_lr_dropped += 1
            print("")
            print("Learning rate dropped")
            print("")
        if num_times_lr_dropped == 3:
            break

    #-------------------
    # Testing
    #-------------------
    # Select best model
    if args.use_feature_disentanglement:
        best_model_checkpoint = model_checkpoints[
            np.argmin(validation_domain_aucs)]
    else:
        best_model_checkpoint = model_checkpoints[
            np.argmax(validation_task_aucs)]
    mlp.load_state_dict(best_model_checkpoint)

    # Evaluate on the test set
    y_pred_proba = training_utils.score(mlp, device, test_dataloader, 0)
    test_task_auc = roc_auc_score(
        all_task_labels[test_splits[split_idx]], y_pred_proba,
        average="macro", multi_class="ovr")
    test_task_acc = np.mean(utils.get_per_class_accuracies(
        all_task_labels[test_splits[split_idx]],
        y_pred_proba.argmax(axis=1)))
    print("Test task AUC:", test_task_auc)
    print("Test task ACC:", test_task_acc)
    print("")

    y_pred_proba = training_utils.score(mlp, device, test_dataloader, 1)
    test_domain_auc = roc_auc_score(
        all_domain_labels[test_splits[split_idx]], y_pred_proba,
        average="macro", multi_class="ovr")
    test_domain_acc = np.mean(utils.get_per_class_accuracies(
        all_domain_labels[test_splits[split_idx]],
        y_pred_proba.argmax(axis=1)))
    print("Test domain AUC:", test_domain_auc)
    print("Test domain ACC:", test_domain_acc)
    print("")

    # Save embeddings if we want to make UMAPs
    if args.save_embeddings:
        train_embedding = training_utils.embed(
            mlp, device, train_unshuffled_dataloader)
        val_embedding = training_utils.embed(mlp, device, val_dataloader)
        test_embedding = training_utils.embed(mlp, device, test_dataloader)
        all_embeddings = np.concatenate([
            train_embedding, val_embedding, test_embedding
        ], axis=0)

    #-------------------
    # Save everything
    #-------------------
    save_obj = {
        'args': args,
        'training_task_losses': training_task_losses,
        'training_domain_losses': training_domain_losses,
        'validation_task_losses': validation_task_losses,
        'validation_domain_losses': validation_domain_losses,
        'validation_task_aucs': validation_task_aucs,
        'validation_task_accs': validation_task_accs,
        'validation_domain_aucs': validation_domain_aucs,
        'validation_domain_accs': validation_domain_accs,
        "test_task_auc": test_task_auc,
        "test_task_acc": test_task_acc,
        "test_domain_auc": test_domain_auc,
        "test_domain_acc": test_domain_acc,
        "checkpoints": model_checkpoints
    }
    save_obj_fn = "covidx_%s_%s_%s_split-%d_%d_lr-%0.1f_hls-%d.pkl" % (
        args.mask, args.model,
        "disentangle" if args.use_feature_disentanglement else "no-disentangle",
        args.split, args.repetition, args.lr_start, args.hidden_layer_size)
    with open(os.path.join(args.output_dir, save_obj_fn), 'wb') as f:
        pickle.dump(save_obj, f)

    if args.save_embeddings:
        save_embedding_fn = "covidx_%s_%s_%s_split-%d_%d_lr-%0.1f_hls-%d.npy" % (
            args.mask, args.model,
            "disentangle" if args.use_feature_disentanglement else "no-disentangle",
            args.split, args.repetition, args.lr_start, args.hidden_layer_size)
        np.save(os.path.join(args.output_dir, save_embedding_fn),
                all_embeddings)
def encode_mean_embeddings(string, embedding_map):
    embeddings = utils.get_embeddings(string, embedding_map)
    encoded = np.mean(embeddings, axis=0)
    return Variable(torch.FloatTensor(encoded))
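# For context: a minimal sketch of the per-token lookup that
# encode_mean_embeddings relies on. The real utils.get_embeddings is not
# shown here, so the whitespace tokenization, the `dim` default, and the
# zero-vector OOV handling below are assumptions for illustration.
import numpy as np

def get_embeddings_sketch(string, embedding_map, dim=300):
    """Return one vector per whitespace token; zeros for OOV tokens."""
    return np.stack([
        embedding_map.get(token, np.zeros(dim, dtype=np.float32))
        for token in string.split()
    ])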
        accs.append(utils.get_per_class_accuracies(
            y_test, y_pred_proba.argmax(axis=1),
            number_classes=number_of_classes))
        aucs.append(utils.get_binary_aucs(y_test, y_pred_proba,
                                          number_classes=number_of_classes))

    accs = np.array(accs)
    aucs = np.array(aucs)
    average_aucs = np.mean(aucs, axis=1)
    average_accs = np.mean(accs, axis=1)
    results = {
        "average auc": (np.mean(average_aucs), np.std(average_aucs)),
        "average acc": (np.mean(average_accs), np.std(average_accs)),
    }
    return results


models = ["xrv", "histogram", "densenet", "covidnet"]
masks = ["masked", "unmasked"]

print("Linear model performance discriminating between sub-datasets in the "
      "COVIDx dataset from pre-trained embeddings")
print("")
print("Masking method,Feature extractor model,Average AUC,Average ACC")
for mask in masks:
    for model in models:
        embeddings = utils.get_embeddings("covidx", mask, model)
        labels = utils.get_domain_labels("covidx")
        results = do_experiment(embeddings, labels, number_of_classes=5)
        print("%s,%s,%0.2f +/- %0.2f,%0.2f +/- %0.2f" % (
            mask, model,
            results["average auc"][0], results["average auc"][1],
            results["average acc"][0], results["average acc"][1],
        ))
def main():
    print("Starting x-ray inference script at %s" %
          (str(datetime.datetime.now())))

    # Parse out model arguments from the model filename -- we guarantee that
    # our saved models will look like this
    assert os.path.exists(args.model_fn), "Model file does not exist"
    model_parts = parse(
        "covidx_{mask}_{model}_{disentangle}_split-{split}_{repetition}_lr-50.0_hls-64.pkl",
        os.path.basename(args.model_fn))
    masked = model_parts["mask"] == "masked"
    base_model = model_parts["model"]
    assert base_model in ["xrv", "densenet"]
    disentangled = model_parts["disentangle"] == "disentangle"

    metadata_df = pd.read_csv(args.input_fn)
    if masked:
        original_fns = metadata_df["masked_image_path"].values
    else:
        original_fns = metadata_df["unmasked_image_path"].values
    num_samples = original_fns.shape[0]

    ## Ensure all input files exist
    for fn in original_fns:
        assert os.path.exists(fn), "Input doesn't exist: %s" % (fn)
        file_extension = fn.split(".")[-1]
        assert file_extension.lower() in ALLOWED_FILENAMES, \
            "Input does not have a correct file extension: %s" % (file_extension)

    ## Ensure output directory exists
    output_dir = os.path.dirname(args.output_fn)
    if os.path.exists(output_dir):
        if os.path.exists(args.output_fn):
            if not args.overwrite:
                print("WARNING: The output file exists, exiting...")
                return
    else:
        os.makedirs(output_dir, exist_ok=True)

    ## Embed images with whatever the base model is
    tic = float(time.time())
    # these will be the masked versions if we are using a masked model
    images = utils.get_images(original_fns)
    print("Finished loading images in %0.4f seconds" % (time.time() - tic))

    tic = float(time.time())
    if masked:
        images = utils.transform_to_equalized(images)
    if base_model == "xrv":
        images = utils.transform_to_xrv(images)
        xrv_model = utils.get_xrv_model(device)
        embeddings = utils.run_densenet_model(xrv_model, device, images,
                                              global_max_pool=False,
                                              embedding_size=1024,
                                              batch_size=64)
    elif base_model == "densenet":
        images = utils.transform_to_standardized(images)
        densenet_model = utils.get_densenet121(device)
        embeddings = utils.run_densenet_model(densenet_model, device, images,
                                              global_max_pool=False,
                                              embedding_size=1024,
                                              batch_size=64)
    else:
        raise ValueError("Not implemented yet")

    ## Adjusting for normalization
    repetition = int(model_parts["repetition"])
    split_idx = int(model_parts["split"])
    all_embeddings = utils.get_embeddings("covidx", model_parts["mask"],
                                          base_model)
    all_task_labels = utils.get_task_labels("covidx")
    all_domain_labels = utils.get_domain_labels("covidx")

    np.random.seed(repetition)
    torch.manual_seed(repetition)
    train_splits = []
    val_splits = []
    test_splits = []
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=repetition)
    for i, (train_index, test_index) in enumerate(
            kf.split(all_embeddings, all_domain_labels)):
        train_index, val_index = train_test_split(
            train_index, test_size=0.1,
            stratify=all_domain_labels[train_index],
            random_state=repetition)
        train_splits.append(train_index)
        val_splits.append(val_index)
        test_splits.append(test_index)

    scaler = StandardScaler()
    scaler = scaler.fit(all_embeddings[train_splits[split_idx]])
    embeddings = scaler.transform(embeddings)

    test_dataset = training_utils.EmbeddingMultiTaskDataset(
        embeddings, [np.zeros(embeddings.shape[0])])
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=64,
        num_workers=1,
        pin_memory=True,
    )
    print("Finished embedding images in %0.4f seconds" % (time.time() - tic))

    ## Run embeddings through the saved model
    tic = float(time.time())
    with open(args.model_fn, "rb") as f:
        saved_model_params = pickle.load(f)

    mlp = training_utils.MultiTaskMLP(
        utils.get_model_embedding_sizes(base_model),
        saved_model_params["args"].hidden_layer_size, [3, 5])
    mlp = mlp.to(device)

    if disentangled:
        best_model_checkpoint = saved_model_params["checkpoints"][np.argmin(
            saved_model_params["validation_domain_aucs"])]
    else:
        best_model_checkpoint = saved_model_params["checkpoints"][np.argmax(
            saved_model_params["validation_task_aucs"])]
    mlp.load_state_dict(best_model_checkpoint)

    test_set_pred_proba = training_utils.score(mlp, device, test_dataloader, 0)
    test_set_embedding = training_utils.embed(mlp, device, test_dataloader)
    print("Finished loading/running saved model in %0.4f seconds" %
          (time.time() - tic))

    # save output
    output_fn = args.output_fn
    if not output_fn.endswith(".npy"):
        output_fn += ".npy"
    np.save(output_fn, test_set_pred_proba)

    if args.save_embeddings:
        output_fn = output_fn.replace(".npy", "_embeddings.npy")
        np.save(output_fn, test_set_embedding)
            num_const += 1
        else:
            # var = tf.Variable(init(shape=[dim]))
            # embed_list.append(var)
            embed_list.append(embed_list[2])
            num_vars += 1
    print num_const, num_vars
    return tf.stack(embed_list, axis=0)


if __name__ == '__main__':
    assert sys.argv[1] in models.keys()
    print 'using model', sys.argv[1]

    print 'loading data'
    start = time()
    trainset, dev, test, vocab = utils.get_datasets(batch_size=BATCH_SIZE,
                                                    num_words=VOCAB_SIZE,
                                                    seq_len=SEQ_LEN)
    print 'took', time() - start, 'seconds'

    start = time()
    print 'getting embeddings'
    embeddings = utils.get_embeddings(vocab, './glove.6B/glove.6B.300d.txt')
    print 'took', time() - start, 'seconds'

    print 'initializing embeddings'
    start = time()
    embeddings = init_embeddings(embeddings, vocab, 300)
    print 'took', time() - start, 'seconds'

    print 'begin training'
    train(vocab, embeddings, trainset, dev, test)
def build_model(self, word_index, use_skipgram=True):
    """Return a compiled Keras model for sentence classification.

    Parameters
    ----------
    word_index : List of tokens in input data
    use_skipgram : Boolean, whether to use fasttext skipgram word vectors.
        If false, cbow model word vectors will be used instead.

    Returns
    -------
    model : A compiled Keras model for predicting six types of toxicity
        in a sentence.
    attention_layer_model : A Keras model for extracting the attention
        layer output.
    """
    # conv_filters_1 = 64
    # conv_filters_2 = 128
    gru_units = [96, 96, 96]
    dense_units = [64]
    dropout_prob = 0.3

    model_input = Input(shape=(self.num_timesteps, ), name='model_input')
    embedding_matrix = get_embeddings(word_index=word_index,
                                      embedding_dim=self.embedding_dim,
                                      use_ft_embeddings=self.use_ft,
                                      use_skipgram=use_skipgram)
    x = Embedding(
        len(word_index) + 1,  # +1 for 0 padding token
        self.embedding_dim,
        weights=[embedding_matrix],
        input_length=self.num_timesteps,
        trainable=False)(model_input)
    '''
    x = Conv1D(filters=conv_filters_1, kernel_size=3, padding='same',
               activation='elu')(x)
    x = BatchNormalization()(x)
    x = SpatialDropout1D(0.3)(x)
    x = Conv1D(filters=conv_filters_2, kernel_size=3, padding='same',
               activation='elu')(x)
    x = BatchNormalization()(x)
    x = SpatialDropout1D(0.3)(x)
    '''
    for n in range(len(gru_units)):
        x = SpatialDropout1D(dropout_prob)(x)
        x = Bidirectional(
            CuDNNGRU(units=gru_units[n], return_sequences=True))(x)
        x = BatchNormalization()(x)
    x = TimeDistributed(Activation('tanh'))(x)
    x = SpatialDropout1D(dropout_prob)(x)
    attention = self._attention_3d_block(inputs=x)
    dense_input = GlobalMaxPool1D()(attention)
    if self.use_aux_input:
        aux_input = Input(shape=(3, ), name='aux_input')
        dense_input = concatenate([dense_input, aux_input])
    for n in range(len(dense_units)):
        dense = Dropout(dropout_prob)(dense_input)
        dense = Dense(dense_units[n], activation=None)(dense)
        dense = BatchNormalization()(dense)
        dense = Activation('elu')(dense)
    dense = Dropout(dropout_prob)(dense)
    probs = Dense(6, activation='sigmoid')(dense)
    if self.use_aux_input:
        self.model = Model(inputs=[model_input, aux_input], output=probs)
    else:
        self.model = Model(inputs=model_input, output=probs)
    self.model.compile(loss='binary_crossentropy',
                       optimizer='rmsprop',
                       metrics=['accuracy'])
    print(self.last_attention_layer_name)
    self.attention_layer_model = Model(
        inputs=self.model.input,
        outputs=self.model.get_layer(self.last_attention_layer_name).output)
def architecture(
    params,
    mode,
    context,
    context_len,
    utterance,
    utterance_len,
):
    """Return the output operation following the network architecture.

    Returns:
        Logits output Op for the network.
    """
    # if mode == TRAIN:
    #     print('TRAIN:::::')
    #     print('context: ', context)
    #     print('context_len: ', context_len)
    #     print('utterance: ', utterance)
    #     print('utterance_len: ', utterance_len)
    # elif mode == EVAL:
    #     print('EVAL:::::')
    #     print('context: ', context)
    #     print('context_len: ', context_len)
    #     print('utterance: ', utterance)
    #     print('utterance_len: ', utterance_len)

    # Initialize embeddings randomly or with pre-trained vectors if available
    embeddings_W = get_embeddings(params)
    # context = tf.Print(context, [context], '############ context ##########: ')
    # utterance = tf.Print(utterance, [utterance], "utterance: ")

    # Embed the context and the utterance
    context_embedded = tf.nn.embedding_lookup(
        embeddings_W, context, name="embed_context")
    if mode != PREDICT:
        utterance_embedded = tf.nn.embedding_lookup(
            embeddings_W, utterance, name="embed_utterance")
    # print('context_embedded: ', context_embedded)
    # print('utterance_embedded: ', utterance_embedded)

    with tf.variable_scope("rnn") as vs:
        # We use LSTM cells, optionally stacked
        # cell = tf.nn.rnn_cell.LSTMCell(
        #     params.rnn_dim,
        #     forget_bias=2.0,
        #     use_peepholes=True)
        rnn_dims = params.rnn_dim.split(',')
        cell = [
            tf.nn.rnn_cell.LSTMCell(
                int(rnn_dim), forget_bias=2.0, use_peepholes=True)
            for rnn_dim in rnn_dims
        ]
        cell = tf.nn.rnn_cell.MultiRNNCell(cell)
        cell = tf.nn.rnn_cell.DropoutWrapper(
            cell,
            input_keep_prob=FLAGS.input_keep_prob,
            output_keep_prob=FLAGS.output_keep_prob,
            state_keep_prob=FLAGS.state_keep_prob)

        # Run the utterance and context through the RNN
        # context_len = tf.Print(context_len, [context_len], "context_len: ")
        # utterance_len = tf.Print(utterance_len, [utterance_len], 'utterance_len: ')
        if mode != PREDICT:
            tmp_concat = tf.concat([context_embedded, utterance_embedded], 0)
            tmp_concat_len = tf.concat([context_len, utterance_len], 0)
        else:
            tmp_concat = context_embedded
            tmp_concat_len = context_len
        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
            cell, tmp_concat, tmp_concat_len, dtype=tf.float32)
        if isinstance(rnn_states, list) or isinstance(rnn_states, tuple):
            rnn_states = rnn_states[0]
        # context_embedded:   Tensor("embed_context:0", shape=(64, 160, 300), dtype=float32)
        # utterance_embedded: Tensor("embed_utterance:0", shape=(64, 160, 300), dtype=float32)
        # tf.concat([context_embedded, utterance_embedded]): Tensor("concat:0", shape=(128, 160, 300), dtype=float32)
        # tf.concat([context_len, utterance_len]): Tensor("concat_1:0", shape=(128,), dtype=int64)
        if mode != PREDICT:
            encoding_context, encoding_utterance = tf.split(rnn_states.h, 2, 0)
        else:
            encoding_context = rnn_states.h

    with tf.variable_scope("prediction") as vs:
        M = tf.get_variable(
            "M",
            shape=[FLAGS.last_rnn_dim, FLAGS.last_rnn_dim],
            initializer=tf.truncated_normal_initializer())

        # "Predict" a response: c * M
        generated_response = tf.matmul(encoding_context, M)
        print('matmul_weight: ', params.matmul_weight)
        if mode == PREDICT and not params.matmul_weight:
            return generated_response
        generated_response = tf.expand_dims(generated_response, 2)
        if mode == PREDICT and params.matmul_weight:
            return generated_response
        encoding_utterance = tf.expand_dims(encoding_utterance, 2)

        # Dot product between generated response and actual response: (c * M) * r
        # logits = tf.batch_matmul(generated_response, encoding_utterance, True)
        logits = tf.matmul(generated_response, encoding_utterance, True)
        logits = tf.squeeze(logits, [2])
        return logits
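# For intuition: a hedged NumPy sketch of the dual-encoder scoring used above.
# Each (context, response) pair is scored as sigmoid(c^T M r), where M is the
# learned matrix that maps a context encoding to a "predicted" response
# encoding. The sizes and values below are toy assumptions for illustration.
import numpy as np

rng = np.random.default_rng(0)
d = 4                                  # toy encoding size
c = rng.standard_normal(d)             # context encoding
r = rng.standard_normal(d)             # response encoding
M = rng.standard_normal((d, d))        # learned prediction matrix

logit = (c @ M) @ r                    # (c * M) * r from the code above
prob = 1.0 / (1.0 + np.exp(-logit))    # probability the response matches
print(prob)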
def get_embeddings(title, body):
    return utils.get_embeddings(title), utils.get_embeddings(body)
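# A hedged usage sketch for the wrapper above; the example strings are
# illustrative. It simply embeds a post's title and body independently,
# delegating the actual vectorization to utils.get_embeddings.
title_emb, body_emb = get_embeddings(
    "How to parse JSON in Python?",
    "I have a string containing JSON and want a dict.")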
def run(config, model_name):
    config = load_yaml(config)
    if model_name not in config['model']:
        raise NotImplementedError("{} is not implemented.".format(model_name))
    preprocessing_params = config['preprocessing']
    training_params = config['training']
    model_params = config['model'][model_name]

    train_df = pd.read_csv(preprocessing_params['train_path'], sep='\t')
    test_df = pd.read_csv(preprocessing_params['test_path'], sep='\t')
    t_list = preprocessing_params['target_list']
    model_params['targets'] = len(t_list)

    train_df['tokens'] = train_df['Tweet'].map(lambda x: tokenize(x))
    test_df['tokens'] = test_df['Tweet'].map(lambda x: tokenize(x))
    train_df['lengths'] = train_df['tokens'].map(lambda x: len(x))
    test_df['lengths'] = test_df['tokens'].map(lambda x: len(x))

    word_freq_dict = create_freq_vocabulary(
        list(train_df['tokens']) + list(test_df['tokens']))
    tokens = get_top_freq_words(word_freq_dict, 1)
    train_df = train_df.sort_values(by="lengths")
    test_df = test_df.sort_values(by="lengths")

    embeddings = get_embeddings(path=preprocessing_params['embeddings_path'])
    w2i = create_final_dictionary(tokens, embeddings,
                                  unk_token=preprocessing_params['unk_token'],
                                  pad_token=preprocessing_params['pad_token'])
    emb_matrix = get_embeddings_matrix(w2i, embeddings,
                                       preprocessing_params['embedding_size'])
    model_params['embeddings'] = emb_matrix

    train_batches = create_batches(train_df, training_params['batch_size'],
                                   w2i=w2i,
                                   pad_token=preprocessing_params['pad_token'],
                                   unk_token=preprocessing_params['unk_token'],
                                   target_list=t_list)
    test_batches = create_batches(test_df, training_params['batch_size'],
                                  w2i=w2i,
                                  pad_token=preprocessing_params['pad_token'],
                                  unk_token=preprocessing_params['unk_token'],
                                  target_list=t_list)

    model = ModelFactory.get_model(model_name, model_params)
    optimizer = Adam(model.trainable_weights, training_params['lr'])
    criterion = BCEWithLogitsLoss()

    train(model, train_batches, test_batches, optimizer, criterion,
          epochs=training_params['epochs'],
          init_patience=training_params['patience'],
          cuda=False,
          target_list=t_list)
    model = load_model(model)
    full_classification_report(model, test_batches, t_list)
def run_validation(model, settings, image_size=128, visual=None,
                   visual_location=None):
    """Run validation sequence on model.

    Args:
        model: Model on which to perform the validation.
        settings: Settings object holding the image/label files and
            re-identification parameters.
        image_size: Size of the bounding boxes after resize.
        visual: Visualize the frame with bounding boxes and ids.
        visual_location: Directory in which to store the visualized frames.
    """
    mot_metric = MOTMetric(auto_id=True)

    # Get the label file.
    with open(settings.labels_file, 'rb') as file:
        labels_dict = pickle.load(file)

    # Open the validation sequence.
    with h5py.File(settings.images_file, 'r') as sequence:
        # Loop over every validation sequence
        for seq in settings.sequences_val:
            # Create embedding database.
            embeds_database = EmbeddingsDatabase(settings.memory_length,
                                                 settings.memory_update)

            # Loop over every frame in the current sequence
            for i, frame in enumerate(sequence['seq' + str(seq)]):
                # Get the ground truth labels for the current frame
                gt_labels = labels_dict['seq' + str(seq)]['frame' + str(i)]
                obj_ids, obj_bbs = [], []
                for label in gt_labels.values():
                    obj_ids.append(label['track_id'])
                    obj_bbs.append([label['left'], label['top'],
                                    label['right'], label['bottom']])

                # Get the embeddings and bounding boxes by running the model
                if settings.detector:
                    embeddings, boxes, labels, probs = model(frame)
                    hyp_bbs = np.array(boxes, dtype=int)
                else:
                    embeddings = get_embeddings(model, frame, gt_labels)
                    hyp_bbs = obj_bbs.copy()

                # Perform the re-identification
                hyp_ids = embeds_database.match_embeddings(
                    embeddings, settings.max_distance)

                # Update the MOT metric.
                mot_metric.update(obj_ids, hyp_ids,
                                  np.array(obj_bbs.copy()),
                                  np.array(hyp_bbs.copy()))  # << CHANGE THIS BACK!

                if visual == 're-id':
                    # Visualize the frame with bounding boxes and ids.
                    show_frame_with_ids(frame, hyp_bbs.copy(), hyp_ids,
                                        frame_num=i,
                                        seq_name='seq{}'.format(str(seq)),
                                        visual_location=visual_location)
                elif visual == 'detect':
                    show_frame_with_labels(frame, boxes, labels, probs)

            # Create gif.
            if visual == 're-id':
                # scene_labels = sorted(np.array(os.listdir(scene_label_dir)))
                loc = '{}/seq{}'.format(visual_location, str(seq))
                images = []
                filenames = sorted(np.array(os.listdir(loc)))
                for i in range(len(filenames)):
                    filenames[i] = re.findall(r'\d+', filenames[i])[0]
                filenames = np.array(filenames, dtype=int)
                filenames = sorted(filenames)
                for i in range(len(filenames)):
                    filenames[i] = loc + '/frame' + str(filenames[i]) + '.jpg'
                for filename in filenames:
                    images.append(imageio.imread(filename))
                imageio.mimsave(loc + 'movie.gif', images, duration=0.10)

            if settings.print_embed_avg:
                print('Average embedding cost sequence {}: {:.3f}'.format(
                    str(seq), embeds_database.get_average_cost()))

    # Return the MOT metric object
    return mot_metric
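# For reference: a hedged sketch of how an EmbeddingsDatabase could match new
# embeddings to stored identities (the real class is not shown in this
# snippet, so the greedy nearest-neighbor policy below is an assumption).
# Each embedding gets the id of its nearest stored embedding when the
# distance is below max_distance; otherwise a fresh id is created.
import numpy as np

class EmbeddingsDatabaseSketch:
    def __init__(self):
        self.embs, self.ids, self.next_id = [], [], 0

    def match_embeddings(self, embeddings, max_distance):
        assigned = []
        for e in embeddings:
            e = np.asarray(e, dtype=np.float32).ravel()
            if self.embs:
                dists = [np.linalg.norm(e - s) for s in self.embs]
                best = int(np.argmin(dists))
                if dists[best] < max_distance:
                    assigned.append(self.ids[best])
                    self.embs[best] = e  # refresh the stored embedding
                    continue
            # No match close enough: enroll a new identity.
            self.embs.append(e)
            self.ids.append(self.next_id)
            assigned.append(self.next_id)
            self.next_id += 1
        return assigned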
print("Nb test queries: {}".format(len(q_test))) # Load gold hypernyms (train and dev only) print("Loading gold hypernyms...") path_h_train = "{}/training/gold/{}.training.gold.txt".format( args.dir_datasets, dataset_name_exp) path_h_dev = "{}/trial/gold/{}.trial.gold.txt".format( args.dir_datasets, dataset_name_exp) h_train = utils.load_hypernyms(path_h_train, normalize=True) h_dev = utils.load_hypernyms(path_h_dev, normalize=True) print("Nb training pairs: {}".format(sum(len(x) for x in h_train))) print("Nb dev pairs: {}".format(sum(len(x) for x in h_dev))) # Load word embeddings print("Loading pre-trained word embeddings...") embed_vocab_list, word2vec = utils.get_embeddings(args.path_embeddings, np.float32) embed_vocab_set = set(embed_vocab_list) print("Nb embeddings: {}".format(len(embed_vocab_list))) # Check for candidates that don't have a pre-trained emedding print("Checking for candidates that don't have a pre-trained embedding...") oov_candidates = set(c for c in candidates if c not in embed_vocab_set) print("Nb candidates without a pre-trained embedding: {}".format( len(oov_candidates))) if len(oov_candidates): print("WARNING: {} candidates will be assigned a random embedding.". format(len(oov_candidates))) # Check for queries that don't have a pre-trained embedding print( "Checking for training queries that don't have a pre-trained embedding..."