def convert_leftright(self, part: pd.DataFrame, text_key: str, length_text_key: str, raw_text_key: str, **kargs):
    """Convert one side (left or right) of a MatchZoo DataPack into lookup tables.

    Returns a tuple ``(ids, contents, lengths, raw_contents, positions)`` where
    ``ids`` is an ndarray of row indices and the rest are dicts keyed by those
    indices (left_id / right_id).
    """
    FileHandler.myprint(
        "[NOTICE] MatchZoo use queryID and docID as index in dataframe left and right, "
        "therefore, iterrows will return index which is left_id or right_id")
    collected_ids = []
    contents, lengths, positions = {}, {}, {}
    raw_contents = {}
    for row_id, row in part.iterrows():
        collected_ids.append(row_id)
        padded_text = row[text_key]  # already converted to token ids and padded
        raw_contents[row_id] = row[raw_text_key]
        # fall back to the padded length when no explicit length column exists
        true_len = row[length_text_key] if length_text_key in row else len(padded_text)
        assert true_len != 0
        assert row_id not in contents
        contents[row_id] = padded_text
        lengths[row_id] = true_len
        # position vector: 1..true_len followed by zeros up to the padded length
        positions[row_id] = np.pad(np.arange(true_len) + 1,
                                   (0, len(padded_text) - true_len), 'constant')
    return np.array(collected_ids), contents, lengths, raw_contents, positions
def load_best_model_test2_test3(self, test2: interactions.MatchInteraction, test3: interactions.MatchInteraction, topN: int):
    """Restore the best saved checkpoint and evaluate it on the hard Test2 set.

    Saves the per-query ranking dump for error analysis and logs the metrics.
    Returns ``(hits, ndcg)`` at ``topN`` on the Test2 set.
    """
    net = self._net
    net.load_state_dict(torch.load(self.saved_model))
    net.train(False)  # evaluation mode
    my_utils.gpu(net, self._use_cuda)

    # sanity-check the query count against the project-wide expected sizes
    assert len(test2.unique_queries_test) in KeyWordSettings.QueryCountTest
    metrics, ranking_dump = self.evaluate(test2, topN, output_ranking=True)
    hits_test2 = metrics["hits"]
    ndcg_test2 = metrics["ndcg"]
    ndcg_at_1_test2 = metrics["ndcg@1"]

    FileHandler.save_error_analysis_test2(
        json.dumps(ranking_dump, sort_keys=True, indent=2))
    FileHandler.myprint(
        'Best Test2_hard hits@%d = %.5f | Best Test2_hard ndcg@%d = %.5f '
        '|Best Test2_hard ndcg@1 = %.5f ' %
        (topN, hits_test2, topN, ndcg_test2, ndcg_at_1_test2))
    return hits_test2, ndcg_test2
def load_best_model_single(self, target_interactions: interactions.MatchInteraction, topN: int):
    """Restore the best checkpoint and evaluate one interaction set.

    Note: This function is used for Heat map visualization only, so no
    query-count assertion is applied to the target set.
    Returns ``(hits, ndcg)`` at ``topN``.
    """
    net = self._net
    net.load_state_dict(torch.load(self.saved_model))
    net.train(False)  # evaluation mode
    my_utils.gpu(net, self._use_cuda)

    metrics, ranking_dump = self.evaluate(target_interactions, topN, output_ranking=True)
    hits = metrics["hits"]
    ndcg = metrics["ndcg"]
    ndcg_at_1 = metrics["ndcg@1"]

    FileHandler.save_error_analysis_validation(
        json.dumps(ranking_dump, sort_keys=True, indent=2))
    FileHandler.myprint(
        'Best Target hits@%d = %.5f | Best Target ndcg@%d = %.5f '
        '|Best Target ndcg@1 = %.5f' % (topN, hits, topN, ndcg, ndcg_at_1))
    return hits, ndcg
def __init__(self, data_pack: matchzoo.DataPack):
    """Build fast-lookup structures (text + images) from a MatchZoo DataPack.

    Converts the left (query) and right (doc) dataframes into per-id dicts,
    then aligns them with the relation table into padded numpy arrays ready
    for batching.
    """
    super(MatchInteraction, self).__init__()
    FileHandler.myprint("Converting DataFrame to Normal Dictionary of Data")
    # Left side (queries). NOTE(review): "imgages" is a historical typo in the
    # attribute name; kept because code outside this view may reference it.
    self.unique_query_ids, \
    self.dict_query_contents, \
    self.dict_query_lengths, \
    self.dict_query_raw_contents, \
    self.dict_query_positions, \
    self.dict_query_imgages = self.convert_leftright(data_pack.left,
                                                     text_key="text_left",
                                                     length_text_key="length_left",
                                                     raw_text_key="raw_text_left",
                                                     images_key="images_left")
    self.data_pack = data_pack
    assert len(self.unique_query_ids) == len(set(
        self.unique_query_ids)), "Must be unique ids"
    # Right side (documents).
    self.unique_doc_ids, \
    self.dict_doc_contents, \
    self.dict_doc_lengths, \
    self.dict_doc_raw_contents, \
    self.dict_doc_positions, \
    self.dict_doc_imgages = self.convert_leftright(data_pack.right,
                                                   text_key="text_right",
                                                   length_text_key="length_right",
                                                   raw_text_key="raw_text_right",
                                                   images_key="images_right")
    assert len(self.unique_doc_ids) == len(set(
        self.unique_doc_ids)), "Must be unique ids for doc ids"
    # Sanity check: query and doc id sets are expected to differ in size here.
    assert len(self.unique_query_ids) != len(self.unique_doc_ids)

    # Align positives/negatives with the relation table.
    self.pos_queries, \
    self.pos_docs, \
    self.negatives, \
    self.unique_queries_test = self.convert_relations(data_pack.relation)

    # for queries, padded
    self.np_query_contents = np.array(
        [self.dict_query_contents[q] for q in self.pos_queries])
    self.np_query_lengths = np.array(
        [self.dict_query_lengths[q] for q in self.pos_queries])
    self.query_positions = np.array(
        [self.dict_query_positions[q] for q in self.pos_queries])
    self.query_images = np.array(
        [self.dict_query_imgages[q] for q in self.pos_queries])
    # for docs, padded
    self.np_doc_contents = np.array(
        [self.dict_doc_contents[d] for d in self.pos_docs])
    self.np_doc_lengths = np.array(
        [self.dict_doc_lengths[d] for d in self.pos_docs])
    self.doc_positions = np.array(
        [self.dict_doc_positions[d] for d in self.pos_docs])
    self.doc_images = np.array(
        [self.dict_doc_imgages[d] for d in self.pos_docs])
    assert self.np_query_lengths.shape == self.np_doc_lengths.shape
    # Every row shares the same padded size, so row 0 defines the lengths.
    self.padded_doc_length = len(self.np_doc_contents[0])
    self.padded_query_length = len(self.np_query_contents[0])
    self.padded_doc_images_len = len(self.doc_images[0])
def __init__(self, queries: List[List[str]], corpus: Dict[str, List[str]], params: dict = None):
    """Build the retrieval index over *corpus* and store the query set.

    :param queries: tokenized queries (list of word lists).
    :param corpus: mapping docid -> list of words.
    :param params: optional scoring parameters (e.g. BM25 b/k1); defaults to
        an empty dict. Using ``None`` as the default avoids the shared
        mutable-default-argument pitfall of ``params: dict = {}``.
    """
    self.queries = queries
    self.index, self.dlt = build_data_structures(corpus)
    # materialize a fresh dict per instance instead of sharing one default
    self.params = {} if params is None else params
    FileHandler.myprint(str(self.params))
def __init__(self, data_pack: matchzoo.DataPack):
    """Build lookup structures for a generation-style DataPack.

    In addition to the usual padded contents, the right side also yields
    decoder input (prepended with <START>) and decoder output (appended with
    <EOS>) sequences for seq2seq training.
    """
    FileHandler.myprint("Converting DataFrame to Normal Dictionary of Data")
    # Left side (queries): encoder inputs only.
    self.unique_query_ids, \
    self.dict_query_contents, \
    self.dict_query_lengths, \
    self.dict_query_raw_contents = self.convert_left(data_pack.left,
                                                     text_key="text_left",
                                                     length_text_key="length_left",
                                                     raw_text_key="raw_text_left")
    self.data_pack = data_pack
    assert len(self.unique_query_ids) == len(set(
        self.unique_query_ids)), "Must be unique ids"
    # Right side (documents): adds decoder input/output variants.
    self.unique_doc_ids, \
    self.dict_doc_contents, \
    self.dict_doc_lengths, \
    self.dict_doc_raw_contents, \
    self.dict_doc_decoder_input, \
    self.dict_doc_decoder_output = self.convert_right(data_pack.right,
                                                      text_key="text_right",
                                                      length_text_key="length_right",
                                                      raw_text_key="raw_text_right")
    assert len(self.unique_doc_ids) == len(set(
        self.unique_doc_ids)), "Must be unique ids for doc ids"
    # Sanity check: query and doc id sets are expected to differ in size here.
    assert len(self.unique_query_ids) != len(self.unique_doc_ids)

    self.pos_queries, \
    self.pos_docs, \
    self.unique_queries_test = self.convert_relations(data_pack.relation)

    # for queries, padded
    self.np_query_contents = np.array(
        [self.dict_query_contents[q] for q in self.pos_queries])
    self.np_query_lengths = np.array(
        [self.dict_query_lengths[q] for q in self.pos_queries])
    # for docs, padded
    self.np_doc_contents = np.array(
        [self.dict_doc_contents[d] for d in self.pos_docs])
    self.np_doc_lengths = np.array(
        [self.dict_doc_lengths[d] for d in self.pos_docs])
    # for docs, padded and prepended with <START> since the decoder's input needs it
    self.np_doc_decoder_input_contents = np.array(
        [self.dict_doc_decoder_input[d] for d in self.pos_docs])
    # for docs, padded and appended with <EOS> since the decoder's output needs it
    self.np_doc_decoder_output_contents = np.array(
        [self.dict_doc_decoder_output[d] for d in self.pos_docs])
    assert self.np_query_lengths.shape == self.np_doc_lengths.shape
    # Every row shares the same padded size, so row 0 defines the lengths.
    self.padded_doc_length = len(self.np_doc_contents[0])
    self.padded_query_length = len(self.np_query_contents[0])
def __init__(self, attn_model, hidden_size: int, output_size: int, embedding_layer,
             label_embedding_layer, bidirectional: bool = False, n_layers: int = 1,
             dropout: float = 0.1, use_self_att=False, use_label: bool = False,
             use_input_feeding: bool = False):
    """GRU decoder with dot-product attention and optional input feeding.

    :param attn_model: attention variant name, forwarded to DotAttention.
    :param hidden_size: GRU hidden state size.
    :param output_size: vocabulary size of the output projection.
    :param embedding_layer: shared token embedding (nn.Embedding-like).
    :param label_embedding_layer: embedding for labels (unused unless use_label).
    :param use_input_feeding: if True, concatenate the previous attentional
        vector to the embedding at each step (Luong-style input feeding).
    """
    super(AttentiveDecoder, self).__init__()
    FileHandler.myprint("Using Attention type : %s" % attn_model)
    self.attn_model = attn_model
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.n_layers = n_layers
    self.dropout = dropout
    self.bidirectional = bidirectional
    self.embedding = embedding_layer
    embedding_size = embedding_layer.embedding_dim
    self.label_embedding = label_embedding_layer
    self.embedding_size = embedding_size
    self.use_label = use_label  # for label embedding
    self.label_embedding_size = 0
    self.embedding_dropout = nn.Dropout(dropout)
    self.input_feeding_size = 0
    self.use_input_feeding = use_input_feeding
    if self.use_input_feeding:
        # we want the feeding vector to be the same size as the hidden state
        self.input_feeding_size = self.hidden_size
    # GRU input is the token embedding, optionally concatenated with the
    # input-feeding vector.
    self.gru = nn.GRU(self.embedding_size + self.input_feeding_size,
                      self.hidden_size,
                      batch_first=True,
                      bidirectional=self.bidirectional,
                      num_layers=self.n_layers)
    # NOTE(review): num_direct is fixed to 1 even when bidirectional=True —
    # confirm this is intended before enabling bidirectional decoding.
    num_direct = 1
    self.concat = nn.Linear(hidden_size * 2 * num_direct, hidden_size)
    self.out = nn.Linear(hidden_size, output_size)
    self.atten_faster = DotAttention(attn_model, num_direct * hidden_size, hidden_size)
def __init__(self, queries_feats_path: str, docs_feats_path: str, fixed_len_left: int, fixed_len_right: int):
    """Load pre-extracted ELMo features for both sides of the match task.

    Parameters
    ----------
    queries_feats_path: `str` path to extracted features for queries
    docs_feats_path: `str` path to extracted features for documents
    fixed_len_left: padded length used when loading query features
    fixed_len_right: padded length used when loading document features
    """
    self.queries_content, self.left_tensor_feats = self.load_elmo_features(
        queries_feats_path, fixed_len_left)
    self.docs_content, self.right_tensor_feats = self.load_elmo_features(
        docs_feats_path, fixed_len_right)
    # log tensor shapes so mismatched fixed lengths are visible early
    FileHandler.myprint("Left Elmo tensor feats: " + str(self.left_tensor_feats.size()))
    FileHandler.myprint("Right Elmo tensor feats: " + str(self.right_tensor_feats.size()))
def _initialize(self, interactions: interactions.MatchInteraction):
    """Prepare the network for training: move it to GPU (if enabled) and
    select the optimizer and the loss function named by ``self._loss``.

    Parameters
    ----------
    interactions: :class:`interactions.MatchInteraction`
    """
    # put the model into cuda if use cuda
    self._net = my_utils.gpu(self._net, self._use_cuda)

    if self._optimizer_func is None:
        self._optimizer = optim.Adam(self._net.parameters(),
                                     weight_decay=self._reg_l2,
                                     lr=self._learning_rate)
    else:
        self._optimizer = self._optimizer_func(self._net.parameters())

    # loss name -> implementation; unknown names fall back to adaptive hinge
    loss_table = {
        'pointwise': my_losses.pointwise_loss,
        'single_pointwise_square_loss': my_losses.single_pointwise_square_loss,
        'bpr': my_losses.bpr_loss,
        'hinge': my_losses.hinge_loss,
        'bce': my_losses.pointwise_bceloss,  # binary cross entropy
        'pce': my_losses.positive_cross_entropy,
        'cosine_max_margin_loss_dvsh': my_losses.cosine_max_margin_loss_dvsh,
        'cross_entropy': my_losses.binary_cross_entropy_cls,
        'masked_cross_entropy': my_losses.masked_binary_cross_entropy,
        'vanilla_cross_entropy': my_losses.vanilla_cross_entropy,
        'regression_loss': my_losses.regression_loss,
    }
    self._loss_func = loss_table.get(self._loss, my_losses.adaptive_hinge_loss)
    FileHandler.myprint("Using: " + str(self._loss_func))
def load_from_file(
        file_path: str,
        mode: str = 'word2vec',
        term_index: mz.preprocessors.units.Vocabulary.TermIndex = None
) -> Embedding:
    """
    Load embedding from `file_path`, keeping only words present in `term_index`.

    :param file_path: Path to file.
    :param mode: Embedding file format mode, one of 'word2vec', 'fasttext'
        or 'glove'. (default: 'word2vec')
    :param term_index: vocabulary used to filter which vectors are kept.
    :raise TypeError: if `mode` is not one of the supported formats.
    :return: An :class:`matchzoo.embedding.Embedding` instance.
    """
    embedding_data = {}
    count_word_hit = 0

    def _collect(f):
        """Parse 'token v1 v2 ...' lines, keeping tokens found in term_index."""
        nonlocal count_word_hit
        for line in f:
            current_line = line.rstrip().split(' ')
            if current_line[0] not in term_index:
                continue
            embedding_data[current_line[0]] = current_line[1:]
            count_word_hit += 1

    if mode == 'word2vec' or mode == 'fasttext':
        # explicit encoding for consistency with the glove branch (and to
        # avoid locale-dependent decoding failures)
        with open(file_path, 'r', encoding="utf-8") as f:
            # header line is "<vocab_size> <dim>"
            output_dim = int(f.readline().strip().split(' ')[-1])
            _collect(f)
    elif mode == 'glove':
        with open(file_path, 'r', encoding="utf-8") as f:
            # glove has no header: infer the dimension from the first row,
            # then rewind so the row is not skipped
            output_dim = len(f.readline().rstrip().split(' ')) - 1
            f.seek(0)
            _collect(f)
    else:
        raise TypeError(
            "%s is not a supported embedding type. `word2vec`, `fasttext` or `glove` expected."
            % mode)
    FileHandler.myprint("Word hit: " + str((count_word_hit, len(term_index))) +
                        " " + str(count_word_hit / len(term_index) * 100))
    return Embedding(embedding_data, output_dim)
def _rank2(self, test2: dict, test3: dict, topN: int, **kargs):
    """Run BM25 evaluation on the two hard re-ranking test sets.

    Saves both ranking dumps for error analysis, logs the metrics and
    returns ``(hits2, ndcg2, hits3, ndcg3)`` at ``topN``.
    """
    assert len(test2) in KeyWordSettings.QueryCountTest, len(test2)
    metrics2, analysis2 = my_evaluator.eval_bm25(test2, topN, output_ranking=True)
    hits = metrics2["hits"]
    ndcg = metrics2["ndcg"]
    ndcg_at_1_val = metrics2["ndcg@1"]

    assert len(test3) in KeyWordSettings.QueryCountTest, len(test3)
    metrics3, analysis3 = my_evaluator.eval_bm25(test3, topN, output_ranking=True)
    hits_test = metrics3["hits"]
    ndcg_test = metrics3["ndcg"]
    ndcg_at_1_test = metrics3["ndcg@1"]

    FileHandler.save_error_analysis_test2(
        json.dumps(analysis2, sort_keys=True, indent=2))
    FileHandler.save_error_analysis_test3(
        json.dumps(analysis3, sort_keys=True, indent=2))
    FileHandler.myprint(
        'Best Test2 hits@%d = %.5f | Best Test2 ndcg@%d = %.5f | Best Test2 ndcg@1 = %.5f '
        '|Best Test3 hits@%d = %.5f |Best Test3 ndcg@%d = %.5f |Best Test3 ndcg@1 = %.5f ' %
        (topN, hits, topN, ndcg, ndcg_at_1_val,
         topN, hits_test, topN, ndcg_test, ndcg_at_1_test))
    return hits, ndcg, hits_test, ndcg_test
def _rank(self, val_scores: dict, test_scores: dict, topN: int, **kargs):
    """Run BM25 evaluation on the validation and test score dicts.

    Saves both ranking dumps for error analysis, logs the metrics and
    returns ``(val_hits, val_ndcg, test_hits, test_ndcg)`` at ``topN``.
    """
    assert len(val_scores) in KeyWordSettings.QueryCountVal, len(val_scores)
    val_metrics, val_analysis = my_evaluator.eval_bm25(val_scores, topN, output_ranking=True)
    hits = val_metrics["hits"]
    ndcg = val_metrics["ndcg"]
    ndcg_at_1_val = val_metrics["ndcg@1"]

    assert len(test_scores) in KeyWordSettings.QueryCountTest, len(test_scores)
    test_metrics, test_analysis = my_evaluator.eval_bm25(test_scores, topN, output_ranking=True)
    hits_test = test_metrics["hits"]
    ndcg_test = test_metrics["ndcg"]
    ndcg_at_1_test = test_metrics["ndcg@1"]

    FileHandler.save_error_analysis_testing(
        json.dumps(test_analysis, sort_keys=True, indent=2))
    FileHandler.save_error_analysis_validation(
        json.dumps(val_analysis, sort_keys=True, indent=2))
    FileHandler.myprint(
        'Best Vad hits@%d = %.5f | Best Vad ndcg@%d = %.5f '
        '|Best Test hits@%d = %.5f |Best Test ndcg@%d = %.5f'
        '|Best Test ndcg@1 = %.5f ' %
        (topN, hits, topN, ndcg, topN, hits_test, topN, ndcg_test, ndcg_at_1_test))
    return hits, ndcg, hits_test, ndcg_test
def fit(self, data_packs: List[matchzoo.DataPack]):
    """Load image tensors for both sides and record the shared feature size.

    Populates ``left_*``/``right_*`` tensors and path<->index maps via
    ``fit_side`` and asserts both sides agree on feature dimensionality.
    """
    FileHandler.myprint("Loading images of queries.....")
    self.left_tensor, self.left_img_path2index, self.left_img_index2path, left_dim = \
        self.fit_side(data_packs,
                      fat_pth_file=self.left_pth_file,
                      max_len_images=self.max_num_left_images,
                      side="left")
    FileHandler.myprint("Loading images of docs.....")
    self.right_tensor, self.right_img_path2index, self.right_img_index2path, right_dim = \
        self.fit_side(data_packs,
                      fat_pth_file=self.right_pth_file,
                      max_len_images=self.max_num_right_images,
                      side="right")
    # both sides must produce features of identical dimensionality
    assert left_dim == right_dim
    self.visual_features_size = left_dim
    FileHandler.myprint("Visual Feature Dimension: %s" % (self.visual_features_size))
def load_best_model(self, val_interactions: interactions.MatchInteraction, test_interactions: interactions.MatchInteraction):
    """Restore the best checkpoint and report validation/test cross-entropy.

    Saves both full result dicts for error analysis and returns
    ``(val_loss, test_loss)``.
    """
    net = self._net
    net.load_state_dict(torch.load(self.saved_model))
    net.train(False)  # evaluation mode
    my_utils.gpu(net, self._use_cuda)

    val_results = self.evaluate(val_interactions)
    test_results = self.evaluate(test_interactions)
    val_loss = val_results["cross_entropy"]
    test_loss = test_results["cross_entropy"]

    FileHandler.save_error_analysis_validation(
        json.dumps(val_results, sort_keys=True, indent=2))
    FileHandler.save_error_analysis_testing(
        json.dumps(test_results, sort_keys=True, indent=2))
    FileHandler.myprint('Best val loss = %.5f |Best Test loss = %.5f ' %
                        (val_loss, test_loss))
    return val_loss, test_loss
def load_best_model(self, val_interactions: interactions.MatchInteraction, test_interactions: interactions.MatchInteraction, topN: int):
    """Restore the best checkpoint and evaluate it on validation and test.

    Saves both ranking dumps for error analysis, logs the metrics and
    returns ``(val_hits, val_ndcg, test_hits, test_ndcg)`` at ``topN``.
    """
    net = self._net
    net.load_state_dict(torch.load(self.saved_model))
    net.train(False)  # evaluation mode
    my_utils.gpu(net, self._use_cuda)

    # sanity-check query counts against project-wide expected sizes
    assert len(val_interactions.unique_queries_test) in KeyWordSettings.QueryCountVal
    val_metrics, val_ranking = self.evaluate(val_interactions, topN, output_ranking=True)
    hits = val_metrics["hits"]
    ndcg = val_metrics["ndcg"]
    ndcg_at_1 = val_metrics["ndcg@1"]

    assert len(test_interactions.unique_queries_test) in KeyWordSettings.QueryCountTest
    test_metrics, test_ranking = self.evaluate(test_interactions, topN, output_ranking=True)
    hits_test = test_metrics["hits"]
    ndcg_test = test_metrics["ndcg"]
    ndcg_at_1_test = test_metrics["ndcg@1"]

    FileHandler.save_error_analysis_validation(
        json.dumps(val_ranking, sort_keys=True, indent=2))
    FileHandler.save_error_analysis_testing(
        json.dumps(test_ranking, sort_keys=True, indent=2))
    FileHandler.myprint(
        'Best Vad hits@%d = %.5f | Best Vad ndcg@%d = %.5f '
        '|Best Test hits@%d = %.5f |Best Test ndcg@%d = %.5f'
        '|Best Vad ndcg@1 = %.5f |Best Test ndcg@1 = %.5f' %
        (topN, hits, topN, ndcg, topN, hits_test, topN, ndcg_test,
         ndcg_at_1, ndcg_at_1_test))
    return hits, ndcg, hits_test, ndcg_test
def fit_models(args):
    """End-to-end training script for the generative (FCRG) model.

    Sets up per-run logging, seeds all RNGs for reproducibility, loads and
    preprocesses the train/dev/test splits, builds word embeddings, then
    trains a :class:`fcrg_model.FCRGModel` via :class:`basic_fitter.BasicFitter`.
    """
    # one log sub-folder per run, keyed by launch timestamp
    if not os.path.exists(args.log):
        os.mkdir(args.log)
    curr_date = datetime.datetime.now().timestamp()  # seconds
    # folder to store all outputed files of a run
    secondary_log_folder = os.path.join(args.log, "log_results_%s" % (int(curr_date)))
    if not os.path.exists(secondary_log_folder):
        os.mkdir(secondary_log_folder)
    logfolder_result = os.path.join(secondary_log_folder, "%s_result.txt" % int(curr_date))
    FileHandler.init_log_files(logfolder_result)
    settings = json.dumps(vars(args), sort_keys=True, indent=2)
    FileHandler.myprint("Running script " + str(os.path.realpath(__file__)))
    FileHandler.myprint(settings)
    FileHandler.myprint("Setting seed to " + str(args.seed))

    # seed every RNG source and force deterministic cudnn behavior
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False
    if args.cuda:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    root = args.path
    t1 = time.time()
    train_pack = load_data.load_data2(root, 'train', prefix=args.dataset)
    valid_pack = load_data.load_data2(root, 'dev', prefix=args.dataset)
    predict_pack = load_data.load_data2(root, 'test', prefix=args.dataset)

    # token-length statistics across the three splits (informational only)
    a = train_pack.left["text_left"].str.lower().str.split().apply(len).max()
    b = valid_pack.left["text_left"].str.lower().str.split().apply(len).max()
    c = predict_pack.left["text_left"].str.lower().str.split().apply(len).max()
    max_query_length = max([a, b, c])
    min_query_length = min([a, b, c])
    a = train_pack.right["text_right"].str.lower().str.split().apply(len).max()
    b = valid_pack.right["text_right"].str.lower().str.split().apply(len).max()
    c = predict_pack.right["text_right"].str.lower().str.split().apply(len).max()
    max_doc_length = max([a, b, c])
    min_doc_length = min([a, b, c])
    FileHandler.myprint("Min query length, " + str(min_query_length) +
                        " Min doc length " + str(min_doc_length))
    FileHandler.myprint("Max query length, " + str(max_query_length) +
                        " Max doc length " + str(max_doc_length))

    # tokenize/pad to fixed lengths using the shared vocab file
    preprocessor = mz.preprocessors.SplitPreprocessor(args.fixed_length_left,
                                                      args.fixed_length_right,
                                                      vocab_file=os.path.join(args.path, "vocab.json"))
    print('parsing data')
    train_processed = preprocessor.fit_transform(train_pack)  # This is a DataPack
    valid_processed = preprocessor.transform(valid_pack)
    predict_processed = preprocessor.transform(predict_pack)

    train_interactions = MatchInteraction(train_processed)
    valid_interactions = MatchInteraction(valid_processed)
    test_interactions = MatchInteraction(predict_processed)
    FileHandler.myprint('done extracting')
    t2 = time.time()
    FileHandler.myprint('loading data time: %d (seconds)' % (t2 - t1))
    FileHandler.myprint("Building model")

    print("Loading word embeddings......")
    t1_emb = time.time()
    term_index = preprocessor.context['vocab_unit'].state['term_index']
    default_embeddings = mz.datasets.embeddings.load_default_embedding(
        dimension=args.word_embedding_size, term_index=term_index)
    # words missing from the embedding file are initialized from N(0, 1)
    embedding_matrix = default_embeddings.build_matrix(
        term_index, initializer=lambda: np.random.normal(0, 1))
    t2_emb = time.time()
    print("Time to load word embeddings......", (t2_emb - t1_emb))

    params = dict()
    params['embedding'] = embedding_matrix
    params["embedding_freeze"] = False  # trainable word embeddings
    params["fixed_length_left"] = args.fixed_length_left
    params["fixed_length_right"] = args.fixed_length_right
    params["embedding_output_dim"] = args.word_embedding_size
    params["embedding_dropout"] = args.embedding_dropout
    params["attention_type"] = args.attention_type
    params["hidden_size"] = args.hidden_size
    params["output_target_size"] = args.output_target_size
    params["bidirectional"] = False
    params["use_label"] = False
    params["use_input_feeding"] = args.use_input_feeding
    params["nlayers"] = 1
    generative_model = fcrg_model.FCRGModel(params)

    FileHandler.myprint("Fitting Model")
    fit_model = basic_fitter.BasicFitter(net=generative_model,
                                         loss=args.loss_type,
                                         n_iter=args.epochs,
                                         batch_size=args.batch_size,
                                         learning_rate=args.lr,
                                         early_stopping=args.early_stopping,
                                         use_cuda=args.cuda,
                                         clip=args.clip,
                                         logfolder=secondary_log_folder,
                                         curr_date=curr_date,
                                         vocab=preprocessor.context['vocab_unit'])
    try:
        fit_model.fit(train_interactions,
                      verbose=True,
                      val_interactions=valid_interactions,
                      test_interactions=test_interactions)
        fit_model.load_best_model(valid_interactions, test_interactions)
    except KeyboardInterrupt:
        # allow manual interruption without losing the log tail
        FileHandler.myprint('Exiting from training early')
    t10 = time.time()
    FileHandler.myprint('Total time: %d (seconds)' % (t10 - t1))
def fit(self,
        train_iteractions: interactions.MatchInteraction,
        verbose=True,  # for printing out evaluation during training
        topN=10,
        val_interactions: interactions.MatchInteraction = None,
        test_interactions: interactions.MatchInteraction = None):
    """
    Fit the model.

    Trains for ``self._n_iter`` epochs with negative sampling; after each
    epoch (when ``verbose``) evaluates on validation, periodically on test,
    checkpoints the best validation model and applies early stopping.

    Parameters
    ----------
    train_iteractions: :class:`matchzoo.DataPack` The input sequence dataset.
    val_interactions: :class:`matchzoo.DataPack`
    test_interactions: :class:`matchzoo.DataPack`
    """
    self._initialize(train_iteractions)
    best_hit, best_ndcg, best_epoch, test_ndcg, test_hit = 0, 0, 0, 0, 0
    test_results_dict = None
    iteration_counter = 0
    count_patience_epochs = 0

    for epoch_num in range(self._n_iter):
        # ------ Move to here ----------------------------------- #
        self._net.train(True)
        # draw fresh negative samples every epoch
        query_ids, left_contents, left_lengths, \
        doc_ids, right_contents, right_lengths, \
        neg_docs_ids, neg_docs_contents, neg_docs_lens = \
            self._sampler.get_train_instances(train_iteractions, self._num_negative_samples)

        queries, query_content, query_lengths, \
        docs, doc_content, doc_lengths, \
        neg_docs, neg_docs_contents, neg_docs_lens = \
            my_utils.shuffle(query_ids, left_contents, left_lengths,
                             doc_ids, right_contents, right_lengths,
                             neg_docs_ids, neg_docs_contents, neg_docs_lens)
        epoch_loss, total_pairs = 0.0, 0
        t1 = time.time()
        for (minibatch_num,
             (batch_query, batch_query_content, batch_query_len,
              batch_doc, batch_doc_content, batch_docs_lens,
              batch_neg_docs, batch_neg_doc_content, batch_neg_docs_lens)) \
                in enumerate(my_utils.minibatch(queries, query_content, query_lengths,
                                                docs, doc_content, doc_lengths,
                                                neg_docs, neg_docs_contents, neg_docs_lens,
                                                batch_size=self._batch_size)):
            # add idf here... per-token idf weights for the query, when the
            # global TFIDF table has been populated
            query_idfs = None
            if len(TFIDF.get_term_idf()) != 0:
                query_idf_dict = TFIDF.get_term_idf()
                query_idfs = [[query_idf_dict.get(int(word_idx), 0.0)
                               for word_idx in row] for row in batch_query_content]
                query_idfs = torch_utils.gpu(
                    torch.from_numpy(np.array(query_idfs)).float(), self._use_cuda)

            batch_query = my_utils.gpu(torch.from_numpy(batch_query), self._use_cuda)
            batch_query_content = my_utils.gpu(
                torch.from_numpy(batch_query_content), self._use_cuda)
            batch_doc = my_utils.gpu(torch.from_numpy(batch_doc), self._use_cuda)
            batch_doc_content = my_utils.gpu(
                torch.from_numpy(batch_doc_content), self._use_cuda)
            batch_neg_doc_content = my_utils.gpu(
                torch.from_numpy(batch_neg_doc_content), self._use_cuda)

            total_pairs += self._batch_size * self._num_negative_samples
            self._optimizer.zero_grad()
            # NOTE(review): `loss` is only assigned for bpr/hinge/pce/bce —
            # other loss settings would raise NameError below; presumably this
            # fitter is only used with those four. Confirm.
            if self._loss in ["bpr", "hinge", "pce", "bce"]:
                loss = self._get_multiple_negative_predictions_normal(
                    batch_query, batch_query_content,
                    batch_doc, batch_doc_content,
                    batch_neg_docs, batch_neg_doc_content,
                    batch_query_len, batch_docs_lens, batch_neg_docs_lens,
                    self._num_negative_samples, query_idf=query_idfs)
            epoch_loss += loss.item()
            iteration_counter += 1
            # if iteration_counter % 2 == 0: break
            TensorboardWrapper.mywriter().add_scalar(
                "loss/minibatch_loss", loss.item(), iteration_counter)
            loss.backward()
            self._optimizer.step()
        epoch_loss /= float(total_pairs)
        TensorboardWrapper.mywriter().add_scalar("loss/epoch_loss_avg", epoch_loss, epoch_num)
        # print("Number of Minibatches: ", minibatch_num, "Avg. loss of epoch: ", epoch_loss)
        t2 = time.time()
        epoch_train_time = t2 - t1

        if verbose:  # validation after each epoch
            t1 = time.time()
            assert len(val_interactions.unique_queries_test) in \
                KeyWordSettings.QueryCountVal, len(val_interactions.unique_queries_test)
            result_val = self.evaluate(val_interactions, topN)
            hits = result_val["hits"]
            ndcg = result_val["ndcg"]
            t2 = time.time()
            valiation_time = t2 - t1

            # evaluate on test only every self._testing_epochs epochs
            if epoch_num and epoch_num % self._testing_epochs == 0:
                t1 = time.time()
                assert len(test_interactions.unique_queries_test) in \
                    KeyWordSettings.QueryCountTest
                result_test = self.evaluate(test_interactions, topN)
                hits_test = result_test["hits"]
                ndcg_test = result_test["ndcg"]
                t2 = time.time()
                testing_time = t2 - t1
                TensorboardWrapper.mywriter().add_scalar("hit/hit_test", hits_test, epoch_num)
                TensorboardWrapper.mywriter().add_scalar("ndcg/ndcg_test", ndcg_test, epoch_num)
                FileHandler.myprint(
                    '|Epoch %03d | Test hits@%d = %.5f | Test ndcg@%d = %.5f | Testing time: %04.1f(s)'
                    % (epoch_num, topN, hits_test, topN, ndcg_test, testing_time))

            TensorboardWrapper.mywriter().add_scalar("hit/hits_val", hits, epoch_num)
            TensorboardWrapper.mywriter().add_scalar("ndcg/ndcg_val", ndcg, epoch_num)
            FileHandler.myprint(
                '|Epoch %03d | Train time: %04.1f(s) | Train loss: %.3f'
                '| Vad hits@%d = %.5f | Vad ndcg@%d = %.5f | Validation time: %04.1f(s)'
                % (epoch_num, epoch_train_time, epoch_loss, topN, hits, topN, ndcg, valiation_time))

            # checkpoint when validation improves (hits first, ndcg as tiebreak)
            if hits > best_hit or (hits == best_hit and ndcg > best_ndcg):
                # if (hits + ndcg) > (best_hit + best_ndcg):
                count_patience_epochs = 0
                with open(self.saved_model, "wb") as f:
                    torch.save(self._net.state_dict(), f)
                # test_results_dict = result_test
                best_hit, best_ndcg, best_epoch = hits, ndcg, epoch_num
                # test_hit, test_ndcg = hits_test, ndcg_test
            else:
                count_patience_epochs += 1
                if self._early_stopping_patience and \
                        count_patience_epochs > self._early_stopping_patience:
                    FileHandler.myprint(
                        "Early Stopped due to no better performance in %s epochs"
                        % count_patience_epochs)
                    break

        if np.isnan(epoch_loss) or epoch_loss == 0.0:
            raise ValueError('Degenerate epoch loss: {}'.format(epoch_loss))

    FileHandler.myprint("Closing tensorboard")
    TensorboardWrapper.mywriter().close()
    FileHandler.myprint(
        'Best result: | vad hits@%d = %.5f | vad ndcg@%d = %.5f | epoch = %d'
        % (topN, best_hit, topN, best_ndcg, best_epoch))
    FileHandler.myprint_details(json.dumps(test_results_dict, sort_keys=True, indent=2))
def fit_models(args):
    """End-to-end training script for the multimodal (text + visual + ELMo)
    attention network.

    Sets up per-run logging, seeds RNGs, loads ELMo features and optionally
    image features, preprocesses all splits, builds GloVe embeddings, then
    trains via the visual or contextualized fitter depending on
    ``args.use_visual``.
    """
    # one log sub-folder per run, keyed by launch timestamp
    if not os.path.exists(args.log):
        os.mkdir(args.log)
    curr_date = datetime.datetime.now().timestamp()  # seconds
    # folder to store all outputed files of a run
    secondary_log_folder = os.path.join(args.log, "log_results_%s" % (int(curr_date)))
    if not os.path.exists(secondary_log_folder):
        os.mkdir(secondary_log_folder)
    logfolder_result = os.path.join(secondary_log_folder, "%s_result.txt" % int(curr_date))
    FileHandler.init_log_files(logfolder_result)
    settings = json.dumps(vars(args), sort_keys=True, indent=2)
    FileHandler.myprint("Running script " + str(os.path.realpath(__file__)))
    FileHandler.myprint(settings)
    FileHandler.myprint("Setting seed to " + str(args.seed))

    # seed every RNG source and force deterministic cudnn behavior
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False
    if args.cuda:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # invert the id->index json maps so indices can be mapped back to names
    index2queries = dict(
        (y, x) for x, y in json.loads(open(args.query_mapped).read()).items())
    index2docs = dict(
        (y, x) for x, y in json.loads(open(args.article_mapped).read()).items())
    root = args.path
    use_reranking = "reranking" in root
    t1 = time.time()

    # pre-extracted contextualized (ELMo) features for both sides
    elmo_queries_path = os.path.join(args.elmo_feats, "queries_feats.pth")
    elmo_docs_path = os.path.join(args.elmo_feats, "articles_feats.pth")
    elmo_loader = load_data.ElmoLoader(elmo_queries_path, elmo_docs_path,
                                       args.fixed_length_left, args.fixed_length_right)
    load_data_func = elmo_loader.elmo_load_data

    train_pack = load_data_func(root, 'train', prefix=args.dataset)
    valid_pack = load_data_func(root, 'dev', prefix=args.dataset)
    predict_pack = load_data_func(root, 'test', prefix=args.dataset)
    if use_reranking:
        FileHandler.myprint("Using Re-Ranking Dataset..........")
        predict2_hard_pack = load_data_func(root, 'test2_hard', prefix=args.dataset)

    # token-length statistics across the three splits (informational only)
    a = train_pack.left["text_left"].str.lower().str.split().apply(len).max()
    b = valid_pack.left["text_left"].str.lower().str.split().apply(len).max()
    c = predict_pack.left["text_left"].str.lower().str.split().apply(len).max()
    max_query_length = max([a, b, c])
    min_query_length = min([a, b, c])
    a = train_pack.right["text_right"].str.lower().str.split().apply(len).max()
    b = valid_pack.right["text_right"].str.lower().str.split().apply(len).max()
    c = predict_pack.right["text_right"].str.lower().str.split().apply(len).max()
    max_doc_length = max([a, b, c])
    min_doc_length = min([a, b, c])
    FileHandler.myprint("Min query length, " + str(min_query_length) +
                        " Min doc length " + str(min_doc_length))
    FileHandler.myprint("Max query length, " + str(max_query_length) +
                        " Max doc length " + str(max_doc_length))

    if args.use_visual:
        # attach image features to every data pack
        image_loader = load_data.ImagesLoader(
            left_pth_file=args.left_images_features,
            max_num_left_images=args.n_img_in_query,
            right_pth_file=args.right_images_features,
            max_num_right_images=args.n_img_in_doc,
            use_cuda=args.cuda)
        data_packs = [train_pack, valid_pack, predict_pack]
        if use_reranking:
            data_packs.append(predict2_hard_pack)
        image_loader.fit(data_packs)  # memory-intensive (~10Gb RAM)
        train_pack = image_loader.transform(train_pack)
        valid_pack = image_loader.transform(valid_pack)
        predict_pack = image_loader.transform(predict_pack)
        if use_reranking:
            predict2_hard_pack = image_loader.transform(predict2_hard_pack)
        print(image_loader.left_tensor.size(), image_loader.right_tensor.size())

    preprocessor = mz.preprocessors.ElmoPreprocessor(args.fixed_length_left,
                                                     args.fixed_length_right)
    print('parsing data')
    train_processed = preprocessor.fit_transform(train_pack)  # This is a DataPack
    valid_processed = preprocessor.transform(valid_pack)
    predict_processed = preprocessor.transform(predict_pack)

    train_interactions = MatchInteractionVisual(train_processed)
    valid_interactions = MatchInteractionVisual(valid_processed)
    test_interactions = MatchInteractionVisual(predict_processed)
    if use_reranking:
        predict2_processed = preprocessor.transform(predict2_hard_pack)
        predict2_interactions = MatchInteractionVisual(predict2_processed)
    FileHandler.myprint('done extracting')
    t2 = time.time()
    FileHandler.myprint('loading data time: %d (seconds)' % (t2 - t1))
    FileHandler.myprint("Building model")

    print("Loading word embeddings......")
    t1_emb = time.time()
    term_index = preprocessor.context['vocab_unit'].state['term_index']
    glove_embedding = mz.datasets.embeddings.load_glove_embedding(
        dimension=args.word_embedding_size, term_index=term_index)
    embedding_matrix = glove_embedding.build_matrix(term_index)
    # L2-normalize each word vector row
    l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
    embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
    t2_emb = time.time()
    print("Time to load word embeddings......", (t2_emb - t1_emb))

    match_params = {}
    match_params['embedding'] = embedding_matrix
    match_params["embedding_freeze"] = True  # freezing word embeddings
    match_params["fixed_length_left"] = args.fixed_length_left
    match_params["fixed_length_right"] = args.fixed_length_right
    match_params['dropout'] = 0.1
    match_params['filters'] = args.filters
    match_params["conv_layers"] = args.conv_layers
    match_params["filters_count_pacrr"] = args.filters_count_pacrr
    match_params["n_s"] = args.n_s
    match_params["max_ngram"] = args.max_ngram
    match_params["head_cnn_type"] = args.head_cnn_type
    match_params["use_visual"] = args.use_visual
    match_params["use_average_dcompositional_att"] = args.use_average_dcompositional_att
    match_params["attention_type"] = args.attention_type
    # contextualized part
    match_params["left_elmo_tensor"] = elmo_loader.left_tensor_feats
    match_params["right_elmo_tensor"] = elmo_loader.right_tensor_feats
    match_params["elmo_vec_size"] = 1024

    if args.use_visual:
        match_params["visual_feature_size"] = image_loader.visual_features_size
        # move the full image tensors to GPU once, up front
        image_loader.left_tensor = torch_utils.gpu(image_loader.left_tensor, args.cuda)
        image_loader.right_tensor = torch_utils.gpu(image_loader.right_tensor, args.cuda)
        match_params["full_left_images_tensor"] = image_loader.left_tensor
        match_params["full_right_images_tensor"] = image_loader.right_tensor

    match_model = multimodal_attention_network.MultiModalAttentionNetwork(match_params)
    FileHandler.myprint("Fitting Model")
    if args.use_visual:
        FileHandler.myprint("Using both Textual and Visual features.......")
        fit_model = fitter.VisualFitter(net=match_model,
                                        loss=args.loss_type,
                                        n_iter=args.epochs,
                                        batch_size=args.batch_size,
                                        learning_rate=args.lr,
                                        early_stopping=args.early_stopping,
                                        use_cuda=args.cuda,
                                        num_negative_samples=args.num_neg,
                                        logfolder=secondary_log_folder,
                                        curr_date=curr_date,
                                        use_visual=args.use_visual,
                                        image_loader=image_loader,
                                        index2queries=index2queries,
                                        index2docs=index2docs)
    else:
        FileHandler.myprint("Using Textual content only....")
        fit_model = contextualized_fitter.ContextualizedFitter(
            net=match_model,
            loss=args.loss_type,
            n_iter=args.epochs,
            batch_size=args.batch_size,
            learning_rate=args.lr,
            early_stopping=args.early_stopping,
            use_cuda=args.cuda,
            num_negative_samples=args.num_neg,
            logfolder=secondary_log_folder,
            curr_date=curr_date)

    try:
        fit_model.fit(train_interactions,
                      verbose=True,
                      topN=args.topk,
                      val_interactions=valid_interactions,
                      test_interactions=test_interactions)
        fit_model.load_best_model(valid_interactions, test_interactions, topN=args.topk)
        if use_reranking:
            fit_model.load_best_model_test2_test3(predict2_interactions, None, topN=args.topk)
    except KeyboardInterrupt:
        # allow manual interruption without losing the log tail
        FileHandler.myprint('Exiting from training early')
    t10 = time.time()
    FileHandler.myprint('Total time: %d (seconds)' % (t10 - t1))
def fit_models(args):
    """Run a BM25 baseline end to end.

    Sets up a per-run log directory, loads train/dev/test packs, logs the
    min/max token lengths of queries and documents, optionally prepares the
    hard reranking test sets, and fits a ``BM25Fitter``.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed CLI arguments (paths, dataset prefix, BM25 ``b``/``k1``,
        ``topk``, ``reranking`` flag, ...).
    """
    if not os.path.exists(args.log):
        os.mkdir(args.log)
    curr_date = datetime.datetime.now().timestamp()  # seconds
    # folder to store all outputed files of a run
    secondary_log_folder = os.path.join(args.log,
                                        "log_results_%s" % (int(curr_date)))
    if not os.path.exists(secondary_log_folder):
        os.mkdir(secondary_log_folder)

    logfolder_result = os.path.join(secondary_log_folder,
                                    "%s_result.txt" % int(curr_date))
    FileHandler.init_log_files(logfolder_result)
    settings = json.dumps(vars(args), sort_keys=True, indent=2)
    FileHandler.myprint("Running script " + str(os.path.realpath(__file__)))
    FileHandler.myprint(settings)

    root = args.path
    train_pack = load_data.load_data2(root, 'train', prefix=args.dataset)
    valid_pack = load_data.load_data2(root, 'dev', prefix=args.dataset)
    predict_pack = load_data.load_data2(root, 'test', prefix=args.dataset)

    def _token_len_extremes(column):
        # Longest/shortest whitespace-token count of `column` across all packs.
        per_pack = [
            pack[column].str.lower().str.split().apply(len).max()
            for pack in (train_pack, valid_pack, predict_pack)
        ]
        return max(per_pack), min(per_pack)

    max_query_length, min_query_length = _token_len_extremes("text_left")
    max_doc_length, min_doc_length = _token_len_extremes("text_right")
    FileHandler.myprint("Min query length, " + str(min_query_length) +
                        " Min doc length " + str(min_doc_length))
    FileHandler.myprint("Max query length, " + str(max_query_length) +
                        " Max doc length " + str(max_doc_length))

    t1 = time.time()
    dev_queries = get_query_docs(valid_pack)
    test_queries = get_query_docs(predict_pack)
    additional_data = {}
    if args.reranking:
        # Hard reranking splits are loaded only on demand.
        predict2_hard_pack = load_data.load_data2(root, 'test2_hard',
                                                  prefix=args.dataset)
        predict3_hard_pack = load_data.load_data2(root, 'test3_hard',
                                                  prefix=args.dataset)
        additional_data[KeyWordSettings.Test2Hard] = get_query_docs(predict2_hard_pack)
        additional_data[KeyWordSettings.Test3Hard] = get_query_docs(predict3_hard_pack)
    FileHandler.myprint('done extracting')
    t2 = time.time()
    FileHandler.myprint('loading data time: %d (seconds)' % (t2 - t1))

    params = {"b": args.b, "k1": args.k1}  # BM25 hyper-parameters
    FileHandler.myprint("Fitting Model")
    fit_model = bm25_fit.BM25Fitter(params)
    try:
        fit_model.fit(None,
                      verbose=True,
                      topN=args.topk,
                      val_queries=dev_queries,
                      test_queries=test_queries,
                      **additional_data)
    except KeyboardInterrupt:
        # Keep the timing summary even when training is interrupted.
        FileHandler.myprint('Exiting from training early')
    t10 = time.time()
    FileHandler.myprint('Total time: %d (seconds)' % (t10 - t1))
def fit(
        self,
        train_iteractions: interactions.MatchInteraction,
        verbose=True,  # for printing out evaluation during training
        val_interactions: interactions.MatchInteraction = None,
        test_interactions: interactions.MatchInteraction = None):
    """
    Fit the model.

    Runs up to ``self._n_iter`` epochs of pairwise training.  Each epoch
    samples and shuffles query/document instances, iterates minibatches
    (loss -> backward -> gradient clipping -> optimizer step), then, when
    ``verbose`` is on, evaluates on the validation set, checkpoints the
    best model by validation cross-entropy, and applies patience-based
    early stopping.

    Parameters
    ----------
    train_iteractions: :class:`matchzoo.DataPack`
        The input sequence dataset.
    val_interactions: :class:`matchzoo.DataPack`
        Validation set; evaluated after every epoch when ``verbose`` is True.
    test_interactions: :class:`matchzoo.DataPack`
        Accepted for interface compatibility; not evaluated in this body.

    Raises
    ------
    ValueError
        If an epoch's average loss is NaN or exactly 0.0 (degenerate run).
    """
    self._initialize()
    # Track the best (lowest) validation cross-entropy seen so far.
    best_ce, best_epoch, test_ce = sys.maxsize, 0, 0
    test_results_dict = None
    iteration_counter = 0  # global minibatch counter for tensorboard x-axis
    count_patience_epochs = 0  # consecutive epochs without improvement
    for epoch_num in range(self._n_iter):

        # ------ Move to here ----------------------------------- #
        self._net.train(True)  # ensure training mode (e.g. dropout on)
        # Draw (query, positive doc, negative doc target) training instances
        # for this epoch, then shuffle them in unison.
        query_ids, left_contents, left_lengths, \
        doc_ids, right_contents, target_contents, right_lengths = self._sampler.get_instances(train_iteractions)

        queries, query_content, query_lengths, \
        docs, doc_content, target_contents, doc_lengths = my_utils.shuffle(query_ids,
                                                                           left_contents,
                                                                           left_lengths,
                                                                           doc_ids,
                                                                           right_contents,
                                                                           target_contents,
                                                                           right_lengths)
        epoch_loss, total_pairs = 0.0, 0
        t1 = time.time()
        for (minibatch_num,
             (batch_query, batch_query_content, batch_query_len,
              batch_doc, batch_doc_content, batch_doc_target, batch_docs_lens)) \
                in enumerate(my_utils.minibatch(queries, query_content, query_lengths,
                                                docs, doc_content, target_contents, doc_lengths,
                                                batch_size = self._batch_size)):
            t3 = time.time()
            # Move tensors to GPU when enabled; lengths stay on CPU
            # (the commented-out transfers below were deliberately disabled).
            batch_query = my_utils.gpu(torch.from_numpy(batch_query), self._use_cuda)
            batch_query_content = my_utils.gpu(torch.from_numpy(batch_query_content), self._use_cuda)
            # batch_query_len = my_utils.gpu(torch.from_numpy(batch_query_len), self._use_cuda)
            batch_doc = my_utils.gpu(torch.from_numpy(batch_doc), self._use_cuda)
            batch_doc_content = my_utils.gpu(torch.from_numpy(batch_doc_content), self._use_cuda)
            batch_doc_target = my_utils.gpu(torch.from_numpy(batch_doc_target), self._use_cuda)
            # batch_docs_lens = my_utils.gpu(torch.from_numpy(batch_docs_lens), self._use_cuda)

            total_pairs += batch_query.size(0)  # (batch_size)
            self._optimizer.zero_grad()
            loss = self._get_loss(batch_query, batch_query_content,
                                  batch_doc, batch_doc_content,
                                  batch_query_len, batch_docs_lens, batch_doc_target)
            epoch_loss += loss.item()
            iteration_counter += 1
            # if iteration_counter % 2 == 0: break
            TensorboardWrapper.mywriter().add_scalar("loss/minibatch_loss",
                                                     loss.item(), iteration_counter)
            # Standard update: backprop, clip gradient norm, step.
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self._net.parameters(), self._clip)
            self._optimizer.step()
            t4 = time.time()
            # if iteration_counter % 100 == 0: print("Running time for each mini-batch: ", (t4 - t3), "s")
        # Average loss per training pair for this epoch.
        epoch_loss /= float(total_pairs)
        TensorboardWrapper.mywriter().add_scalar("loss/epoch_loss_avg",
                                                 epoch_loss, epoch_num)
        # print("Number of Minibatches: ", minibatch_num, "Avg. loss of epoch: ", epoch_loss)
        t2 = time.time()
        epoch_train_time = t2 - t1
        if verbose:  # validation after each epoch
            t1 = time.time()
            result_val = self.evaluate(val_interactions)
            val_ce = result_val["cross_entropy"]
            t2 = time.time()
            validation_time = t2 - t1
            TensorboardWrapper.mywriter().add_scalar("cross_entropy/val_ce",
                                                     val_ce, epoch_num)
            FileHandler.myprint('|Epoch %03d | Train time: %04.1f(s) | Train loss: %.3f'
                                '| Val loss = %.5f | Validation time: %04.1f(s)'
                                % (epoch_num, epoch_train_time, epoch_loss,
                                   val_ce, validation_time))
            if val_ce < best_ce:
                # New best validation loss: reset patience and checkpoint weights.
                count_patience_epochs = 0
                with open(self.saved_model, "wb") as f:
                    torch.save(self._net.state_dict(), f)
                # test_results_dict = result_test
                best_ce, best_epoch = val_ce, epoch_num
            else:
                count_patience_epochs += 1
            if self._early_stopping_patience and count_patience_epochs > self._early_stopping_patience:
                FileHandler.myprint("Early Stopped due to no better performance in %s epochs"
                                    % count_patience_epochs)
                break
        # Guard against a collapsed/diverged run.
        if np.isnan(epoch_loss) or epoch_loss == 0.0:
            raise ValueError('Degenerate epoch loss: {}'.format(epoch_loss))
    FileHandler.myprint("Closing tensorboard")
    TensorboardWrapper.mywriter().close()
    FileHandler.myprint('Best result: | vad cross_entropy = %.5f | epoch = %d'
                        % (best_ce, best_epoch))
    # NOTE(review): test_results_dict is never assigned (the assignment above
    # is commented out), so this dumps "null" — confirm whether that is intended.
    FileHandler.myprint_details(json.dumps(test_results_dict, sort_keys=True, indent=2))