    def _load_test_recall_data(self, load_with_file_path=False):
        if load_with_file_path:
            self._test_recall_source_labels, self._test_recall_source_indx, self._test_recall_source_embeddings, \
            self._test_recall_target_embeddings, self._test_recall_all_target_embeddings, \
            self._test_recall_source_padded, self._test_recall_source_length = self._load_data_path('test_subset_recall')
            self._temp_test_recall_source_labels = UTIL.load_embeddings(self._test_recall_source_labels)
        else:
            self._test_recall_source_labels, self._test_recall_source_indx, self._test_recall_source_embeddings, \
            self._test_recall_target_embeddings, self._test_recall_all_target_embeddings, \
            self._test_recall_source_padded, self._test_recall_source_length = self._load_data('test_subset_recall')
            self._temp_test_recall_source_labels = self._test_recall_source_labels

        self._test_recall_baseline_source_embeddings = self._test_recall_source_embeddings

        if self.params.model['model_type'].lower() == 'conv':
            tokenized_documents = self._obtain_tokenized_documents(self._test_recall_source_indx)
            self._test_recall_source_embeddings, self._test_recall_source_embeddings_lengths = self._pad_documents(tokenized_documents)
            if load_with_file_path:
                UTIL.dump_embeddings(self._test_recall_source_embeddings, self._test_recall_source_padded)
                UTIL.dump_embeddings(self._test_recall_source_embeddings_lengths,
                                     self._test_recall_source_length,
                                     dtype="int32")
                self._test_recall_source_embeddings, self._test_recall_source_embeddings_lengths = \
                    self._test_recall_source_padded, self._test_recall_source_length
        else:
            self._test_recall_source_embeddings_lengths = np.zeros(
                [self._temp_test_recall_source_labels.shape[0], 1])
            if load_with_file_path:
                UTIL.dump_embeddings(self._test_recall_source_embeddings_lengths,
                                     self._test_recall_source_length,
                                     dtype="int32")
                self._test_recall_source_embeddings_lengths = self._test_recall_source_length
    def _load_predict_data(self, load_with_file_path=False):
        if load_with_file_path:
            self._source_embeddings = os.path.join(
                self.base_path, self.params.files['prediction']['source_embeddings'])
            self._source_padded = os.path.join(
                self.base_path, self.params.files['prediction']['source_padded'])
            self._source_length = os.path.join(
                self.base_path, self.params.files['prediction']['source_length'])
            self._baseline_source_embeddings = self._source_embeddings
            self._temp_source_embeddings = UTIL.load_embeddings(self._source_embeddings)
        else:
            self._source_embeddings = UTIL.load_embeddings(
                os.path.join(self.base_path,
                             self.params.files['prediction']['source_embeddings']))
            self._source_padded = None
            self._source_length = None
            self._baseline_source_embeddings = self._source_embeddings
            self._temp_source_embeddings = self._source_embeddings

        if self.KN_FILE_NAMES['DIR'].lower().startswith('qu'):
            tokenized_documents = self._tokenized_questions
        else:
            tokenized_documents = self._tokenized_paragraphs

        if self.params.model['model_type'].lower() == 'conv':
            self._source_embeddings, self._source_embeddings_lengths = self._pad_documents(tokenized_documents)
            if load_with_file_path:
                UTIL.dump_embeddings(self._source_embeddings, self._source_padded)
                UTIL.dump_embeddings(self._source_embeddings_lengths,
                                     self._source_length,
                                     dtype="int32")
                self._source_embeddings, self._source_embeddings_lengths = self._source_padded, self._source_length
        else:
            self._source_embeddings_lengths = np.zeros([self._temp_source_embeddings.shape[0], 1])
            if load_with_file_path:
                UTIL.dump_embeddings(self._source_embeddings_lengths,
                                     self._source_length,
                                     dtype="int32")
                self._source_embeddings_lengths = self._source_length
    def _load_train_data(self, load_with_file_path=False):
        if load_with_file_path:
            ## LOAD WITH FILE PATHS
            self._train_source_labels, self._train_source_indx, self._train_source_embeddings, \
            self._train_target_embeddings, self._train_all_target_embeddings, \
            self._train_source_padded, self._train_source_length = self._load_data_path('train_loss')
            self._temp_train_source_labels = UTIL.load_embeddings(self._train_source_labels)
        else:
            ## LOAD WITH ACTUAL DATA
            self._train_source_labels, self._train_source_indx, self._train_source_embeddings, \
            self._train_target_embeddings, self._train_all_target_embeddings, \
            self._train_source_padded, self._train_source_length = self._load_data('train_loss')
            self._temp_train_source_labels = self._train_source_labels

        self._train_baseline_source_embeddings = self._train_source_embeddings

        if self.params.model['model_type'].lower() == 'conv':
            tokenized_documents = self._obtain_tokenized_documents(self._train_source_indx)
            self._train_source_embeddings, self._train_source_embeddings_lengths = self._pad_documents(tokenized_documents)
            if load_with_file_path:
                ## SAVE self._train_source_embeddings, self._train_source_embeddings_lengths
                ## SO THAT THEY CAN BE RELOADED FROM FILE
                UTIL.dump_embeddings(self._train_source_embeddings, self._train_source_padded)
                UTIL.dump_embeddings(self._train_source_embeddings_lengths,
                                     self._train_source_length,
                                     dtype="int32")
                self._train_source_embeddings, self._train_source_embeddings_lengths = \
                    self._train_source_padded, self._train_source_length
        else:
            self._train_source_embeddings_lengths = np.zeros(
                [self._temp_train_source_labels.shape[0], 1])
            if load_with_file_path:
                ## SAVE self._train_source_embeddings_lengths SO THAT IT CAN BE RELOADED FROM FILE
                UTIL.dump_embeddings(self._train_source_embeddings_lengths,
                                     self._train_source_length,
                                     dtype="int32")
                self._train_source_embeddings_lengths = self._train_source_length
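    # A minimal usage sketch (hypothetical; the surrounding class and constructor are
    # assumptions, not part of the documented API). The three loaders above differ only
    # in which split they read and which attributes they populate:
    #
    #   builder = DataBuilder(params, base_path)                   # hypothetical name
    #   builder._load_train_data(load_with_file_path=True)         # 'train_loss' split
    #   builder._load_test_recall_data(load_with_file_path=True)   # 'test_subset_recall' split
    #   builder._load_predict_data(load_with_file_path=True)       # prediction inputs
    #
    # With load_with_file_path=True the padded embeddings/lengths are written out via
    # UTIL.dump_embeddings and the attributes end up holding file paths; otherwise they
    # hold the in-memory numpy arrays.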
signature="default", as_dict=True)['elmo']) # for i, each_document in enumerate(tqdm(tokenized[begin_index:end_index], # total=len(tokenized[begin_index:end_index])), begin_index): for doc_index, embed_document in enumerate( enumerate(documents[begin_index:end_index]), begin_index): try: embed_index, each_document = embed_document _begining = 0 _ending = len(tokenized_documents[doc_index]) _d1 = d1[embed_index, _begining:_ending, :] _d1 = np.expand_dims(_d1, axis=1) UTIL.dump_embeddings( _d1, embedding_file.replace('@', 'LSTM1_' + str(doc_index))) _d2 = d2[embed_index, _begining:_ending, :] _d2 = np.expand_dims(_d2, axis=1) UTIL.dump_embeddings( _d2, embedding_file.replace('@', 'LSTM2_' + str(doc_index))) _delmo = delmo[embed_index, _begining:_ending, :] _delmo = np.expand_dims(_delmo, axis=1) UTIL.dump_embeddings( _delmo, embedding_file.replace('@', 'ELMO_' + str(doc_index))) except Exception as ex: print(ex) print('End of documents')
def execute_non_conv_pipeline(params, base_data_path, config, tf, databuilder):
    estimator = tf.estimator.Estimator(model_fn, params=params, config=config)
    if not params.executor["is_prediction"]:
        _train_input_fn = lambda: databuilder.train_input_fn()
        if params.executor["recall_calculation_for"] == 'test':
            _recall_input_fn = lambda: databuilder.test_recall_input_fn()
        else:
            _recall_input_fn = lambda: databuilder.train_recall_input_fn()

        if not params.executor["is_debug_mode"]:
            # -------------------------------------
            # Train the model
            # -------------------------------------
            tf.logging.info("Starting training for {} epoch(s).".format(params.model["num_epochs"]))
            tf.logging.info("Train loss on train set with a size of {}".format(
                params.files["splitter"]["train_size"]))

            # -------------------------------------
            # First Train
            # -------------------------------------
            # estimator.train(_train_input_fn, max_steps=1)

            # -------------------------------------
            # Train and Test : Train
            # -------------------------------------
            # train_spec = tf.estimator.TrainSpec(_train_input_fn,
            #                                     max_steps=params.model['num_epochs'] * math.ceil(
            #                                         params.files['splitter']['train_size'] / params.model['batch_size']))
            train_spec = tf.estimator.TrainSpec(_train_input_fn)

            # -------------------------------------
            # Test the model
            # -------------------------------------
            tf.logging.info("Evaluation loss and recall loss on test set.")
            tf.logging.info("Evaluation loss on test set with a size of {}".format(
                params.files["splitter"]["test_size"]))
            tf.logging.info("Evaluation recall loss on test set with a size of {}".format(
                params.files["splitter"]["test_subset_size"]))

            # -------------------------------------
            # Baseline Eval for Initial model
            # -------------------------------------

            # -------------------------------------
            # Then Test
            # -------------------------------------
            # estimator.evaluate(_recall_input_fn)

            # -------------------------------------
            # Train and Test : Test
            # -------------------------------------
            test_spec = tf.estimator.EvalSpec(_recall_input_fn)  # steps=params.model['num_epochs']

            # -------------------------------------
            # Train and Test
            # -------------------------------------
            tf.estimator.train_and_evaluate(estimator, train_spec, test_spec)

            if params.executor["is_prediction_during_training"]:
                predictions = estimator.predict(lambda: databuilder.predict_input_fn())
                predictions = np.array(list(predictions))
                dump_embeddings(predictions,
                                os.path.join(base_data_path,
                                             "improved_" + params.files["prediction"]["source_embeddings"]))
        else:
            tf.logging.info(10 * '*')
            tf.logging.info("Starting debugging for {} epoch(s).".format(params.model["num_epochs"]))
            data_dict = {}
            data_dict['epochs'] = params.model["num_epochs"]
            for ep in range(1, params.model["num_epochs"] + 1):
                tf.logging.info("-------> Epoch: {}".format(ep))
                # if ep < 1:
                #     estimator.train(_train_input_fn, max_steps=1)
                # else:
                estimator.train(_train_input_fn)
                tf.logging.info("-------> Epoch: {} Train is completed".format(ep))
                result_as_dict_list = estimator.predict(_recall_input_fn)
                tf.logging.info("-------> Epoch: {} Predict is completed".format(ep))
                data_dict = prepare_dict_to_print(result_as_dict_list, data_dict, ep)
                tf.logging.info("-------> Epoch: {} data_dict is completed".format(ep))
                save_as_pickle(data_dict, os.path.join(model_save_path, 'debug_dict.pkl'))
                save_as_shelve(data_dict, os.path.join(model_save_path, 'debug_dict.slv'))
                tf.logging.info('Dict objs are saved.')
            tf.logging.info(10 * '*')
    # -------------------------------------
    # Prediction
    # -------------------------------------
    else:
        # ALL PARAMETERS SHOULD BE SET SAME WITH THE SAVED MODEL :(
        # I AM GOING TO HANDLE HOW TO SAVE IT WITH THE PARAM CONFS.
        predictions = estimator.predict(lambda: databuilder.predict_input_fn())
        predictions = np.array(list(predictions))
        dump_embeddings(predictions,
                        os.path.join(base_data_path,
                                     "improved_" + params.files["prediction"]["source_embeddings"]))
def process_documents(partition, document_partition_size, checkpoint, jsons,
                      tokenized_document_size, file_names, file_folder_path,
                      ind_layer, conc_layers, all_tokens,
                      contextualized_questions_with_token_file_path, dictionary_path):
    embeddings = None
    start = partition
    end = (partition + document_partition_size) if document_partition_size is not None else tokenized_document_size
    local_tokens = []
    for indx in tqdm(range(start, end)):
        if tokenized_document_size > indx:
            bert_index = indx + 1
            file_name, remaining_index_to_pass_this_file = find_file_name(bert_index, file_names)
            if remaining_index_to_pass_this_file >= 0:
                jsons = UTIL.load_bert_jsons_from_single_file(
                    os.path.join(file_folder_path, file_name))
                if indx > 0:
                    checkpoint = indx
            if checkpoint is not None:
                indx = indx - checkpoint
            new_token = []
            token_embeddings = None
            for line_index, json in UTIL.reversedEnumerate(jsons[indx]):
                # Token indexes 0 and -1 belong to [CLS] and [SEP]; we are ignoring them.
                json['features'].pop(0)
                json['features'].pop(-1)
                # Filter out the non-contributing "##" word-piece tokens from the list.
                features = [x for x in json['features'] if not x['token'].startswith("##")]
                for feature_index, feature in UTIL.reversedEnumerate(features):
                    if line_index > 0 and feature_index < args.window_length:
                        # print(feature['token'])
                        continue
                    if args.ind_layer is not None:
                        token_embedding = np.array([
                            l['values'] for l in feature['layers'] if l['index'] == ind_layer
                        ])
                    else:
                        token_embedding = np.concatenate([
                            l['values'] for l in feature['layers'] if l['index'] in conc_layers
                        ])
                    if token_embeddings is None:
                        token_embeddings = token_embedding
                    else:
                        token_embeddings = np.vstack((token_embeddings, token_embedding))
                    new_token.append(feature['token'])
            if len(new_token) != token_embeddings.shape[0]:
                print(30 * '*')
                print('********** Token embeddings size mismatch for document {} (checkpoint {}) **********'
                      .format(indx, checkpoint))
                print(30 * '*')
            all_tokens.append(new_token)
            local_tokens.append(new_token)
            # TOKEN DEBUGGING
            # print("*" * 25)
            # print("Sub Token size in dictionary for the document {} in the partition: {}".format(indx, len(new_token)))
            # print("Total Token size in dictionary after the document {} in the partition: {}".format(indx, sum([len(sentence) for sentence in all_tokens])))
            # print("*" * 25)
            if embeddings is None:
                embeddings = token_embeddings
            else:
                embeddings = np.vstack((embeddings, token_embeddings))
    print('embeddings shape: {}'.format(embeddings.shape))
    UTIL.dump_embeddings(embeddings, contextualized_questions_with_token_file_path)
    UTIL.save_as_pickle(local_tokens, dictionary_path)
    print('embeddings are dumped')
    return jsons, checkpoint, embeddings.shape

## TOKEN DEBUGGING
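# A minimal sketch of the per-line BERT JSON record that process_documents() consumes
# (the field names follow what the code above reads; the concrete tokens and values are
# illustrative assumptions):
#
#   {"features": [
#       {"token": "[CLS]", "layers": [{"index": -1, "values": [0.12, ...]}, ...]},
#       {"token": "why",   "layers": [{"index": -1, "values": [0.05, ...]}, ...]},
#       {"token": "[SEP]", "layers": [{"index": -1, "values": [0.00, ...]}, ...]}]}
#
# [CLS]/[SEP] are dropped, "##"-prefixed word pieces are filtered out, and each kept
# token contributes either the single layer selected by args.ind_layer or the
# concatenation of the layers listed in args.conc_layers.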
def main(args):
    ################ CONFIGURATIONS #################
    squad_formatted_file = os.path.join(args.data_path, args.squad_formatted_file)
    bert_extension = ".json"
    file_name_splitter = '_'
    document_embeddings = None
    questions_folder_path = os.path.join(args.data_path, 'questions')
    paragraphs_folder_path = os.path.join(args.data_path, 'paragraphs')
    new_question_tokens_path = os.path.join(args.data_path, 'questions_tokens@@.pkl')
    new_paragraph_tokens_path = os.path.join(args.data_path, 'paragraphs_tokens@@.pkl')
    calculated_token_embeddings_file_path = os.path.join(
        args.data_path, 'contextualized_document_embeddings_with_token_##_@@.hdf5')
    vocab_path = os.path.join(args.data_path, 'wordpiece_vocab.txt')
    ind_layer = None
    conc_layers = None
    test_size = None
    if args.test_size is not None:
        test_size = [int(x) for x in args.test_size.split(",")]
    if args.ind_layer is not None:
        ind_layer = int(args.ind_layer)
        contextualized_questions_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_questions_embeddings_with_tokens_{}_layers_@@.hdf5".format(args.ind_layer))
        contextualized_paragraphs_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_paragraphs_embeddings_with_tokens_{}_layers_@@.hdf5".format(args.ind_layer))
        contextualized_document_embeddings_with_token_path = os.path.join(
            args.data_path,
            "contextualized_document_embeddings_with_token_{}_layers.hdf5".format(args.ind_layer))
        final_questions_file_path = os.path.join(
            args.data_path,
            "question_document_embeddings_{}_layers_@@.hdf5".format(args.ind_layer))
        final_paragraphs_file_path = os.path.join(
            args.data_path,
            "paragraph_document_embeddings_{}_layers_@@.hdf5".format(args.ind_layer))
    else:
        conc_layers = [int(x) for x in args.conc_layers.split(",")]
        contextualized_questions_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_questions_embeddings_with_tokens_{}_layers_@@.hdf5".format(conc_layers))
        contextualized_paragraphs_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_paragraphs_embeddings_with_tokens_{}_layers_@@.hdf5".format(conc_layers))
        contextualized_document_embeddings_with_token_path = os.path.join(
            args.data_path,
            "contextualized_document_embeddings_with_token_{}_layers.hdf5".format(conc_layers))
        final_questions_file_path = os.path.join(
            args.data_path,
            "question_document_embeddings_{}_layers_@@.hdf5".format(conc_layers))
        final_paragraphs_file_path = os.path.join(
            args.data_path,
            "paragraph_document_embeddings_{}_layers_@@.hdf5".format(conc_layers))
    if args.ind_layer is None and args.conc_layers is None:
        raise Exception('A layer configuration (ind_layer or conc_layers) must be provided!')
    if args.ind_layer is not None and args.conc_layers is not None:
        raise Exception('Only one layer configuration (ind_layer or conc_layers) may be provided!')
    # ################ CONFIGURATIONS #################

    """
    ******************************************************************************************************************
    START: PARSING FILE
    ******************************************************************************************************************
    """
    tokenized_questions, tokenized_paragraphs, questions_nontokenized, paragraphs_nontokenized = \
        UTIL.prepare_squad_objects(squad_formatted_file, args.squad_formatted_file)
    """
    ******************************************************************************************************************
    END: PARSING FILE
    ******************************************************************************************************************
    """

    """
    ******************************************************************************************************************
    START: LOAD EMBEDDINGS
    ******************************************************************************************************************
    """
    new_question_tokens = []
    is_questions_already_processed = False
    if os.path.exists(contextualized_questions_with_token_file_path.replace('@@', '')):
        is_questions_already_processed = True
    else:
        file_names = get_file_names(questions_folder_path, file_name_splitter, bert_extension)
        tokenized_questions_size = test_size[0] if test_size is not None else len(tokenized_questions)
        checkpoint = None
        jsons = None
        if args.is_parititioned is True:
            partition_counter = 0
            for _p_counter in tqdm(range(0, tokenized_questions_size, args.document_partition_size)):
                print("Partition {} is running for writing questions".format(partition_counter))
                if not os.path.exists(
                        contextualized_questions_with_token_file_path.replace('@@', str(partition_counter))):
                    # TOKEN DEBUGGING
                    # tokens_size_before_partition = sum([len(sentence) for sentence in new_question_tokens])
                    jsons, checkpoint, partition_shape = process_documents(
                        _p_counter, args.document_partition_size, checkpoint, jsons,
                        tokenized_questions_size, file_names, questions_folder_path,
                        ind_layer, conc_layers, new_question_tokens,
                        contextualized_questions_with_token_file_path.replace('@@', str(partition_counter)),
                        new_question_tokens_path.replace('@@', str(partition_counter)))
                    # TOKEN DEBUGGING
                    # tokens_size_after_partition = sum([len(sentence) for sentence in new_question_tokens])
                    # if tokens_size_after_partition - tokens_size_before_partition != partition_shape[0]:
                    #     print("*" * 25)
                    #     print("Tokens problem in partition {}, before: {}, after: {}, partition_shape: {}".format(
                    #         _p_counter, tokens_size_before_partition, tokens_size_after_partition, partition_shape[0]))
                    #     print("*" * 25)
                else:
                    new_question_tokens.extend(
                        UTIL.load_from_pickle(
                            new_question_tokens_path.replace('@@', str(partition_counter))))
                partition_counter += 1

            question_embeddings = None
            for _p_counter in tqdm(range(0, partition_counter)):
                print("Partition {} is running for reading questions".format(_p_counter))
                temp_question_embeddings = UTIL.load_embeddings(
                    contextualized_questions_with_token_file_path.replace("@@", str(_p_counter)))
                if question_embeddings is None:
                    question_embeddings = temp_question_embeddings
                else:
                    question_embeddings = np.vstack((question_embeddings, temp_question_embeddings))
                print('MAIN embeddings shape: {}'.format(question_embeddings.shape))
            UTIL.dump_embeddings(
                question_embeddings,
                contextualized_questions_with_token_file_path.replace('@@', ''))
            print('MAIN embeddings are dumped')
        else:
            print("It is running for writing questions")
            jsons, checkpoint, partition_shape = process_documents(
                0, None, checkpoint, jsons, tokenized_questions_size, file_names,
                questions_folder_path, ind_layer, conc_layers, new_question_tokens,
                contextualized_questions_with_token_file_path.replace('@@', ''),
                new_question_tokens_path.replace('@@', ''))
            UTIL.save_as_pickle(new_question_tokens, new_question_tokens_path.replace('@@', ''))

    ## ***************************************************************************************************************
    ## ***************************************************************************************************************
    ## ***************************************************************************************************************

    new_paragraph_tokens = []
    is_paragraphs_already_processed = False
    if os.path.exists(contextualized_paragraphs_with_token_file_path.replace('@@', '')):
        is_paragraphs_already_processed = True
    else:
        file_names = get_file_names(paragraphs_folder_path, file_name_splitter, bert_extension)
        tokenized_paragraphs_size = test_size[1] if test_size is not None else len(tokenized_paragraphs)
        checkpoint = None
        jsons = None
        if args.is_parititioned is True:
            partition_counter = 0
            for _p_counter in tqdm(range(0, tokenized_paragraphs_size, args.document_partition_size)):
                print("Partition {} is running for writing paragraphs".format(partition_counter))
                if not os.path.exists(
                        contextualized_paragraphs_with_token_file_path.replace('@@', str(partition_counter))):
                    # tokens_size_before_partition = sum([len(sentence) for sentence in new_paragraph_tokens])
                    jsons, checkpoint, partition_shape = process_documents(
                        _p_counter, args.document_partition_size, checkpoint, jsons,
                        tokenized_paragraphs_size, file_names, paragraphs_folder_path,
                        ind_layer, conc_layers, new_paragraph_tokens,
                        contextualized_paragraphs_with_token_file_path.replace('@@', str(partition_counter)),
                        new_paragraph_tokens_path.replace('@@', str(partition_counter)))
                else:
                    new_paragraph_tokens.extend(
                        UTIL.load_from_pickle(
                            new_paragraph_tokens_path.replace('@@', str(partition_counter))))
                partition_counter += 1
                # TOKEN DEBUGGING
                # tokens_size_after_partition = sum([len(sentence) for sentence in new_paragraph_tokens])
                # if tokens_size_after_partition - tokens_size_before_partition != partition_shape[0]:
                #     print("*" * 25)
                #     print("Tokens problem in partition {}, before: {}, after: {}, partition_shape: {}".format(
                #         _p_counter, tokens_size_before_partition, tokens_size_after_partition, partition_shape[0]))
                #     print("*" * 25)

            paragraph_embeddings = None
            for _p_counter in tqdm(range(0, partition_counter)):
                print("Partition {} is running for reading paragraphs".format(_p_counter))
                temp_paragraph_embeddings = UTIL.load_embeddings(
                    contextualized_paragraphs_with_token_file_path.replace("@@", str(_p_counter)))
                if paragraph_embeddings is None:
                    paragraph_embeddings = temp_paragraph_embeddings
                else:
                    paragraph_embeddings = np.vstack((paragraph_embeddings, temp_paragraph_embeddings))
                print('MAIN embeddings shape: {}'.format(paragraph_embeddings.shape))
            UTIL.dump_embeddings(
                paragraph_embeddings,
                contextualized_paragraphs_with_token_file_path.replace('@@', ''))
            print('MAIN embeddings are dumped')
        else:
            print("It is running for writing paragraphs")
            jsons, checkpoint, partition_shape = process_documents(
                0, None, checkpoint, jsons, tokenized_paragraphs_size, file_names,
                paragraphs_folder_path, ind_layer, conc_layers, new_paragraph_tokens,
                contextualized_paragraphs_with_token_file_path.replace('@@', ''),
                new_paragraph_tokens_path.replace('@@', ''))
            UTIL.save_as_pickle(new_paragraph_tokens, new_paragraph_tokens_path.replace('@@', ''))

    if is_questions_already_processed:
        question_embeddings = UTIL.load_embeddings(
            contextualized_questions_with_token_file_path.replace('@@', ''))
        new_question_tokens = UTIL.load_from_pickle(new_question_tokens_path.replace('@@', ''))
    if is_paragraphs_already_processed:
        paragraph_embeddings = UTIL.load_embeddings(
            contextualized_paragraphs_with_token_file_path.replace('@@', ''))
        new_paragraph_tokens = UTIL.load_from_pickle(new_paragraph_tokens_path.replace('@@', ''))

    if os.path.exists(contextualized_document_embeddings_with_token_path):
        if args.is_parititioned is not True:
            document_embeddings = UTIL.load_embeddings(contextualized_document_embeddings_with_token_path)
    else:
        document_embeddings = np.vstack((question_embeddings, paragraph_embeddings))
        UTIL.dump_embeddings(document_embeddings, contextualized_document_embeddings_with_token_path)
        del question_embeddings
        del paragraph_embeddings
        print('All Documents are dumped')
    """
    ******************************************************************************************************************
    END: LOAD EMBEDDINGS
    ******************************************************************************************************************
    """
    document_embedding_guideline, corpus_as_tokens = UTIL.generate_document_embedding_guideline(
        new_question_tokens, new_paragraph_tokens)

    paragraphs_nontokenized = [" ".join(context) for context in new_paragraph_tokens]
    questions_nontokenized = [" ".join(context) for context in new_question_tokens]

    """
    ******************************************************************************************************************
    START: IDF
    ******************************************************************************************************************
    """
    if args.is_inject_idf:
        print('IDF is going to be calculated')
        # vocab = []
        # for sentence in new_question_tokens + new_paragraph_tokens:
        #     for word in sentence:
        #         vocab.append(word)
        # vocab = set(vocab)
        # UTIL.dump_vocab(vocab_path, vocab)
        # tokenize = wordpiece.FullTokenizer(vocab_file=vocab_path, do_lower_case=False)
        nlp = spacy.blank("en")
        tokenize = lambda doc: [token.text for token in nlp(doc)]
        start = datetime.datetime.now()
        token2idfweight, idf_vec = UTIL.transform_to_idf_weigths(
            new_question_tokens, new_paragraph_tokens, tokenize,
            questions_nontokenized, paragraphs_nontokenized)
        if args.is_parititioned is True:
            with h5py.File(contextualized_document_embeddings_with_token_path, 'r') as fin:
                partition_counter = 0
                for partition in range(0, idf_vec.shape[0], args.token_partition_size):
                    partition_counter += 1
                    temp_doc_embeddings = fin['embeddings'][partition:partition + args.token_partition_size, :]
                    temp_idf_vec = idf_vec[partition:partition + args.token_partition_size, :].reshape(-1, 1)
                    # temp_doc_embeddings = temp_doc_embeddings[:, 0, :]
                    # temp_doc_embeddings = preprocessing.normalize(temp_doc_embeddings, norm='l2')
                    temp_weighted_token_embeddings = np.multiply(temp_idf_vec, temp_doc_embeddings)
                    UTIL.dump_embeddings(
                        temp_weighted_token_embeddings,
                        calculated_token_embeddings_file_path.replace('@@', str(partition_counter)).replace('##', 'idf'))
                    print("Partition {} is completed and processed {} - {} tokens".format(
                        partition_counter, partition, partition + args.token_partition_size))
        else:
            idf_vec = idf_vec.reshape(-1, 1)
            weighted_token_embeddings = np.multiply(idf_vec, document_embeddings)
        del idf_vec
        del token2idfweight
        end = datetime.datetime.now()
        print('IDF calculation is ended in {} minutes'.format((end - start).seconds / 60))
    else:
        print('IDF is skipped')
        _type = 'only'
        if args.is_parititioned is True:
            with h5py.File(contextualized_document_embeddings_with_token_path, 'r') as fin:
                partition_counter = 0
                for partition in range(0, len(corpus_as_tokens), args.token_partition_size):
                    partition_counter += 1
                    temp_doc_embeddings = fin['embeddings'][partition:partition + args.token_partition_size, :]
                    # temp_doc_embeddings = temp_doc_embeddings[:, 0, :]
                    # temp_doc_embeddings = preprocessing.normalize(temp_doc_embeddings, norm='l2')
                    UTIL.dump_embeddings(
                        temp_doc_embeddings,
                        calculated_token_embeddings_file_path.replace('@@', str(partition_counter)).replace('##', ''))
                    print("Partition {} is completed and processed {} - {} tokens".format(
                        partition_counter, partition, partition + args.token_partition_size))
        else:
            weighted_token_embeddings = document_embeddings
    """
    ******************************************************************************************************************
    END: IDF
    ******************************************************************************************************************
    """

    """
    ******************************************************************************************************************
    START: WEIGHTS ARE APPLIED TO TOKEN EMBEDDINGS
    ******************************************************************************************************************
    """
    del document_embeddings
    # LOAD PARTIAL FILES AFTER CLEANING THE DOCUMENT EMBEDDINGS.
    if args.is_parititioned is True:
        weighted_token_embeddings = None
        for partition in range(1, partition_counter + 1):
            temp_weighted_token_embeddings = UTIL.load_embeddings(
                calculated_token_embeddings_file_path.replace('@@', str(partition)).replace(
                    '##', 'idf' if args.is_inject_idf else ''))
            if weighted_token_embeddings is None:
                weighted_token_embeddings = temp_weighted_token_embeddings
            else:
                weighted_token_embeddings = np.vstack(
                    (weighted_token_embeddings, temp_weighted_token_embeddings))
            print("Partition {} is loaded".format(partition))

    WM = None  # np.array(args['weights_arguments']).reshape((1, len(args['weights_arguments']), 1))
    questions_embeddings, paragraphs_embeddings = UTIL.token_to_document_embeddings(
        new_question_tokens, new_paragraph_tokens, weighted_token_embeddings,
        document_embedding_guideline, WM)

    if args.is_inject_idf:
        questions_elmo_embeddings = np.reshape(
            questions_embeddings, (questions_embeddings.shape[0], questions_embeddings.shape[1]))
        UTIL.dump_embeddings(questions_elmo_embeddings, final_questions_file_path.replace('@@', 'with_idf'))
        paragraphs_elmo_embeddings = np.reshape(
            paragraphs_embeddings, (paragraphs_embeddings.shape[0], paragraphs_embeddings.shape[1]))
        UTIL.dump_embeddings(paragraphs_elmo_embeddings, final_paragraphs_file_path.replace('@@', 'with_idf'))
    else:
        questions_elmo_embeddings = np.reshape(
            questions_embeddings, (questions_embeddings.shape[0], questions_embeddings.shape[1]))
        UTIL.dump_embeddings(questions_elmo_embeddings, final_questions_file_path.replace('@@', ''))
        paragraphs_elmo_embeddings = np.reshape(
            paragraphs_embeddings, (paragraphs_embeddings.shape[0], paragraphs_embeddings.shape[1]))
        UTIL.dump_embeddings(paragraphs_elmo_embeddings, final_paragraphs_file_path.replace('@@', ''))
    print('Weights are applied')
    """
def dump_splitted_train_test(question_embeddings, paragraph_embeddings, labels, prefix, path, partition_size):
    UTIL.dump_embeddings(labels['q'], os.path.join(path, prefix + "_question_idx.hdf5"))
    UTIL.dump_embeddings(labels['p'], os.path.join(path, prefix + "_question_labels.hdf5"), dtype='int32')

    range_size = math.ceil(question_embeddings.shape[0] / partition_size)
    for part in range(0, range_size):
        pair_paragraph_embeddings = None
        start = part * partition_size
        end = start + partition_size
        for q_indx, q_embed in tqdm(enumerate(question_embeddings[start:end])):
            if pair_paragraph_embeddings is None:
                pair_paragraph_embeddings = paragraph_embeddings[labels['p'][start + q_indx]]
            else:
                pair_paragraph_embeddings = np.vstack(
                    (pair_paragraph_embeddings, paragraph_embeddings[labels['p'][start + q_indx]]))
        UTIL.dump_embeddings(
            pair_paragraph_embeddings,
            os.path.join(path, prefix + "_paired_paragraph_embeddings_part_{}.hdf5".format(part)))

    pair_paragraph_embeddings = None
    for part in range(0, range_size):
        embeddings = UTIL.load_embeddings(
            os.path.join(path, prefix + "_paired_paragraph_embeddings_part_{}.hdf5".format(part)))
        if pair_paragraph_embeddings is None:
            pair_paragraph_embeddings = embeddings
        else:
            pair_paragraph_embeddings = np.vstack((pair_paragraph_embeddings, embeddings))
    UTIL.dump_embeddings(
        pair_paragraph_embeddings,
        os.path.join(path, prefix + "_paired_paragraph_embeddings.hdf5"))

    for part in range(0, range_size):
        os.remove(os.path.join(path, prefix + "_paired_paragraph_embeddings_part_{}.hdf5".format(part)))

    UTIL.dump_embeddings(question_embeddings, os.path.join(path, prefix + '_question_embeddings.hdf5'))
    UTIL.dump_embeddings(paragraph_embeddings, os.path.join(path, prefix + "_all_paragraph_embeddings.hdf5"))
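# A minimal usage sketch (hypothetical arrays and paths, not part of this module):
# labels['q'] holds question indices and labels['p'] holds, per question, the index of
# its paired paragraph in paragraph_embeddings.
#
#   import numpy as np
#   question_embeddings = np.random.rand(1000, 1024).astype(np.float32)
#   paragraph_embeddings = np.random.rand(200, 1024).astype(np.float32)
#   labels = {'q': np.arange(1000), 'p': np.random.randint(0, 200, size=1000)}
#   dump_splitted_train_test(question_embeddings, paragraph_embeddings, labels,
#                            prefix='train', path='/tmp/splits', partition_size=250)
#
# This writes the question/paragraph HDF5 files above, building the paired paragraph
# matrix in partitions of 250 questions to bound peak memory.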
        token2idfweight, idf_vec = UTIL.transform_to_idf_weigths(
            tokenized_questions, tokenized_paragraphs, tokenize,
            questions_nontokenized, paragraphs_nontokenized)
        mean_glove_with_idf_embeddings = np.array([
            np.mean([glove_word_weights[w] * token2idfweight[w]
                     for w in words if w in glove_word_weights] or [np.zeros(dim)], axis=0)
            for words in tokenized_questions + tokenized_paragraphs
        ])
        # UTIL.dump_embeddings(mean_glove_with_idf_embeddings,
        #                      os.path.join(datadir, 'dev_mean_glove_with_idf_embeddings.hdf5'))
        question_embeddings = mean_glove_with_idf_embeddings[0:len(tokenized_questions), :]
        UTIL.dump_embeddings(
            question_embeddings,
            os.path.join(datadir, '{}_glove_questions_embeddings_with_idf.hdf5'.format(dataset_type)))
        paragraphs_embeddings = mean_glove_with_idf_embeddings[len(tokenized_questions):, :]
        UTIL.dump_embeddings(
            paragraphs_embeddings,
            os.path.join(datadir, '{}_glove_paragraphs_embeddings_with_idf.hdf5'.format(dataset_type)))
        end = datetime.datetime.now()
        print('IDF calculation is ended in {} minutes'.format((end - start).seconds / 60))
    else:
        print('IDF is skipped')
        mean_glove_embeddings = np.array([
            np.mean([glove_word_weights[w] for w in words if w in glove_word_weights] or [np.zeros(dim)], axis=0)
            for words in tokenized_questions + tokenized_paragraphs
        ])
        # UTIL.dump_embeddings(mean_glove_embeddings, os.path.join(datadir, 'dev_mean_glove_embeddings.hdf5'))
        question_embeddings = mean_glove_embeddings[0:len(tokenized_questions), :]
        UTIL.dump_embeddings(
            question_embeddings,
            os.path.join(datadir, '{}_glove_questions_embeddings.hdf5'.format(dataset_type)))
        paragraphs_embeddings = mean_glove_embeddings[len(tokenized_questions):, :]
        ) != question_embedding.shape[0]:
            print(30 * '*')
            print('********** Question {} has problem **********'.format(question_indx))
            print(30 * '*')
        print('Question {} is processed. It has {} tokens and embedding shape is {} so {}'.format(
            question_indx,
            len(tokenized_questions[question_indx]),
            question_embedding.shape[0],
            len(tokenized_questions[question_indx]) == question_embedding.shape[0]))
    print('Question_embeddings shape: {}'.format(question_embeddings.shape))
    UTIL.dump_embeddings(
        question_embeddings,
        os.path.join(root_folder_path, args['contextualized_questions_embeddings_with_token']))
    print('Questions are dumped')

    paragraphs_folder_path = root_folder_path if args["embedding_paragraphs_path"] is None \
        else os.path.join(root_folder_path, args["embedding_paragraphs_path"])
    paragraph_embeddings = None
    if os.path.exists(
            os.path.join(root_folder_path, args['contextualized_paragraphs_embeddings_with_token'])):
        print('contextualized_paragraphs_embeddings_with_token found')
        paragraph_embeddings = UTIL.load_embeddings(
            os.path.join(root_folder_path, args['contextualized_paragraphs_embeddings_with_token']))
    else: