def _load_data(self, data_type):
     source_labels = UTIL.load_embeddings(
         os.path.join(
             self.data_dir, self.params.files[data_type][
                 self.KN_FILE_NAMES['KN_SOURCE_LABELS']])).astype(int)
     source_indx = UTIL.load_embeddings(
         os.path.join(
             self.data_dir, self.params.files[data_type][
                 self.KN_FILE_NAMES['KN_SOURCE_IDX']])).astype(int)
     source_embeddings = UTIL.load_embeddings(
         os.path.join(
             self.data_dir, self.params.files[data_type][
                 self.KN_FILE_NAMES['KN_SOURCE_EMBEDDINGS']]))
     target_embeddings = UTIL.load_embeddings(
         os.path.join(
             self.data_dir, self.params.files[data_type][
                 self.KN_FILE_NAMES['KN_TARGET_EMBEDDINGS']]))
     source_padded = None
     source_length = None
     try:
         all_target_embeddings = UTIL.load_embeddings(
             os.path.join(
                 self.data_dir, self.params.files[data_type][
                     "all_" + self.KN_FILE_NAMES['KN_TARGET_EMBEDDINGS']]))
     except Exception:
         print('{} was not found for the {} data type'.format(
             "all_" + self.KN_FILE_NAMES['KN_TARGET_EMBEDDINGS'],
             data_type))
         all_target_embeddings = None
     return source_labels, source_indx, source_embeddings, target_embeddings, all_target_embeddings, source_padded, source_length
 def _load_test_recall_data(self, load_with_file_path=False):
     if load_with_file_path:
         self._test_recall_source_labels, self._test_recall_source_indx, self._test_recall_source_embeddings, self._test_recall_target_embeddings, self._test_recall_all_target_embeddings, self._test_recall_source_padded, self._test_recall_source_length = self._load_data_path(
             'test_subset_recall')
         self._temp_test_recall_source_labels = UTIL.load_embeddings(
             self._test_recall_source_labels)
     else:
         self._test_recall_source_labels, self._test_recall_source_indx, self._test_recall_source_embeddings, self._test_recall_target_embeddings, self._test_recall_all_target_embeddings, self._test_recall_source_padded, self._test_recall_source_length = self._load_data(
             'test_subset_recall')
         self._temp_test_recall_source_labels = self._test_recall_source_labels
     self._test_recall_baseline_source_embeddings = self._test_recall_source_embeddings
     if self.params.model['model_type'].lower() == 'conv':
         tokenized_documents = self._obtain_tokenized_documents(
             self._test_recall_source_indx)
         self._test_recall_source_embeddings, self._test_recall_source_embeddings_lengths = self._pad_documents(
             tokenized_documents)
         if load_with_file_path:
             UTIL.dump_embeddings(self._test_recall_source_embeddings,
                                  self._test_recall_source_padded)
             UTIL.dump_embeddings(
                 self._test_recall_source_embeddings_lengths,
                 self._test_recall_source_length,
                 dtype="int32")
             self._test_recall_source_embeddings, self._test_recall_source_embeddings_lengths = self._test_recall_source_padded, self._test_recall_source_length
     else:
         self._test_recall_source_embeddings_lengths = np.zeros(
             [self._temp_test_recall_source_labels.shape[0], 1])
         if load_with_file_path:
             UTIL.dump_embeddings(
                 self._test_recall_source_embeddings_lengths,
                 self._test_recall_source_length,
                 dtype="int32")
             self._test_recall_source_embeddings_lengths = self._test_recall_source_length
    def _load_predict_data(self, load_with_file_path=False):
        if load_with_file_path:
            self._source_embeddings = os.path.join(
                self.base_path,
                self.params.files['prediction']['source_embeddings'])
            self._source_padded = os.path.join(
                self.base_path,
                self.params.files['prediction']['source_padded'])
            self._source_length = os.path.join(
                self.base_path,
                self.params.files['prediction']['source_length'])
            self._baseline_source_embeddings = self._source_embeddings
            self._temp_source_embeddings = UTIL.load_embeddings(
                self._source_embeddings)
        else:
            self._source_embeddings = UTIL.load_embeddings(
                os.path.join(
                    self.base_path,
                    self.params.files['prediction']['source_embeddings']))
            self._source_padded = None
            self._source_length = None
            self._baseline_source_embeddings = self._source_embeddings
            self._temp_source_embeddings = self._source_embeddings

        if self.KN_FILE_NAMES['DIR'].lower().startswith('qu'):
            tokenized_documents = self._tokenized_questions
        else:
            tokenized_documents = self._tokenized_paragraphs
        if self.params.model['model_type'].lower() == 'conv':
            self._source_embeddings, self._source_embeddings_lengths = self._pad_documents(
                tokenized_documents)
            if load_with_file_path:
                UTIL.dump_embeddings(self._source_embeddings,
                                     self._source_padded)
                UTIL.dump_embeddings(self._source_embeddings_lengths,
                                     self._source_length,
                                     dtype="int32")
                self._source_embeddings, self._source_embeddings_lengths = self._source_padded, self._source_length
        else:
            self._source_embeddings_lengths = np.zeros(
                [self._temp_source_embeddings.shape[0], 1])
            if load_with_file_path:
                UTIL.dump_embeddings(self._source_embeddings_lengths,
                                     self._source_length,
                                     dtype="int32")
                self._source_embeddings_lengths = self._source_length
 def _obtain_tokenized_documents(self, source_indx):
     documents = []
     if self.load_with_file_path:
         source_indx = UTIL.load_embeddings(source_indx).astype(int)
     for indx in tqdm(source_indx):
         if self.KN_FILE_NAMES['DIR'].lower().startswith('qu'):
             document = self._questions_nontokenized[indx]
         else:
             document = self._paragraphs_nontokenized[indx]
         documents.append(document)
     tokenized_documents = [document.split(' ') for document in documents]
     return tokenized_documents
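These methods rely on UTIL.load_embeddings and UTIL.dump_embeddings, which are not shown on this page. Below is a minimal sketch of what such helpers could look like, assuming HDF5 files that store a single 'embeddings' dataset (the h5py reads further down this page use that key); the real UTIL module may differ.

import h5py
import numpy as np

def load_embeddings(path, key='embeddings'):
    # Read the whole dataset into memory as a NumPy array.
    with h5py.File(path, 'r') as fin:
        return np.asarray(fin[key])

def dump_embeddings(embeddings, path, key='embeddings', dtype='float32'):
    # Write a single-dataset HDF5 file, overwriting any existing file.
    with h5py.File(path, 'w') as fout:
        fout.create_dataset(key, data=np.asarray(embeddings, dtype=dtype))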
Example no. 5
def eval_metrics_fn(base_data_path, params, before_model_embeddings, after_model_embeddings, KN_FILE_NAMES):
    """
    Evaluate the model's recall on the test set.
    With ~5k questions and ~20k paragraphs (the number of questions and the number of
    paragraphs differ), this cannot be handled in the Estimator API, so the calculation
    below is done directly to speed it up.
    """

    recall_files = params.files[
        params.executor["recall_calculation_for"] + "_subset_recall"]

    all_targets = tf.constant(
        load_embeddings(
            os.path.join(
                base_data_path,
                recall_files["all_" + KN_FILE_NAMES["KN_TARGET_EMBEDDINGS"]])))
    normalized_all_targets = tf.nn.l2_normalize(
        all_targets, name='normalized_all_targets_embeddings', axis=1)

    subset_labels = tf.constant(
        load_embeddings(
            os.path.join(
                base_data_path,
                recall_files[KN_FILE_NAMES["KN_SOURCE_LABELS"]])))
    subset_labels = tf.reshape(subset_labels, [-1, 1])

    # AVG RECALLS FOR ALL RECALL_TOPS
    eval_metrics_after = evaluation_metrics(
        after_model_embeddings,
        normalized_all_targets,
        subset_labels,
        params,
        distance_type=params.executor["distance_type"])
    eval_metrics_before = None
    if params.executor["is_debug_mode"]:
        eval_metrics_before = evaluation_metrics(
            before_model_embeddings,
            normalized_all_targets,
            subset_labels,
            params,
            distance_type=params.executor["distance_type"])

    return eval_metrics_after, eval_metrics_before
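evaluation_metrics itself is not defined on this page. As a rough illustration of the recall@k idea it presumably builds on, here is a minimal NumPy sketch that ranks every target by cosine similarity to each source embedding and checks whether the true label lands in the top k; recall_at_k and its arguments are illustrative names, not this project's API.

import numpy as np

def recall_at_k(sources, targets, labels, k=10):
    # L2-normalize so that dot products equal cosine similarities.
    sources = sources / np.linalg.norm(sources, axis=1, keepdims=True)
    targets = targets / np.linalg.norm(targets, axis=1, keepdims=True)
    scores = sources @ targets.T                 # [num_sources, num_targets]
    top_k = np.argsort(-scores, axis=1)[:, :k]   # indices of the k most similar targets
    hits = (top_k == labels.reshape(-1, 1)).any(axis=1)
    return hits.mean()                           # fraction of sources whose true target is in the top k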
    def _load_train_data(self, load_with_file_path=False):
        if load_with_file_path:
            ## LOAD WITH FILE PATHS
            self._train_source_labels, self._train_source_indx, self._train_source_embeddings, self._train_target_embeddings, self._train_all_target_embeddings, self._train_source_padded, self._train_source_length = self._load_data_path(
                'train_loss')
            self._temp_train_source_labels = UTIL.load_embeddings(
                self._train_source_labels)
        else:
            ## LOAD WITH ACTUAL DATA
            self._train_source_labels, self._train_source_indx, self._train_source_embeddings, self._train_target_embeddings, self._train_all_target_embeddings, self._train_source_padded, self._train_source_length = self._load_data(
                'train_loss')
            self._temp_train_source_labels = self._train_source_labels
        self._train_baseline_source_embeddings = self._train_source_embeddings
        if self.params.model['model_type'].lower() == 'conv':
            tokenized_documents = self._obtain_tokenized_documents(
                self._train_source_indx)
            self._train_source_embeddings, self._train_source_embeddings_lengths = self._pad_documents(
                tokenized_documents)
            if load_with_file_path:
                ## SAVE self._train_source_embeddings, self._train_source_embeddings_lengths SO THAT IT CAN BE RELOADED FROM FILE
                UTIL.dump_embeddings(self._train_source_embeddings,
                                     self._train_source_padded)
                UTIL.dump_embeddings(self._train_source_embeddings_lengths,
                                     self._train_source_length,
                                     dtype="int32")
                self._train_source_embeddings, self._train_source_embeddings_lengths = self._train_source_padded, self._train_source_length

        else:
            self._train_source_embeddings_lengths = np.zeros(
                [self._temp_train_source_labels.shape[0], 1])
            if load_with_file_path:
                ## SAVE self._train_source_embeddings, self._train_source_embeddings_lengths SO THAT IT CAN BE RELOADED FROM FILE
                UTIL.dump_embeddings(self._train_source_embeddings_lengths,
                                     self._train_source_length,
                                     dtype="int32")
                self._train_source_embeddings_lengths = self._train_source_length
Example no. 7
def dump_splitted_train_test(question_embeddings, paragraph_embeddings, labels, prefix, path, partition_size):
    UTIL.dump_embeddings(labels['q'],
                         os.path.join(path, prefix + "_question_idx.hdf5"))
    UTIL.dump_embeddings(labels['p'],
                         os.path.join(path, prefix + "_question_labels.hdf5"),
                         dtype='int32')
    range_size = math.ceil(question_embeddings.shape[0]/partition_size)
    for part in range(0, range_size):
        pair_paragraph_embeddings = None
        start = part * partition_size
        end = start + partition_size
        for q_indx, q_embed in tqdm(enumerate(question_embeddings[start:end])):
            # q_indx restarts at 0 for every partition, so offset it by `start`
            # to index the paragraph paired with this question.
            paired_embedding = paragraph_embeddings[labels['p'][start + q_indx]]
            if pair_paragraph_embeddings is None:
                pair_paragraph_embeddings = paired_embedding
            else:
                pair_paragraph_embeddings = np.vstack(
                    (pair_paragraph_embeddings, paired_embedding))
        UTIL.dump_embeddings(pair_paragraph_embeddings, os.path.join(path, prefix + "_paired_paragraph_embeddings_part_{}.hdf5".format(part)))

    pair_paragraph_embeddings = None
    for part in range(0, range_size):
        embeddings = UTIL.load_embeddings(os.path.join(path, prefix + "_paired_paragraph_embeddings_part_{}.hdf5".format(part)))
        if pair_paragraph_embeddings is None:
            pair_paragraph_embeddings = embeddings
        else:
            pair_paragraph_embeddings = np.vstack(
                (pair_paragraph_embeddings,embeddings))

    UTIL.dump_embeddings(pair_paragraph_embeddings,
                         os.path.join(path, prefix + "_paired_paragraph_embeddings.hdf5"))

    for part in range(0, range_size):
        os.remove(os.path.join(path, prefix + "_paired_paragraph_embeddings_part_{}.hdf5".format(part)))

    UTIL.dump_embeddings(question_embeddings,
                         os.path.join(path, prefix + '_question_embeddings.hdf5'))
    UTIL.dump_embeddings(paragraph_embeddings, os.path.join(path, prefix + "_all_paragraph_embeddings.hdf5"))
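A hedged usage sketch with toy arrays, assuming labels is a dict whose 'q' entry holds question indices and whose 'p' entry holds the paragraph index paired with each question, and that the project's UTIL module is importable and the output directory exists; the sizes and path below are illustrative.

import numpy as np

# 10 questions, 4 paragraphs; each question is paired with one paragraph index.
question_embeddings = np.random.rand(10, 300).astype(np.float32)
paragraph_embeddings = np.random.rand(4, 300).astype(np.float32)
labels = {'q': np.arange(10), 'p': np.random.randint(0, 4, size=10)}

dump_splitted_train_test(question_embeddings, paragraph_embeddings, labels,
                         prefix='train', path='/tmp/out', partition_size=3)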
Example no. 8
    init_op = tf.global_variables_initializer()

    # Merge all summary information.
    summary_op = tf.summary.merge_all()
    summaries_dir = os.path.join(params.executor["model_dir"], "low_level_log",
                                 'non_est_' + params.model["active_model"])
    # start the session
    with tf.Session() as sess:
        sess.run(init_op)
        train_writer = tf.summary.FileWriter(summaries_dir + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(summaries_dir + '/test')

        #TRAINING DATA
        training_question_embeddings = load_embeddings(
            os.path.join(base_data_path,
                         params.files['train_loss']['question_embeddings']))
        training_paragraph_embeddings = load_embeddings(
            os.path.join(base_data_path,
                         params.files['train_loss']['paragraph_embeddings']))
        training_labels = load_embeddings(
            os.path.join(base_data_path,
                         params.files['train_loss']['question_labels']))
        training_labels = np.reshape(training_labels, [-1, 1])
        # TESTING DATA
        testing_question_embeddings = load_embeddings(
            os.path.join(
                base_data_path,
                params.files['test_subset_loss']['question_embeddings']))
        testing_paragraph_embeddings = load_embeddings(
            os.path.join(
                base_data_path,
                params.files['test_subset_loss']['paragraph_embeddings']))

    questions_nontokenized = [
        " ".join(context) for context in tokenized_questions
    ]
    print('# of Tokenized Questions in {} : {}'.format(
        dataset_type, len(tokenized_questions)))
    #
    # if is_dump_during_execution:
    #     UTIL.dump_tokenized_contexts(tokenized_paragraphs, paragraphs_file.format(dataset_type))
    #     UTIL.dump_tokenized_contexts(tokenized_questions, questions_file.format(dataset_type))
    #     UTIL.dump_mapping_data(q_to_ps, mapping_file.format(dataset_type))
    end = datetime.datetime.now()
    print('Parsing Ended in {} minutes'.format((end - start).seconds / 60))
    print(100 * '*')
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    END: PARSING FILE
    ******************************************************************************************************************
    ******************************************************************************************************************
    """

    tokenized_questions, tokenized_paragraphs = UTIL.fixing_the_token_problem(
        tokenized_questions, tokenized_paragraphs)

else:

    s_to_ts = UTIL.load_embeddings(labels_path).astype(int)

target_embeddings = UTIL.load_embeddings(target_embeddings_path)
source_embeddings = UTIL.load_embeddings(source_embedddings_path)
calculate_similarity_and_dump(target_embeddings, source_embeddings, s_to_ts,
                              len(source_embeddings), recalls_path)
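calculate_similarity_and_dump is defined elsewhere in the project. As an assumption-laden sketch only: it could compute, for every source embedding, the rank of its true target (given by the s_to_ts mapping) under cosine similarity and dump those ranks so that recall@k can be read off later. The parameter names, the loop, and the output format below are guesses, not the actual implementation; it also assumes the project's UTIL helpers are importable as in the snippets above.

import numpy as np

def calculate_similarity_and_dump(target_embeddings, source_embeddings,
                                  s_to_ts, n_sources, recalls_path):
    # Normalize so that dot products are cosine similarities.
    t = target_embeddings / np.linalg.norm(target_embeddings, axis=1, keepdims=True)
    s = source_embeddings / np.linalg.norm(source_embeddings, axis=1, keepdims=True)
    ranks = np.empty(n_sources, dtype=np.int64)
    for i in range(n_sources):
        scores = t @ s[i]                # similarity of source i to every target
        order = np.argsort(-scores)      # most similar target first
        ranks[i] = int(np.where(order == int(s_to_ts[i]))[0][0]) + 1  # 1-based rank of the true target
    UTIL.dump_embeddings(ranks, recalls_path)  # the project's HDF5 writer, as used above (signature assumed)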
def main(args):

    ################ CONFIGURATIONS #################
    squad_formatted_file = os.path.join(args.data_path,
                                        args.squad_formatted_file)
    bert_extension = ".json"
    file_name_splitter = '_'
    document_embeddings = None
    questions_folder_path = os.path.join(args.data_path, 'questions')
    paragraphs_folder_path = os.path.join(args.data_path, 'paragraphs')
    new_question_tokens_path = os.path.join(args.data_path,
                                            'questions_tokens@@.pkl')
    new_paragraph_tokens_path = os.path.join(args.data_path,
                                             'paragraphs_tokens@@.pkl')
    calculated_token_embeddings_file_path = os.path.join(
        args.data_path,
        'contextualized_document_embeddings_with_token_##_@@.hdf5')
    vocab_path = os.path.join(args.data_path, 'wordpiece_vocab.txt')
    ind_layer = None
    conc_layers = None
    test_size = None
    if args.test_size is not None:
        test_size = [int(x) for x in args.test_size.split(",")]
    if args.ind_layer is not None:
        ind_layer = int(args.ind_layer)
        contextualized_questions_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_questions_embeddings_with_tokens_{}_layers_@@.hdf5"
            .format(args.ind_layer))
        contextualized_paragraphs_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_paragraphs_embeddings_with_tokens_{}_layers.hdf5_@@"
            .format(args.ind_layer))
        contextualized_document_embeddings_with_token_path = os.path.join(
            args.data_path,
            "contextualized_document_embeddings_with_token_{}_layers.hdf5".
            format(args.ind_layer))
        final_questions_file_path = os.path.join(
            args.data_path,
            "question_document_embeddings_{}_layers_@@.hdf5".format(
                args.ind_layer))
        final_paragraphs_file_path = os.path.join(
            args.data_path,
            "paragraph_document_embeddings_{}_layers_@@.hdf5".format(
                args.ind_layer))
    else:
        conc_layers = [int(x) for x in args.conc_layers.split(",")]
        contextualized_questions_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_questions_embeddings_with_tokens_{}_layers_@@.hdf5"
            .format(conc_layers))
        contextualized_paragraphs_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_paragraphs_embeddings_with_tokens_{}_layers_@@.hdf5"
            .format(conc_layers))
        contextualized_document_embeddings_with_token_path = os.path.join(
            args.data_path,
            "contextualized_document_embeddings_with_token_{}_layers.hdf5".
            format(conc_layers))
        final_questions_file_path = os.path.join(
            args.data_path,
            "question_document_embeddings_{}_layers_@@.hdf5".format(
                conc_layers))
        final_paragraphs_file_path = os.path.join(
            args.data_path,
            "paragraph_document_embeddings_{}_layers_@@.hdf5".format(
                conc_layers))
    if args.ind_layer is None and args.conc_layers is None:
        raise Exception('A layer configuration must be provided (ind_layer or conc_layers)')
    if args.ind_layer is not None and args.conc_layers is not None:
        raise Exception('Only one layer configuration may be provided, not both')
    # ################ CONFIGURATIONS #################
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    START: PARSING FILE
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    tokenized_questions, tokenized_paragraphs, questions_nontokenized, paragraphs_nontokenized = UTIL.prepare_squad_objects(
        squad_formatted_file, args.squad_formatted_file)
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    END: PARSING FILE
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    START: LOAD EMBEDDINGS
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    new_question_tokens = []
    is_questions_already_processed = False
    if os.path.exists(
            contextualized_questions_with_token_file_path.replace('@@', '')):
        is_questions_already_processed = True
    else:
        file_names = get_file_names(questions_folder_path, file_name_splitter,
                                    bert_extension)
        tokenized_questions_size = test_size[
            0] if test_size is not None else len(tokenized_questions)
        checkpoint = None
        jsons = None
        if args.is_parititioned is True:
            partition_counter = 0
            for _p_counter in tqdm(
                    range(0, tokenized_questions_size,
                          args.document_partition_size)):
                print("Partition {} is running for writing questions".format(
                    partition_counter))
                if not os.path.exists(
                        contextualized_questions_with_token_file_path.replace(
                            '@@', str(partition_counter))):
                    # TOKEN DEBUGGING
                    #tokens_size_before_partition = sum([len(sentence) for sentence in new_question_tokens])
                    jsons, checkpoint, partition_shape = process_documents(
                        _p_counter, args.document_partition_size, checkpoint,
                        jsons, tokenized_questions_size, file_names,
                        questions_folder_path, ind_layer, conc_layers,
                        new_question_tokens,
                        contextualized_questions_with_token_file_path.replace(
                            '@@', str(partition_counter)),
                        new_question_tokens_path.replace(
                            '@@', str(partition_counter)))
                    # TOKEN DEBUGGING
                    # tokens_size_after_partition = sum([len(sentence) for sentence in new_question_tokens])
                    # if tokens_size_after_partition - tokens_size_before_partition != partition_shape[0]:
                    #     print("*" * 25)
                    #     print("Tokens problem in partition {}, before: {}, after: {}, partition_shape:{}".format(_p_counter, tokens_size_before_partition, tokens_size_after_partition, partition_shape[0]))
                    #     print("*" * 25)
                else:
                    new_question_tokens.extend(
                        UTIL.load_from_pickle(
                            new_question_tokens_path.replace(
                                '@@', str(partition_counter))))
                partition_counter += 1
            question_embeddings = None
            for _p_counter in tqdm(range(0, partition_counter)):
                print("Partition {} is running for reading questions".format(
                    partition_counter))
                temp_question_embeddings = UTIL.load_embeddings(
                    contextualized_questions_with_token_file_path.replace(
                        "@@", str(_p_counter)))
                if question_embeddings is None:
                    question_embeddings = temp_question_embeddings
                else:
                    question_embeddings = np.vstack(
                        (question_embeddings, temp_question_embeddings))
            print('MAIN embeddings shape: {}'.format(
                question_embeddings.shape))
            UTIL.dump_embeddings(
                question_embeddings,
                contextualized_questions_with_token_file_path.replace(
                    '@@', ''))
            print('MAIN embeddings are dumped')
        else:
            print("It is running for writing questions")
            jsons, checkpoint, partition_shape = process_documents(
                0, None, checkpoint, jsons, tokenized_questions_size,
                file_names, questions_folder_path, ind_layer, conc_layers,
                new_question_tokens,
                contextualized_questions_with_token_file_path.replace(
                    '@@', ''))
        UTIL.save_as_pickle(new_question_tokens,
                            new_question_tokens_path.replace('@@', ''))

    ## ***************************************************************************************************************
    ## ***************************************************************************************************************
    ## ***************************************************************************************************************
    new_paragraph_tokens = []
    is_paragraphs_already_processed = False
    if os.path.exists(
            contextualized_paragraphs_with_token_file_path.replace('@@', '')):
        is_paragraphs_already_processed = True
    else:
        file_names = get_file_names(paragraphs_folder_path, file_name_splitter,
                                    bert_extension)
        tokenized_paragraphs_size = test_size[
            1] if test_size is not None else len(tokenized_paragraphs)
        checkpoint = None
        jsons = None
        if args.is_parititioned is True:
            partition_counter = 0
            for _p_counter in tqdm(
                    range(0, tokenized_paragraphs_size,
                          args.document_partition_size)):
                print("Partition {} is running for writing paragraphs".format(
                    partition_counter))
                if not os.path.exists(
                        contextualized_paragraphs_with_token_file_path.replace(
                            '@@', str(partition_counter))):
                    #tokens_size_before_partition = sum([len(sentence) for sentence in new_paragraph_tokens])
                    jsons, checkpoint, partition_shape = process_documents(
                        _p_counter, args.document_partition_size, checkpoint,
                        jsons, tokenized_paragraphs_size, file_names,
                        paragraphs_folder_path, ind_layer, conc_layers,
                        new_paragraph_tokens,
                        contextualized_paragraphs_with_token_file_path.replace(
                            '@@', str(partition_counter)),
                        new_paragraph_tokens_path.replace(
                            '@@', str(partition_counter)))
                else:
                    new_paragraph_tokens.extend(
                        UTIL.load_from_pickle(
                            new_paragraph_tokens_path.replace(
                                '@@', str(partition_counter))))
                partition_counter += 1
                # TOKEN DEBUGGING
                # tokens_size_after_partition = sum([len(sentence) for sentence in new_paragraph_tokens])
                # if tokens_size_after_partition - tokens_size_before_partition != partition_shape[0]:
                #     print("*" * 25)
                #     print("Tokens problem in partition {}, before: {}, after: {}, partition_shape:{}".format(_p_counter, tokens_size_before_partition, tokens_size_after_partition, partition_shape[0]))
                #     print("*" * 25)
            paragraph_embeddings = None
            for _p_counter in tqdm(range(0, partition_counter)):
                print("Partition {} is running for reading paragraphs".format(
                    partition_counter))
                temp_paragraph_embeddings = UTIL.load_embeddings(
                    contextualized_paragraphs_with_token_file_path.replace(
                        "@@", str(_p_counter)))
                if paragraph_embeddings is None:
                    paragraph_embeddings = temp_paragraph_embeddings
                else:
                    paragraph_embeddings = np.vstack(
                        (paragraph_embeddings, temp_paragraph_embeddings))
            print('MAIN embeddings shape: {}'.format(
                paragraph_embeddings.shape))
            UTIL.dump_embeddings(
                paragraph_embeddings,
                contextualized_paragraphs_with_token_file_path.replace(
                    '@@', ''))
            print('MAIN embeddings are dumped')

        else:
            print("It is running for writing paragraphs")
            jsons, checkpoint, partition_shape = process_documents(
                0, None, checkpoint, jsons, tokenized_paragraphs_size,
                file_names, paragraphs_folder_path, ind_layer, conc_layers,
                new_paragraph_tokens,
                contextualized_paragraphs_with_token_file_path.replace(
                    '@@', ''))
        UTIL.save_as_pickle(new_paragraph_tokens,
                            new_paragraph_tokens_path.replace('@@', ''))

    if is_questions_already_processed:
        question_embeddings = UTIL.load_embeddings(
            contextualized_questions_with_token_file_path.replace('@@', ''))
        new_question_tokens = UTIL.load_from_pickle(
            new_question_tokens_path.replace('@@', ''))
    if is_paragraphs_already_processed:
        paragraph_embeddings = UTIL.load_embeddings(
            contextualized_paragraphs_with_token_file_path.replace('@@', ''))
        new_paragraph_tokens = UTIL.load_from_pickle(
            new_paragraph_tokens_path.replace('@@', ''))

    if os.path.exists(contextualized_document_embeddings_with_token_path):
        if args.is_parititioned is not True:
            document_embeddings = UTIL.load_embeddings(
                contextualized_document_embeddings_with_token_path)
    else:
        document_embeddings = np.vstack(
            (question_embeddings, paragraph_embeddings))
        UTIL.dump_embeddings(
            document_embeddings,
            contextualized_document_embeddings_with_token_path)
    del question_embeddings
    del paragraph_embeddings
    print('All Documents are dumped')
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    END: LOAD EMBEDDINGS
    ******************************************************************************************************************
    ******************************************************************************************************************
    """

    document_embedding_guideline, corpus_as_tokens = UTIL.generate_document_embedding_guideline(
        new_question_tokens, new_paragraph_tokens)

    paragraphs_nontokenized = [
        " ".join(context) for context in new_paragraph_tokens
    ]
    questions_nontokenized = [
        " ".join(context) for context in new_question_tokens
    ]
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    START: IDF
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    if args.is_inject_idf:
        print('IDF is going to be calculated')
        # vocab = []
        # for sentence in new_question_tokens + new_paragraph_tokens:
        #     for word in sentence:
        #         vocab.append(word)
        # vocab = set(vocab)
        # UTIL.dump_vocab(vocab_path, vocab)
        #tokenize = wordpiece.FullTokenizer(vocab_file=vocab_path, do_lower_case=False)
        nlp = spacy.blank("en")
        tokenize = lambda doc: [token.text for token in nlp(doc)]
        start = datetime.datetime.now()
        token2idfweight, idf_vec = UTIL.transform_to_idf_weigths(
            new_question_tokens, new_paragraph_tokens, tokenize,
            questions_nontokenized, paragraphs_nontokenized)
        if args.is_parititioned is True:
            with h5py.File(contextualized_document_embeddings_with_token_path,
                           'r') as fin:
                partition_counter = 0
                for partition in range(0, idf_vec.shape[0],
                                       args.token_partition_size):
                    partition_counter += 1
                    temp_doc_embeddings = fin['embeddings'][
                        partition:partition + args.token_partition_size, :]
                    temp_idf_vec = idf_vec[
                        partition:partition +
                        args.token_partition_size, :].reshape(-1, 1)
                    #temp_doc_embeddings = temp_doc_embeddings[:,0,:]
                    #temp_doc_embeddings = preprocessing.normalize(temp_doc_embeddings, norm='l2')
                    temp_weighted_token_embeddings = np.multiply(
                        temp_idf_vec, temp_doc_embeddings)
                    UTIL.dump_embeddings(
                        temp_weighted_token_embeddings,
                        calculated_token_embeddings_file_path.replace(
                            '@@', str(partition_counter)).replace('##', 'idf'))
                    print(
                        "Partition {} is completed and processed {} - {} tokens"
                        .format(partition_counter, partition,
                                partition + args.token_partition_size))
        else:
            idf_vec = idf_vec.reshape(-1, 1)
            weighted_token_embeddings = np.multiply(idf_vec,
                                                    document_embeddings)
        del idf_vec
        del token2idfweight
        end = datetime.datetime.now()
        print('IDF calculation is ended in {} minutes'.format(
            (end - start).seconds / 60))
    else:
        print('IDF is skipped')
        _type = 'only'
        if args.is_parititioned is True:
            with h5py.File(contextualized_document_embeddings_with_token_path,
                           'r') as fin:
                partition_counter = 0
                for partition in range(0, len(corpus_as_tokens),
                                       args.token_partition_size):
                    partition_counter += 1
                    temp_doc_embeddings = fin['embeddings'][
                        partition:partition + args.token_partition_size, :]
                    #temp_doc_embeddings = temp_doc_embeddings[:, 0, :]
                    #temp_doc_embeddings = preprocessing.normalize(temp_doc_embeddings, norm='l2')
                    UTIL.dump_embeddings(
                        temp_doc_embeddings,
                        calculated_token_embeddings_file_path.replace(
                            '@@', str(partition_counter)).replace('##', ''))
                    print(
                        "Partition {} is completed and processed {} - {} tokens"
                        .format(partition_counter, partition,
                                partition + args.token_partition_size))
        else:
            weighted_token_embeddings = document_embeddings
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    END: IDF
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    START: WEIGHTS ARE APPLIED TO TOKEN EMBEDDINGS
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    del document_embeddings

    #LOAD PARTIAL FILES AFTER CLEANING THE DOCUMENT EMBEDDINGS.
    if args.is_parititioned is True:
        weighted_token_embeddings = None
        for partition in range(1, partition_counter + 1):
            temp_weighted_token_embeddings = UTIL.load_embeddings(
                calculated_token_embeddings_file_path.replace(
                    '@@', str(partition)).replace(
                        '##', 'idf' if args.is_inject_idf else ''))
            if weighted_token_embeddings is None:
                weighted_token_embeddings = temp_weighted_token_embeddings
            else:
                weighted_token_embeddings = np.vstack(
                    (weighted_token_embeddings,
                     temp_weighted_token_embeddings))
            print("Partition {} is loaded".format(partition))

    WM = None  #np.array(args['weights_arguments']).reshape((1, len(args['weights_arguments']), 1))
    questions_embeddings, paragraphs_embeddings = UTIL.token_to_document_embeddings(
        new_question_tokens, new_paragraph_tokens, weighted_token_embeddings,
        document_embedding_guideline, WM)

    if args.is_inject_idf:
        questions_elmo_embeddings = np.reshape(
            questions_embeddings,
            (questions_embeddings.shape[0], questions_embeddings.shape[1]))
        UTIL.dump_embeddings(
            questions_elmo_embeddings,
            final_questions_file_path.replace('@@', 'with_idf'))
        paragraphs_elmo_embeddings = np.reshape(
            paragraphs_embeddings,
            (paragraphs_embeddings.shape[0], paragraphs_embeddings.shape[1]))
        UTIL.dump_embeddings(
            paragraphs_elmo_embeddings,
            final_paragraphs_file_path.replace('@@', 'with_idf'))
    else:
        questions_elmo_embeddings = np.reshape(
            questions_embeddings,
            (questions_embeddings.shape[0], questions_embeddings.shape[1]))
        UTIL.dump_embeddings(questions_elmo_embeddings,
                             final_questions_file_path.replace('@@', ''))
        paragraphs_elmo_embeddings = np.reshape(
            paragraphs_embeddings,
            (paragraphs_embeddings.shape[0], paragraphs_embeddings.shape[1]))
        UTIL.dump_embeddings(paragraphs_elmo_embeddings,
                             final_paragraphs_file_path.replace('@@', ''))

    print('Weights are applied')
    """
Example no. 11
def load_data(embedding_path, label_path, prefix):
    question_embeddings = UTIL.load_embeddings(os.path.join(embedding_path, prefix + '_question_embeddings.hdf5'))
    paragraph_embeddings = UTIL.load_embeddings(os.path.join(embedding_path, prefix + '_paragraph_embeddings.hdf5'))
    labels = pd.read_csv(os.path.join(label_path, prefix + '_question_labels.csv'))
    return question_embeddings, paragraph_embeddings, labels
"""
******************************************************************************************************************
******************************************************************************************************************
START: LOAD EMBEDDINGS
******************************************************************************************************************
******************************************************************************************************************
"""
root_folder_path = os.path.join(datadir, args["root_path"])
document_embeddings = None
questions_folder_path = root_folder_path if args[
    "embedding_questions_path"] is None else os.path.join(
        root_folder_path, args["embedding_questions_path"])
question_embeddings = None
if os.path.exists(
        os.path.join(root_folder_path,
                     args['contextualized_questions_embeddings_with_token'])):
    question_embeddings = UTIL.load_embeddings(
        os.path.join(root_folder_path,
                     args['contextualized_questions_embeddings_with_token']))
else:
    for question_indx in range(len(tokenized_questions)):
        q_file_path = os.path.join(
            questions_folder_path,
            args['embedding_questions_file_pattern'].replace(
                '@@', str(question_indx)))
        question_embedding = UTIL.load_embeddings(q_file_path)
        if args['change_shape']:
            question_embedding = np.expand_dims(question_embedding, axis=1)
        if question_embeddings is None:
            question_embeddings = question_embedding
        else:
            question_embeddings = np.vstack(
                (question_embeddings, question_embedding))