    def _load_test_recall_data(self, load_with_file_path=False):
        """Load the test-subset recall data, either as in-memory arrays or as
        file paths (load_with_file_path=True) materialized on disk."""
        if load_with_file_path:
            self._test_recall_source_labels, self._test_recall_source_indx, self._test_recall_source_embeddings, self._test_recall_target_embeddings, self._test_recall_all_target_embeddings, self._test_recall_source_padded, self._test_recall_source_length = self._load_data_path(
                'test_subset_recall')
            self._temp_test_recall_source_labels = UTIL.load_embeddings(
                self._test_recall_source_labels)
        else:
            self._test_recall_source_labels, self._test_recall_source_indx, self._test_recall_source_embeddings, self._test_recall_target_embeddings, self._test_recall_all_target_embeddings, self._test_recall_source_padded, self._test_recall_source_length = self._load_data(
                'test_subset_recall')
            self._temp_test_recall_source_labels = self._test_recall_source_labels
        self._test_recall_baseline_source_embeddings = self._test_recall_source_embeddings
        if self.params.model['model_type'].lower() == 'conv':
            tokenized_documents = self._obtain_tokenized_documents(
                self._test_recall_source_indx)
            self._test_recall_source_embeddings, self._test_recall_source_embeddings_lengths = self._pad_documents(
                tokenized_documents)
            if load_with_file_path:
                # Save the padded embeddings and lengths so they can be
                # reloaded from file.
                UTIL.dump_embeddings(self._test_recall_source_embeddings,
                                     self._test_recall_source_padded)
                UTIL.dump_embeddings(
                    self._test_recall_source_embeddings_lengths,
                    self._test_recall_source_length,
                    dtype="int32")
                self._test_recall_source_embeddings, self._test_recall_source_embeddings_lengths = self._test_recall_source_padded, self._test_recall_source_length
        else:
            self._test_recall_source_embeddings_lengths = np.zeros(
                [self._temp_test_recall_source_labels.shape[0], 1])
            if load_with_file_path:
                UTIL.dump_embeddings(
                    self._test_recall_source_embeddings_lengths,
                    self._test_recall_source_length,
                    dtype="int32")
                self._test_recall_source_embeddings_lengths = self._test_recall_source_length
    def _load_predict_data(self, load_with_file_path=False):
        """Load prediction-time source embeddings, either in memory or as file paths."""
        if load_with_file_path:
            self._source_embeddings = os.path.join(
                self.base_path,
                self.params.files['prediction']['source_embeddings'])
            self._source_padded = os.path.join(
                self.base_path,
                self.params.files['prediction']['source_padded'])
            self._source_length = os.path.join(
                self.base_path,
                self.params.files['prediction']['source_length'])
            self._baseline_source_embeddings = self._source_embeddings
            self._temp_source_embeddings = UTIL.load_embeddings(
                self._source_embeddings)
        else:
            self._source_embeddings = UTIL.load_embeddings(
                os.path.join(
                    self.base_path,
                    self.params.files['prediction']['source_embeddings']))
            self._source_padded = None
            self._source_length = None
            self._baseline_source_embeddings = self._source_embeddings
            self._temp_source_embeddings = self._source_embeddings

        if self.KN_FILE_NAMES['DIR'].lower().startswith('qu'):
            tokenized_documents = self._tokenized_questions
        else:
            tokenized_documents = self._tokenized_paragraphs
        if self.params.model['model_type'].lower() == 'conv':
            self._source_embeddings, self._source_embeddings_lengths = self._pad_documents(
                tokenized_documents)
            if load_with_file_path:
                UTIL.dump_embeddings(self._source_embeddings,
                                     self._source_padded)
                UTIL.dump_embeddings(self._source_embeddings_lengths,
                                     self._source_length,
                                     dtype="int32")
                self._source_embeddings, self._source_embeddings_lengths = self._source_padded, self._source_length
        else:
            self._source_embeddings_lengths = np.zeros(
                [self._temp_source_embeddings.shape[0], 1])
            if load_with_file_path:
                UTIL.dump_embeddings(self._source_embeddings_lengths,
                                     self._source_length,
                                     dtype="int32")
                self._source_embeddings_lengths = self._source_length
    def _load_train_data(self, load_with_file_path=False):
        """Load the training data, either in memory or as file paths."""
        if load_with_file_path:
            ## LOAD WITH FILE PATHS
            self._train_source_labels, self._train_source_indx, self._train_source_embeddings, self._train_target_embeddings, self._train_all_target_embeddings, self._train_source_padded, self._train_source_length = self._load_data_path(
                'train_loss')
            self._temp_train_source_labels = UTIL.load_embeddings(
                self._train_source_labels)
        else:
            ## LOAD WITH ACTUAL DATA
            self._train_source_labels, self._train_source_indx, self._train_source_embeddings, self._train_target_embeddings, self._train_all_target_embeddings, self._train_source_padded, self._train_source_length = self._load_data(
                'train_loss')
            self._temp_train_source_labels = self._train_source_labels
        self._train_baseline_source_embeddings = self._train_source_embeddings
        if self.params.model['model_type'].lower() == 'conv':
            tokenized_documents = self._obtain_tokenized_documents(
                self._train_source_indx)
            self._train_source_embeddings, self._train_source_embeddings_lengths = self._pad_documents(
                tokenized_documents)
            if load_with_file_path:
                ## SAVE self._train_source_embeddings, self._train_source_embeddings_lengths SO THAT IT CAN BE RELOADED FROM FILE
                UTIL.dump_embeddings(self._train_source_embeddings,
                                     self._train_source_padded)
                UTIL.dump_embeddings(self._train_source_embeddings_lengths,
                                     self._train_source_length,
                                     dtype="int32")
                self._train_source_embeddings, self._train_source_embeddings_lengths = self._train_source_padded, self._train_source_length

        else:
            self._train_source_embeddings_lengths = np.zeros(
                [self._temp_train_source_labels.shape[0], 1])
            if load_with_file_path:
                ## SAVE self._train_source_embeddings, self._train_source_embeddings_lengths SO THAT IT CAN BE RELOADED FROM FILE
                UTIL.dump_embeddings(self._train_source_embeddings_lengths,
                                     self._train_source_length,
                                     dtype="int32")
                self._train_source_embeddings_lengths = self._train_source_length
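
    # A minimal sketch (an assumption, not the source) of what
    # self._pad_documents is expected to do: pad each tokenized document to a
    # fixed length and return (padded, lengths) with int32 lengths, mirroring
    # the dummy lengths array built in the non-conv branch:
    #
    #     def _pad_documents(self, tokenized_docs, max_len=40, pad_id=0):
    #         lengths = np.array([[min(len(d), max_len)]
    #                             for d in tokenized_docs], dtype="int32")
    #         padded = np.full((len(tokenized_docs), max_len), pad_id,
    #                          dtype="int32")
    #         for i, doc in enumerate(tokenized_docs):
    #             padded[i, :lengths[i, 0]] = doc[:max_len]
    #         return padded, lengths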
                           signature="default",
                           as_dict=True)['elmo'])
            # for i, each_document in enumerate(tqdm(tokenized[begin_index:end_index],
            #                                        total=len(tokenized[begin_index:end_index])), begin_index):

            for embed_index, each_document in enumerate(
                    documents[begin_index:end_index]):
                doc_index = begin_index + embed_index

                try:
                    _begining = 0
                    _ending = len(tokenized_documents[doc_index])
                    _d1 = d1[embed_index, _begining:_ending, :]
                    _d1 = np.expand_dims(_d1, axis=1)
                    UTIL.dump_embeddings(
                        _d1,
                        embedding_file.replace('@', 'LSTM1_' + str(doc_index)))
                    _d2 = d2[embed_index, _begining:_ending, :]
                    _d2 = np.expand_dims(_d2, axis=1)
                    UTIL.dump_embeddings(
                        _d2,
                        embedding_file.replace('@', 'LSTM2_' + str(doc_index)))
                    _delmo = delmo[embed_index, _begining:_ending, :]
                    _delmo = np.expand_dims(_delmo, axis=1)
                    UTIL.dump_embeddings(
                        _delmo,
                        embedding_file.replace('@', 'ELMO_' + str(doc_index)))
                except Exception as ex:
                    print(ex)
                    print('End of documents')
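
# For context, the snippet above begins mid-call. A minimal, self-contained
# sketch (an assumption, not the original code) of the kind of TF-Hub ELMo
# call that yields d1/d2/delmo (the two LSTM layer outputs and the mixed
# ELMo vectors) under TF1:
import tensorflow as tf
import tensorflow_hub as hub

documents = ["the first document", "the second document"]  # placeholder input
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)
outputs = elmo(tf.constant(documents), signature="default", as_dict=True)
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    d1, d2, delmo = session.run([outputs["lstm_outputs1"],
                                 outputs["lstm_outputs2"],
                                 outputs["elmo"]])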
Example 5
def execute_non_conv_pipeline(params, base_data_path, config, tf, databuilder):
    estimator = tf.estimator.Estimator(model_fn, params=params, config=config)
    if not params.executor["is_prediction"]:
        _train_input_fn = lambda: databuilder.train_input_fn()
        if params.executor["recall_calculation_for"] == 'test':
            _recall_input_fn = lambda: databuilder.test_recall_input_fn()
        else:
            _recall_input_fn = lambda: databuilder.train_recall_input_fn()


        if not params.executor["is_debug_mode"]:
            # -------------------------------------
            # Train the model
            # -------------------------------------
            tf.logging.info("Starting training for {} epoch(s).".format(params.model["num_epochs"]))
            tf.logging.info(
                "Train loss on train set with a size of {} ".format(params.files["splitter"]["train_size"]))
                # -------------------------------------
                # First Train
                # -------------------------------------
            #estimator.train(_train_input_fn, max_steps=1)

                # -------------------------------------
                # Train and Test : Train
                # -------------------------------------
            # train_spec = tf.estimator.TrainSpec(_train_input_fn,
            #                                     max_steps= params.model['num_epochs'] * math.ceil(params.files['splitter']['train_size'] / params.model['batch_size']))

            train_spec = tf.estimator.TrainSpec(_train_input_fn)


            # -------------------------------------
            # Test the model
            # -------------------------------------
            tf.logging.info("Evaluation loss and recall loss on test set.")
            tf.logging.info(
                "Evaluation loss on test set with a size of {} ".format(params.files["splitter"]["test_size"]))
            tf.logging.info(
                "Evaluation recall loss on test set with a size of {} ".format(params.files["splitter"]["test_subset_size"]))

            # -------------------------------------
            # Baseline Eval for Initial model
            # -------------------------------------
            # Then Test
            # estimator.evaluate(_recall_input_fn)

            # -------------------------------------
            # Train and Test : Test
            # -------------------------------------
            test_spec = tf.estimator.EvalSpec(_recall_input_fn)  # steps=params.model['num_epochs']

            # -------------------------------------
            # Train and Test
            # -------------------------------------
            tf.estimator.train_and_evaluate(estimator, train_spec, test_spec)


            if params.executor["is_prediction_during_training"]:

                predictions = estimator.predict(lambda: databuilder.predict_input_fn())
                predictions = np.array(list(predictions))
                dump_embeddings(predictions, os.path.join(base_data_path, "improved_" + params.files["prediction"][
                    "source_embeddings"]))
        else:
            tf.logging.info(10 * '*')
            tf.logging.info("Starting debugging for {} epoch(s).".format(params.model["num_epochs"]))
            data_dict = {}
            data_dict['epochs'] = params.model["num_epochs"]
            for ep in range(1, params.model["num_epochs"] + 1):
                tf.logging.info("-------> Epoch: {}".format(ep))
                # if ep < 1:
                #     estimator.train(_train_input_fn, max_steps=1)
                # else:
                estimator.train(_train_input_fn)
                tf.logging.info("-------> Epoch: {} Train is completed".format(ep))
                result_as_dict_list = estimator.predict(_recall_input_fn)
                tf.logging.info("-------> Epoch: {} Predict is completed".format(ep))
                data_dict = prepare_dict_to_print(result_as_dict_list, data_dict, ep)
                tf.logging.info("-------> Epoch: {} data_dict is completed".format(ep))

            # NOTE: model_save_path is expected to be defined at module scope.
            save_as_pickle(data_dict, os.path.join(model_save_path, 'debug_dict.pkl'))
            save_as_shelve(data_dict, os.path.join(model_save_path, 'debug_dict.slv'))
            tf.logging.info('Dict objs are saved.')
            tf.logging.info(10 * '*')


    # -------------------------------------
    # Prediction
    # -------------------------------------
    else:
        # All parameters must be set to match the saved model.
        # TODO: persist the parameter configuration together with the model.
        predictions = estimator.predict(lambda: databuilder.predict_input_fn())
        predictions = np.array(list(predictions))
        dump_embeddings(predictions,
                        os.path.join(base_data_path, "improved_" + params.files["prediction"]["source_embeddings"]))
def process_documents(partition, document_partition_size, checkpoint, jsons,
                      tokenized_document_size, file_names, file_folder_path,
                      ind_layer, conc_layers, all_tokens,
                      contextualized_questions_with_token_file_path,
                      dictionary_path):
    """Collect per-token BERT embeddings for one partition of documents,
    dump them to disk, and append the kept tokens to all_tokens."""
    embeddings = None
    start = partition
    end = (
        partition + document_partition_size
    ) if document_partition_size is not None else tokenized_document_size
    local_tokens = []
    for indx in tqdm(range(start, end)):
        if tokenized_document_size > indx:
            bert_index = indx + 1
            file_name, remaining_index_to_pass_this_file = find_file_name(
                bert_index, file_names)
            if remaining_index_to_pass_this_file >= 0:
                # A new shard file starts here; reload its json lines and
                # remember where it began so indx can be made file-local.
                jsons = UTIL.load_bert_jsons_from_single_file(
                    os.path.join(file_folder_path, file_name))
                if indx > 0:
                    checkpoint = indx
            if checkpoint is not None:
                indx = indx - checkpoint
            new_token = []
            token_embeddings = None
            for line_index, json_line in UTIL.reversedEnumerate(jsons[indx]):
                # Token indexes 0 and -1 are [CLS] and [SEP]; ignore them.
                json_line['features'].pop(0)
                json_line['features'].pop(-1)

                # Filter out word-piece continuation tokens ("##...") that do
                # not contribute a word of their own.
                features = [
                    x for x in json_line['features']
                    if not x['token'].startswith("##")
                ]
                for feature_index, feature in UTIL.reversedEnumerate(features):
                    # NOTE: args is read from module scope here.
                    if line_index > 0 and feature_index < args.window_length:
                        # Skip tokens inside the overlap window of a
                        # continuation line.
                        continue

                    if args.ind_layer is not None:
                        token_embedding = np.array([
                            l['values'] for l in feature['layers']
                            if l['index'] == ind_layer
                        ])
                    else:
                        token_embedding = np.concatenate([
                            l['values'] for l in feature['layers']
                            if l['index'] in conc_layers
                        ])

                    if token_embeddings is None:
                        token_embeddings = token_embedding
                    else:
                        token_embeddings = np.vstack(
                            (token_embeddings, token_embedding))

                    new_token.append(feature['token'])
            if len(new_token) != token_embeddings.shape[0]:
                print(30 * '*')
                print(
                    '********** Token/embedding size mismatch for document {} (checkpoint {}) **********'
                    .format(indx, checkpoint))
                print(30 * '*')
            all_tokens.append(new_token)
            local_tokens.append(new_token)
            # TOKEN DEBUGGING
            # print("*" * 25)
            # print("Sub Token size in dictionary for the document {} in the partition: {}".format(indx, len(new_token)))
            # print("Total Token size in dictionary after the document {} in the partition: {}".format(indx, sum([len(sentence) for sentence in all_tokens])))
            # print("*" * 25)
            if embeddings is None:
                embeddings = token_embeddings
            else:
                embeddings = np.vstack((embeddings, token_embeddings))

    print('embeddings shape: {}'.format(embeddings.shape))
    UTIL.dump_embeddings(embeddings,
                         contextualized_questions_with_token_file_path)
    UTIL.save_as_pickle(local_tokens, dictionary_path)
    print('embeddings are dumped')
    return jsons, checkpoint, embeddings.shape  ## TOKEN DEBUGGING
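
# For reference, process_documents consumes JSON lines shaped like the output
# of BERT's extract_features.py; a toy illustration (vector values are made
# up, real ones have the model's hidden size):
example_json_line = {
    "linex_index": 0,
    "features": [
        {"token": "[CLS]", "layers": [{"index": -1, "values": [0.1, 0.2]}]},
        {"token": "who", "layers": [{"index": -1, "values": [0.3, 0.4]},
                                    {"index": -2, "values": [0.5, 0.6]}]},
        {"token": "##ever", "layers": [{"index": -1, "values": [0.7, 0.8]}]},
        {"token": "[SEP]", "layers": [{"index": -1, "values": [0.9, 1.0]}]},
    ],
}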
def main(args):

    ################ CONFIGURATIONS #################
    squad_formatted_file = os.path.join(args.data_path,
                                        args.squad_formatted_file)
    bert_extension = ".json"
    file_name_splitter = '_'
    document_embeddings = None
    questions_folder_path = os.path.join(args.data_path, 'questions')
    paragraphs_folder_path = os.path.join(args.data_path, 'paragraphs')
    new_question_tokens_path = os.path.join(args.data_path,
                                            'questions_tokens@@.pkl')
    new_paragraph_tokens_path = os.path.join(args.data_path,
                                             'paragraphs_tokens@@.pkl')
    calculated_token_embeddings_file_path = os.path.join(
        args.data_path,
        'contextualized_document_embeddings_with_token_##_@@.hdf5')
    vocab_path = os.path.join(args.data_path, 'wordpiece_vocab.txt')
    ind_layer = None
    conc_layers = None
    test_size = None
    if args.test_size is not None:
        test_size = [int(x) for x in args.test_size.split(",")]
    # Validate the layer configuration before it is used below.
    if args.ind_layer is None and args.conc_layers is None:
        raise Exception('Either ind_layer or conc_layers must be specified.')
    if args.ind_layer is not None and args.conc_layers is not None:
        raise Exception('Only one layer configuration (ind_layer or conc_layers) may be specified.')
    if args.ind_layer is not None:
        ind_layer = int(args.ind_layer)
        contextualized_questions_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_questions_embeddings_with_tokens_{}_layers_@@.hdf5"
            .format(args.ind_layer))
        contextualized_paragraphs_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_paragraphs_embeddings_with_tokens_{}_layers_@@.hdf5"
            .format(args.ind_layer))
        contextualized_document_embeddings_with_token_path = os.path.join(
            args.data_path,
            "contextualized_document_embeddings_with_token_{}_layers.hdf5".
            format(args.ind_layer))
        final_questions_file_path = os.path.join(
            args.data_path,
            "question_document_embeddings_{}_layers_@@.hdf5".format(
                args.ind_layer))
        final_paragraphs_file_path = os.path.join(
            args.data_path,
            "paragraph_document_embeddings_{}_layers_@@.hdf5".format(
                args.ind_layer))
    else:
        conc_layers = [int(x) for x in args.conc_layers.split(",")]
        contextualized_questions_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_questions_embeddings_with_tokens_{}_layers_@@.hdf5"
            .format(conc_layers))
        contextualized_paragraphs_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_paragraphs_embeddings_with_tokens_{}_layers_@@.hdf5"
            .format(conc_layers))
        contextualized_document_embeddings_with_token_path = os.path.join(
            args.data_path,
            "contextualized_document_embeddings_with_token_{}_layers.hdf5".
            format(conc_layers))
        final_questions_file_path = os.path.join(
            args.data_path,
            "question_document_embeddings_{}_layers_@@.hdf5".format(
                conc_layers))
        final_paragraphs_file_path = os.path.join(
            args.data_path,
            "paragraph_document_embeddings_{}_layers_@@.hdf5".format(
                conc_layers))
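    # Illustration (toy values, not from the source) of the two selection
    # modes: ind_layer keeps a single layer's vector, conc_layers
    # concatenates several:
    #   layers = [{"index": -1, "values": [1.0, 2.0]},
    #             {"index": -2, "values": [3.0, 4.0]}]
    #   ind_vector = np.array([l["values"] for l in layers if l["index"] == -1])
    #   conc_vector = np.concatenate(
    #       [l["values"] for l in layers if l["index"] in (-1, -2)])
    #   # ind_vector.shape == (1, 2); conc_vector.shape == (4,)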
    # ################ CONFIGURATIONS #################
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    START: PARSING FILE
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    tokenized_questions, tokenized_paragraphs, questions_nontokenized, paragraphs_nontokenized = UTIL.prepare_squad_objects(
        squad_formatted_file, args.squad_formatted_file)
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    END: PARSING FILE
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    START: LOAD EMBEDDINGS
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    new_question_tokens = []
    is_questions_already_processed = False
    if os.path.exists(
            contextualized_questions_with_token_file_path.replace('@@', '')):
        is_questions_already_processed = True
    else:
        file_names = get_file_names(questions_folder_path, file_name_splitter,
                                    bert_extension)
        tokenized_questions_size = test_size[
            0] if test_size is not None else len(tokenized_questions)
        checkpoint = None
        jsons = None
        if args.is_parititioned is True:
            partition_counter = 0
            for _p_counter in tqdm(
                    range(0, tokenized_questions_size,
                          args.document_partition_size)):
                print("Partition {} is running for writing questions".format(
                    partition_counter))
                if not os.path.exists(
                        contextualized_questions_with_token_file_path.replace(
                            '@@', str(partition_counter))):
                    # TOKEN DEBUGGING
                    #tokens_size_before_partition = sum([len(sentence) for sentence in new_question_tokens])
                    jsons, checkpoint, partition_shape = process_documents(
                        _p_counter, args.document_partition_size, checkpoint,
                        jsons, tokenized_questions_size, file_names,
                        questions_folder_path, ind_layer, conc_layers,
                        new_question_tokens,
                        contextualized_questions_with_token_file_path.replace(
                            '@@', str(partition_counter)),
                        new_question_tokens_path.replace(
                            '@@', str(partition_counter)))
                    # TOKEN DEBUGGING
                    # tokens_size_after_partition = sum([len(sentence) for sentence in new_question_tokens])
                    # if tokens_size_after_partition - tokens_size_before_partition != partition_shape[0]:
                    #     print("*" * 25)
                    #     print("Tokens problem in partition {}, before: {}, after: {}, partition_shape:{}".format(_p_counter, tokens_size_before_partition, tokens_size_after_partition, partition_shape[0]))
                    #     print("*" * 25)
                else:
                    new_question_tokens.extend(
                        UTIL.load_from_pickle(
                            new_question_tokens_path.replace(
                                '@@', str(partition_counter))))
                partition_counter += 1
            question_embeddings = None
            for _p_counter in tqdm(range(0, partition_counter)):
                print("Partition {} is running for reading questions".format(
                    partition_counter))
                temp_question_embeddings = UTIL.load_embeddings(
                    contextualized_questions_with_token_file_path.replace(
                        "@@", str(_p_counter)))
                if question_embeddings is None:
                    question_embeddings = temp_question_embeddings
                else:
                    question_embeddings = np.vstack(
                        (question_embeddings, temp_question_embeddings))
            print('MAIN embeddings shape: {}'.format(
                question_embeddings.shape))
            UTIL.dump_embeddings(
                question_embeddings,
                contextualized_questions_with_token_file_path.replace(
                    '@@', ''))
            print('MAIN embeddings are dumped')
        else:
            print("It is running for writing questions")
            jsons, checkpoint, partition_shape = process_documents(
                0, None, checkpoint, jsons, tokenized_questions_size,
                file_names, questions_folder_path, ind_layer, conc_layers,
                new_question_tokens,
                contextualized_questions_with_token_file_path.replace(
                    '@@', ''))
        UTIL.save_as_pickle(new_question_tokens,
                            new_question_tokens_path.replace('@@', ''))

    ## ***************************************************************************************************************
    ## ***************************************************************************************************************
    ## ***************************************************************************************************************
    new_paragraph_tokens = []
    is_paragraphs_already_processed = False
    if os.path.exists(
            contextualized_paragraphs_with_token_file_path.replace('@@', '')):
        is_paragraphs_already_processed = True
    else:
        file_names = get_file_names(paragraphs_folder_path, file_name_splitter,
                                    bert_extension)
        tokenized_paragraphs_size = test_size[
            1] if test_size is not None else len(tokenized_paragraphs)
        checkpoint = None
        jsons = None
        if args.is_parititioned is True:
            partition_counter = 0
            for _p_counter in tqdm(
                    range(0, tokenized_paragraphs_size,
                          args.document_partition_size)):
                print("Partition {} is running for writing paragraphs".format(
                    partition_counter))
                if not os.path.exists(
                        contextualized_paragraphs_with_token_file_path.replace(
                            '@@', str(partition_counter))):
                    #tokens_size_before_partition = sum([len(sentence) for sentence in new_paragraph_tokens])
                    jsons, checkpoint, partition_shape = process_documents(
                        _p_counter, args.document_partition_size, checkpoint,
                        jsons, tokenized_paragraphs_size, file_names,
                        paragraphs_folder_path, ind_layer, conc_layers,
                        new_paragraph_tokens,
                        contextualized_paragraphs_with_token_file_path.replace(
                            '@@', str(partition_counter)),
                        new_paragraph_tokens_path.replace(
                            '@@', str(partition_counter)))
                else:
                    new_paragraph_tokens.extend(
                        UTIL.load_from_pickle(
                            new_paragraph_tokens_path.replace(
                                '@@', str(partition_counter))))
                partition_counter += 1
                # TOKEN DEBUGGING
                # tokens_size_after_partition = sum([len(sentence) for sentence in new_paragraph_tokens])
                # if tokens_size_after_partition - tokens_size_before_partition != partition_shape[0]:
                #     print("*" * 25)
                #     print("Tokens problem in partition {}, before: {}, after: {}, partition_shape:{}".format(_p_counter, tokens_size_before_partition, tokens_size_after_partition, partition_shape[0]))
                #     print("*" * 25)
            paragraph_embeddings = None
            for _p_counter in tqdm(range(0, partition_counter)):
                print("Partition {} is running for reading paragraphs".format(
                    partition_counter))
                temp_paragraph_embeddings = UTIL.load_embeddings(
                    contextualized_paragraphs_with_token_file_path.replace(
                        "@@", str(_p_counter)))
                if paragraph_embeddings is None:
                    paragraph_embeddings = temp_paragraph_embeddings
                else:
                    paragraph_embeddings = np.vstack(
                        (paragraph_embeddings, temp_paragraph_embeddings))
            print('MAIN embeddings shape: {}'.format(
                paragraph_embeddings.shape))
            UTIL.dump_embeddings(
                paragraph_embeddings,
                contextualized_paragraphs_with_token_file_path.replace(
                    '@@', ''))
            print('MAIN embeddings are dumped')

        else:
            print("It is running for writing paragraphs")
            jsons, checkpoint, partition_shape = process_documents(
                0, None, checkpoint, jsons, tokenized_paragraphs_size,
                file_names, paragraphs_folder_path, ind_layer, conc_layers,
                new_paragraph_tokens,
                contextualized_paragraphs_with_token_file_path.replace(
                    '@@', ''))
        UTIL.save_as_pickle(new_paragraph_tokens,
                            new_paragraph_tokens_path.replace('@@', ''))

    if is_questions_already_processed:
        question_embeddings = UTIL.load_embeddings(
            contextualized_questions_with_token_file_path.replace('@@', ''))
        new_question_tokens = UTIL.load_from_pickle(
            new_question_tokens_path.replace('@@', ''))
    if is_paragraphs_already_processed:
        paragraph_embeddings = UTIL.load_embeddings(
            contextualized_paragraphs_with_token_file_path.replace('@@', ''))
        new_paragraph_tokens = UTIL.load_from_pickle(
            new_paragraph_tokens_path.replace('@@', ''))

    if os.path.exists(contextualized_document_embeddings_with_token_path):
        if args.is_parititioned is not True:
            document_embeddings = UTIL.load_embeddings(
                contextualized_document_embeddings_with_token_path)
    else:
        document_embeddings = np.vstack(
            (question_embeddings, paragraph_embeddings))
        UTIL.dump_embeddings(
            document_embeddings,
            contextualized_document_embeddings_with_token_path)
    del question_embeddings
    del paragraph_embeddings
    print('All Documents are dumped')
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    END: LOAD EMBEDDINGS
    ******************************************************************************************************************
    ******************************************************************************************************************
    """

    document_embedding_guideline, corpus_as_tokens = UTIL.generate_document_embedding_guideline(
        new_question_tokens, new_paragraph_tokens)

    paragraphs_nontokenized = [
        " ".join(context) for context in new_paragraph_tokens
    ]
    questions_nontokenized = [
        " ".join(context) for context in new_question_tokens
    ]
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    START: IDF
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    if args.is_inject_idf:
        print('IDF is going to be calculated')
        # vocab = []
        # for sentence in new_question_tokens + new_paragraph_tokens:
        #     for word in sentence:
        #         vocab.append(word)
        # vocab = set(vocab)
        # UTIL.dump_vocab(vocab_path, vocab)
        #tokenize = wordpiece.FullTokenizer(vocab_file=vocab_path, do_lower_case=False)
        nlp = spacy.blank("en")
        tokenize = lambda doc: [token.text for token in nlp(doc)]
        start = datetime.datetime.now()
        token2idfweight, idf_vec = UTIL.transform_to_idf_weigths(
            new_question_tokens, new_paragraph_tokens, tokenize,
            questions_nontokenized, paragraphs_nontokenized)
        if args.is_parititioned is True:
            with h5py.File(contextualized_document_embeddings_with_token_path,
                           'r') as fin:
                partition_counter = 0
                for partition in range(0, idf_vec.shape[0],
                                       args.token_partition_size):
                    partition_counter += 1
                    temp_doc_embeddings = fin['embeddings'][
                        partition:partition + args.token_partition_size, :]
                    temp_idf_vec = idf_vec[
                        partition:partition +
                        args.token_partition_size, :].reshape(-1, 1)
                    #temp_doc_embeddings = temp_doc_embeddings[:,0,:]
                    #temp_doc_embeddings = preprocessing.normalize(temp_doc_embeddings, norm='l2')
                    temp_weighted_token_embeddings = np.multiply(
                        temp_idf_vec, temp_doc_embeddings)
                    UTIL.dump_embeddings(
                        temp_weighted_token_embeddings,
                        calculated_token_embeddings_file_path.replace(
                            '@@', str(partition_counter)).replace('##', 'idf'))
                    print(
                        "Partition {} is completed and processed {} - {} tokens"
                        .format(partition_counter, partition,
                                partition + args.token_partition_size))
        else:
            idf_vec = idf_vec.reshape(-1, 1)
            weighted_token_embeddings = np.multiply(idf_vec,
                                                    document_embeddings)
        del idf_vec
        del token2idfweight
        end = datetime.datetime.now()
        print('IDF calculation finished in {} minutes'.format(
            (end - start).seconds / 60))
    else:
        print('IDF is skipped')
        _type = 'only'
        if args.is_parititioned is True:
            with h5py.File(contextualized_document_embeddings_with_token_path,
                           'r') as fin:
                partition_counter = 0
                for partition in range(0, len(corpus_as_tokens),
                                       args.token_partition_size):
                    partition_counter += 1
                    temp_doc_embeddings = fin['embeddings'][
                        partition:partition + args.token_partition_size, :]
                    #temp_doc_embeddings = temp_doc_embeddings[:, 0, :]
                    #temp_doc_embeddings = preprocessing.normalize(temp_doc_embeddings, norm='l2')
                    UTIL.dump_embeddings(
                        temp_doc_embeddings,
                        calculated_token_embeddings_file_path.replace(
                            '@@', str(partition_counter)).replace('##', ''))
                    print(
                        "Partition {} is completed and processed {} - {} tokens"
                        .format(partition_counter, partition,
                                partition + args.token_partition_size))
        else:
            weighted_token_embeddings = document_embeddings
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    END: IDF
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    START: WEIGHTS ARE APPLIED TO TOKEN EMBEDDINGS
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    del document_embeddings

    #LOAD PARTIAL FILES AFTER CLEANING THE DOCUMENT EMBEDDINGS.
    if args.is_parititioned is True:
        weighted_token_embeddings = None
        for partition in range(1, partition_counter + 1):
            temp_weighted_token_embeddings = UTIL.load_embeddings(
                calculated_token_embeddings_file_path.replace(
                    '@@', str(partition)).replace(
                        '##', 'idf' if args.is_inject_idf else ''))
            if weighted_token_embeddings is None:
                weighted_token_embeddings = temp_weighted_token_embeddings
            else:
                weighted_token_embeddings = np.vstack(
                    (weighted_token_embeddings,
                     temp_weighted_token_embeddings))
            print("Partition {} is loaded".format(partition))

    WM = None  #np.array(args['weights_arguments']).reshape((1, len(args['weights_arguments']), 1))
    questions_embeddings, paragraphs_embeddings = UTIL.token_to_document_embeddings(
        new_question_tokens, new_paragraph_tokens, weighted_token_embeddings,
        document_embedding_guideline, WM)

    if args.is_inject_idf:
        questions_elmo_embeddings = np.reshape(
            questions_embeddings,
            (questions_embeddings.shape[0], questions_embeddings.shape[1]))
        UTIL.dump_embeddings(
            questions_elmo_embeddings,
            final_questions_file_path.replace('@@', 'with_idf'))
        paragraphs_elmo_embeddings = np.reshape(
            paragraphs_embeddings,
            (paragraphs_embeddings.shape[0], paragraphs_embeddings.shape[1]))
        UTIL.dump_embeddings(
            paragraphs_elmo_embeddings,
            final_paragraphs_file_path.replace('@@', 'with_idf'))
    else:
        questions_elmo_embeddings = np.reshape(
            questions_embeddings,
            (questions_embeddings.shape[0], questions_embeddings.shape[1]))
        UTIL.dump_embeddings(questions_elmo_embeddings,
                             final_questions_file_path.replace('@@', ''))
        paragraphs_elmo_embeddings = np.reshape(
            paragraphs_embeddings,
            (paragraphs_embeddings.shape[0], paragraphs_embeddings.shape[1]))
        UTIL.dump_embeddings(paragraphs_elmo_embeddings,
                             final_paragraphs_file_path.replace('@@', ''))

    print('Weights are applied')
    """
Example 8
def dump_splitted_train_test(question_embeddings, paragraph_embeddings, labels, prefix, path, partition_size):
    UTIL.dump_embeddings(labels['q'],
                         os.path.join(path, prefix + "_question_idx.hdf5"))
    UTIL.dump_embeddings(labels['p'],
                         os.path.join(path, prefix + "_question_labels.hdf5"), dtype='int32')
    range_size = math.ceil(question_embeddings.shape[0] / partition_size)
    for part in range(0, range_size):
        pair_paragraph_embeddings = None
        start = part * partition_size
        end = start + partition_size
        # Enumerate from `start` so labels['p'] is indexed globally rather
        # than restarting at 0 for every partition.
        for q_indx, q_embed in tqdm(enumerate(question_embeddings[start:end], start)):
            if pair_paragraph_embeddings is None:
                pair_paragraph_embeddings = paragraph_embeddings[labels['p'][q_indx]]
            else:
                pair_paragraph_embeddings = np.vstack(
                    (pair_paragraph_embeddings, paragraph_embeddings[labels['p'][q_indx]]))
        UTIL.dump_embeddings(pair_paragraph_embeddings, os.path.join(path, prefix + "_paired_paragraph_embeddings_part_{}.hdf5".format(part)))

    pair_paragraph_embeddings = None
    for part in range(0, range_size):
        embeddings = UTIL.load_embeddings(os.path.join(path, prefix + "_paired_paragraph_embeddings_part_{}.hdf5".format(part)))
        if pair_paragraph_embeddings is None:
            pair_paragraph_embeddings = embeddings
        else:
            pair_paragraph_embeddings = np.vstack(
                (pair_paragraph_embeddings,embeddings))

    UTIL.dump_embeddings(pair_paragraph_embeddings,
                         os.path.join(path, prefix + "_paired_paragraph_embeddings.hdf5"))

    for part in range(0, range_size):
        os.remove(os.path.join(path, prefix + "_paired_paragraph_embeddings_part_{}.hdf5".format(part)))

    UTIL.dump_embeddings(question_embeddings,
                         os.path.join(path, prefix + '_question_embeddings.hdf5'))
    UTIL.dump_embeddings(paragraph_embeddings, os.path.join(path, prefix + "_all_paragraph_embeddings.hdf5"))
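
# Hypothetical usage of dump_splitted_train_test; the array shapes and label
# layout are assumptions for illustration only.
import numpy as np

labels = {'q': np.arange(10), 'p': np.random.randint(0, 4, size=10)}
question_embeddings = np.random.rand(10, 8)   # 10 questions, dim 8
paragraph_embeddings = np.random.rand(4, 8)   # 4 paragraphs, dim 8
dump_splitted_train_test(question_embeddings, paragraph_embeddings, labels,
                         prefix='train', path='/tmp', partition_size=3)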
Example 9
        token2idfweight, idf_vec = UTIL.transform_to_idf_weigths(tokenized_questions,
                                                                 tokenized_paragraphs,
                                                                 tokenize,
                                                                 questions_nontokenized,
                                                                 paragraphs_nontokenized)

        mean_glove_with_idf_embeddings = np.array([
            np.mean([glove_word_weights[w] * token2idfweight[w]
                     for w in words if w in glove_word_weights] or
                    [np.zeros(dim)], axis=0)
            for words in tokenized_questions + tokenized_paragraphs
        ])
        # UTIL.dump_embeddings(mean_glove_with_idf_embeddings,
        #                 os.path.join(datadir, 'dev_mean_glove_with_idf_embeddings.hdf5'))
        question_embeddings = mean_glove_with_idf_embeddings[0:len(tokenized_questions), :]
        UTIL.dump_embeddings(question_embeddings, os.path.join(datadir, '{}_glove_questions_embeddings_with_idf.hdf5'.format(dataset_type)))
        paragraphs_embeddings = mean_glove_with_idf_embeddings[len(tokenized_questions):, :]
        UTIL.dump_embeddings(paragraphs_embeddings, os.path.join(datadir, '{}_glove_paragraphs_embeddings_with_idf.hdf5'.format(dataset_type)))
        end = datetime.datetime.now()
        print('IDF calculation finished in {} minutes'.format((end - start).seconds / 60))
    else:
        print('IDF is skipped')
        mean_glove_embeddings = np.array([
            np.mean([glove_word_weights[w] for w in words if w in glove_word_weights]
                    or [np.zeros(dim)], axis=0)
            for words in tokenized_questions + tokenized_paragraphs
        ])
        #UTIL.dump_embeddings(mean_glove_embeddings, os.path.join(datadir, 'dev_mean_glove_embeddings.hdf5'))
        question_embeddings = mean_glove_embeddings[0:len(tokenized_questions), :]
        UTIL.dump_embeddings(question_embeddings, os.path.join(datadir, '{}_glove_questions_embeddings.hdf5'.format(dataset_type)))
        paragraphs_embeddings = mean_glove_embeddings[len(tokenized_questions):, :]
        UTIL.dump_embeddings(paragraphs_embeddings, os.path.join(datadir, '{}_glove_paragraphs_embeddings.hdf5'.format(dataset_type)))

        if len(tokenized_questions[question_indx]
               ) != question_embedding.shape[0]:
            print(30 * '*')
            print('********** Question {} has a token/embedding mismatch **********'.format(
                question_indx))
            print(30 * '*')

        print(
            'Question {} processed: {} tokens vs. embedding rows {} (match: {})'
            .format(
                question_indx, len(tokenized_questions[question_indx]),
                question_embedding.shape[0],
                len(tokenized_questions[question_indx]) ==
                question_embedding.shape[0]))
    print('Question_embeddings shape: {}'.format(question_embeddings.shape))
    UTIL.dump_embeddings(
        question_embeddings,
        os.path.join(root_folder_path,
                     args['contextualized_questions_embeddings_with_token']))
    print('Questions are dumped')

paragraphs_folder_path = root_folder_path if args[
    "embedding_paragraphs_path"] is None else os.path.join(
        root_folder_path, args["embedding_paragraphs_path"])
paragraph_embeddings = None
if os.path.exists(
        os.path.join(root_folder_path,
                     args['contextualized_paragraphs_embeddings_with_token'])):
    print('contextualized_paragraphs_embeddings_with_token found')
    paragraph_embeddings = UTIL.load_embeddings(
        os.path.join(root_folder_path,
                     args['contextualized_paragraphs_embeddings_with_token']))
else: