# Module-level imports assumed by the functions below (this file is an
# excerpt; InputExample, TrainingInstance, create_instances_from_document,
# and merge_subtokens are defined elsewhere in the project).
import os
import pickle
import re

import numpy as np
import pandas as pd
import tensorflow as tf

import tokenization


def get_sentence_examples(self, questions):
    for index, data in enumerate(questions):
        guid = 'test-%d' % index
        text_a = tokenization.convert_to_unicode(str(data[0]))
        text_b = tokenization.convert_to_unicode(str(data[1]))
        label = str(0)  # dummy label for test-time examples
        yield InputExample(guid=guid, text_a=text_a, text_b=text_b,
                           label=label)

def _create_examples(self, lines, set_type):
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        text_a = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[0])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    print("%d %s data creation is finished!" % (len(examples), set_type))
    return examples

def get_dev_examples(self, data_dir):
    file_path = os.path.join(data_dir, 'dev.csv')
    dev_df = pd.read_csv(file_path, encoding='utf-8')
    dev_data = []
    for index, dev in enumerate(dev_df.values):
        guid = 'dev-%d' % index
        text_a = tokenization.convert_to_unicode(str(dev[0]))
        text_b = tokenization.convert_to_unicode(str(dev[1]))
        label = str(dev[2])
        dev_data.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b,
                         label=label))
    return dev_data

def read_examples(input_file):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:
            line = tokenization.convert_to_unicode(reader.readline())
            if not line:
                break
            line = line.strip()
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line)
            if m is None:
                text_a = line
            else:
                text_a = m.group(1)
                text_b = m.group(2)
            examples.append(
                InputExample(unique_id=unique_id, text_a=text_a,
                             text_b=text_b))
            unique_id += 1
    print("num_examples", len(examples))
    return examples

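# A minimal usage sketch for read_examples. "sentences.txt" is a
# hypothetical file name; each of its lines holds either a single text, or
# a sentence pair separated by " ||| ":
#
#   how do I install zsh ||| sudo apt-get install zsh
#   check out xrandr as the control mechanism for video
#
# examples = read_examples("sentences.txt")
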
def get_sentence_statistics(self):
    """Computes token-length statistics for dialog contexts and responses."""
    for (i, dialog_data) in enumerate(self.data_l):
        text_dialog_context = tokenization.convert_to_unicode(dialog_data[0])
        text_response = tokenization.convert_to_unicode(dialog_data[1])

        tokens_dialog_context = self.tokenizer.tokenize(text_dialog_context)
        tokens_response = self.tokenizer.tokenize(text_response)
        tok_dialog_len = len(tokens_dialog_context)
        tok_response_len = len(tokens_response)

        # Accumulate running sums; they are averaged after the loop.
        self.avg_dialog_context += tok_dialog_len
        self.avg_response += tok_response_len

        # Track the extremes seen so far.
        if self.max_dialog_context < tok_dialog_len:
            self.max_dialog_context = tok_dialog_len
        if self.max_response < tok_response_len:
            self.max_response = tok_response_len
        if self.min_dialog_context > tok_dialog_len:
            self.min_dialog_context = tok_dialog_len
        if self.min_response > tok_response_len:
            self.min_response = tok_response_len

        # Report intermediate statistics every 1000 examples.
        if (i + 1) % 1000 == 0:
            print(i + 1, "th text stat info")
            print("avg_dialog_context", self.avg_dialog_context / (i + 1))
            print("avg_response", self.avg_response / (i + 1))
            print("max_dialog_context", self.max_dialog_context)
            print("max_response", self.max_response)
            print("min_dialog_context", self.min_dialog_context)
            print("min_response", self.min_response)
            print('-' * 200)

    self.avg_dialog_context /= len(self.data_l)
    self.avg_response /= len(self.data_l)

def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences,
    #     not entire paragraphs or arbitrary spans of text, because the
    #     sentence boundaries are used for the "next sentence prediction"
    #     task.
    # (2) Blank lines between documents. Document boundaries are needed so
    #     that the "next sentence prediction" task doesn't span documents.
    index = 0
    for input_file in input_files:
        with tf.gfile.GFile(input_file, "r") as reader:
            while True:
                if (index + 1) % 100000 == 0:
                    tf.logging.info("%d lines have been tokenized!" %
                                    (index + 1))
                index += 1
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()

                # Empty lines are used as document delimiters.
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents.
    all_documents = [x for x in all_documents if x]
    tf.logging.info("tokenization finished: %d documents in total" %
                    len(all_documents))
    rng.shuffle(all_documents)

    # Cache the tokenized documents so later runs can skip tokenization.
    # NOTE: the cache path is hardcoded.
    tokenized_data_path = "/mnt/raid5/taesun/data/ResSel/advising/tokenized_train_data_320_eot.pkl"
    with open(tokenized_data_path, "wb") as document_f_handle:
        pickle.dump(all_documents, document_f_handle)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    # Duplicate the corpus `dupe_factor` times so each document yields
    # instances with different masks and segment splits.
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, max_seq_length,
                    short_seq_prob, masked_lm_prob, max_predictions_per_seq,
                    vocab_words, rng))

    rng.shuffle(instances)
    return instances

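# A minimal usage sketch for create_training_instances, assuming BERT's
# FullTokenizer; "vocab.txt" and "corpus.txt" are hypothetical paths, and
# the hyperparameter values are illustrative only.
#
# import random
# tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt",
#                                        do_lower_case=True)
# rng = random.Random(12345)
# instances = create_training_instances(
#     input_files=["corpus.txt"], tokenizer=tokenizer, max_seq_length=320,
#     dupe_factor=10, short_seq_prob=0.1, masked_lm_prob=0.15,
#     max_predictions_per_seq=20, rng=rng)
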
def _create_seperate_examples(self, inputs, set_type):
    dialog_examples = []
    response_examples = []
    for (i, dialog_data) in enumerate(inputs):
        guid = "%s-%d" % (set_type, i + 1)
        dialog = tokenization.convert_to_unicode(dialog_data[0])
        response = tokenization.convert_to_unicode(dialog_data[1])
        label = tokenization.convert_to_unicode(dialog_data[2])
        dialog_examples.append(
            InputExample(guid=guid, text_a=dialog, label=label))
        response_examples.append(
            InputExample(guid=guid, text_a=response, label=label))
    print("%s data creation is finished! %d" %
          (set_type, len(dialog_examples)))
    return dialog_examples, response_examples

def _create_examples(self, inputs, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, dialog_data) in enumerate(inputs):
        guid = "%s-%d" % (set_type, i + 1)
        text_a = tokenization.convert_to_unicode(dialog_data[0])
        text_b = tokenization.convert_to_unicode(dialog_data[1])
        label = tokenization.convert_to_unicode(str(dialog_data[2]))
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b,
                         label=label))
    print("%s data creation is finished! %d" % (set_type, len(examples)))
    return examples

def _create_knowledge_examples(self, inputs, set_type):
    examples = []
    for (i, dialog_data) in enumerate(inputs):
        guid = "%s-%d" % (set_type, i + 1)
        text_a = tokenization.convert_to_unicode(dialog_data[3])
        examples.append(InputExample(guid=guid, text_a=text_a))
    print("%s data creation is finished! %d" % (set_type, len(examples)))
    return examples

def _create_knowledge_separate_examples(self, inputs, set_type):
    examples = []
    for (i, dialog_data) in enumerate(inputs):
        # One InputExample per (knowledge, response) pair, grouped by dialog.
        separate_examples = []
        text_b = tokenization.convert_to_unicode(dialog_data[1])
        for j, (knowledge, knowledge_label) in enumerate(
                zip(dialog_data[3], dialog_data[4])):
            guid = "%s-%d-%d" % (set_type, i + 1, j + 1)
            text_a = tokenization.convert_to_unicode(knowledge)
            label = tokenization.convert_to_unicode(str(knowledge_label))
            separate_examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b,
                             label=label))
        examples.append(separate_examples)
    print("%s data creation is finished! %d" % (set_type, len(examples)))
    return examples

def _create_similar_examples(self, inputs, set_type):
    examples = []
    for (i, dialog_data) in enumerate(inputs):
        sub_examples = []
        for sub_i, each_similar_dialog in enumerate(dialog_data[3]):
            guid = "%s-%d-%d" % (set_type, i + 1, sub_i + 1)
            text_a = tokenization.convert_to_unicode(each_similar_dialog)
            sub_examples.append(InputExample(guid=guid, text_a=text_a))
        examples.append(sub_examples)
    print("%s data creation is finished! %d" % (set_type, len(examples)))
    return examples

def _create_knowledge_examples(ubuntu_manual_dict):
    examples = []
    knowledge_name_list = []
    for i, knowledge_name in enumerate(ubuntu_manual_dict.keys()):
        guid = "knowledge-%d" % (i + 1)
        text_a = tokenization.convert_to_unicode(
            knowledge_name + " : " + ubuntu_manual_dict[knowledge_name])
        knowledge_name_list.append(knowledge_name)
        examples.append(InputExample(guid=guid, text_a=text_a))
    print("knowledge description data creation is finished! %d" %
          len(examples))
    return examples, knowledge_name_list

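# A minimal usage sketch for _create_knowledge_examples; the man-page
# dictionary below is a made-up stand-in for the real Ubuntu manual data.
#
# ubuntu_manual_dict = {
#     "zsh": "a shell designed for interactive use",
#     "xrandr": "a command-line interface to the RandR extension",
# }
# examples, names = _create_knowledge_examples(ubuntu_manual_dict)
# # Each example's text_a reads "name : description", e.g.
# # "zsh : a shell designed for interactive use".
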
def _create_train_data_examples(ubuntu_data):
    examples = []
    count = 0
    print("total length of ubuntu dialog data: %d" % len(ubuntu_data))
    for i, dialog_data in enumerate(ubuntu_data):
        # Keep only even-indexed entries.
        if i % 2 != 0:
            continue
        count += 1
        guid = "train_dialog-%d" % count
        text_a = tokenization.convert_to_unicode(dialog_data[0])
        examples.append(InputExample(guid=guid, text_a=text_a))
    print("train dialog data creation is finished! %d" % len(examples))
    return examples

def quantity_analysis_run_evaluate(self):
    examples = [
        "have you tried the regular way to start with do you run ubuntu with gnome",
        "probably ca n't do that on the xbox can you get a command prompt on it at all try rhythmbox",
        " afroken check out xrandr as the control mechanism for video ..",
        "certainly sudo apt-get install zsh zsh-doc zsh-lovers"
    ]
    self._logger.info("Evaluation batch iteration per epoch is %d" %
                      len(examples))
    if self.train_setup_vars["do_evaluate"]:
        print("Evaluation batch iteration per epoch is %d" % len(examples))

    dialog_examples = []
    for (i, dialog_data) in enumerate(examples):
        text_a = tokenization.convert_to_unicode(dialog_data)
        dialog_examples.append(InputExample(guid=i, text_a=text_a))

    [input_ids, input_mask, segment_ids, (text_a_lengths, text_b_lengths),
     label_ids, position_ids] = \
        self.processor.get_analysis_bert_data(dialog_examples)

    # Single forward pass; dropout is disabled (keep probability 1.0) at
    # evaluation time.
    feed_dict = {
        self.input_ids_ph: input_ids,
        self.input_mask_ph: input_mask,
        self.segment_ids_ph: segment_ids,
        self.dropout_keep_prob_ph: 1.0,
        self.dialog_position_ids_ph: position_ids,
        self.dialog_len_ph: text_a_lengths,
        self.response_len_ph: text_b_lengths,
        self.label_ids_ph: label_ids,
    }

    sequence_out_val = self.sess.run([self.seq_outputs],
                                     feed_dict=feed_dict)  # batch, 768 * 4
    print(np.array(sequence_out_val).shape)
    seq_merged_embeddings = merge_subtokens(examples, self.tokenizer,
                                            sequence_out_val[0])