Example #1
def sentence_to_idx(sentence, embedding):
    """Cut a sentence with jieba and map each word and its characters to vocab ids."""
    chars_list = []
    # JiebaTokenizer expects a Vocab, which the embedding exposes as `embedding.vocab`.
    tokenizer = JiebaTokenizer(embedding.vocab)
    word_list = tokenizer.cut(sentence)
    for word in word_list:
        word_idx = get_idx_from_word(word, embedding.vocab.token_to_idx,
                                     embedding.vocab.unk_token)
        char_idx_list = [
            get_idx_from_word(ch, embedding.vocab.token_to_idx,
                              embedding.vocab.unk_token) for ch in word
        ]
        chars_list.append({word_idx: char_idx_list})
    return chars_list
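A minimal driver for the helper above might look like the following sketch. It assumes PaddleNLP's TokenEmbedding with one of its pretrained Chinese embedding names, and that the imports used by sentence_to_idx are already in scope.

from paddlenlp.embeddings import TokenEmbedding

# Hypothetical usage: load a pretrained Chinese embedding and convert one sentence.
embedding = TokenEmbedding("w2v.baidu_encyclopedia.target.word-word.dim300")
print(sentence_to_idx("今天天气很好", embedding))
# Each element maps a word id to the ids of that word's characters,
# e.g. [{word_id: [char_id, char_id]}, ...]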
Example #2
def get_tokenizer(self, *args, **kwargs):
    """
    Get the tokenizer of this embedding module.
    """
    if self.embedding_name.endswith('.en'):  # English
        raise NotImplementedError  # TODO: (chenxiaojie) add tokenizer of English embedding
    else:  # Chinese
        return JiebaTokenizer(self.vocab)
Example #3
def process_data(loadfile, savefile, vocab):
    """Tokenize a SQuAD-style JSON file and map it to word- and char-level vocab ids."""
    tokenizer = JiebaTokenizer(vocab)
    with open(loadfile, mode="r", encoding="utf8") as rfp:
        input_data = json.load(rfp)["data"]
    new_examples = []
    logger.info("Processing dataset %s." % loadfile)
    for entry in input_data:
        for paragraph in tqdm(entry["paragraphs"], desc="process"):
            title = paragraph["title"].strip()
            context = paragraph["context"].strip()
            for qa in paragraph["qas"]:
                qas_id = qa['id']
                question = qa["question"].strip()
                tmp_dict = {}
                tmp_dict['qas_id'] = qas_id

                tmp_dict['question_w'] = word_to_idx(tokenizer.cut(question),
                                                     vocab)
                tmp_dict['context_w'] = word_to_idx(tokenizer.cut(context),
                                                    vocab)
                tmp_dict['title_w'] = word_to_idx(tokenizer.cut(title), vocab)
                tmp_dict['question_c'] = chars_to_idx(question, vocab)
                tmp_dict['context_c'] = chars_to_idx(context, vocab)
                tmp_dict['title_c'] = chars_to_idx(title, vocab)
                tmp_dict['is_impossible'] = 1 if qa["is_impossible"] else 0
                length = len(tmp_dict['context_c'])
                for item in qa['answers']:
                    answer_start = int(item["answer_start"])
                    answer = item["text"].strip()
                    if answer_start == -1:
                        label = random.randint(0, length)
                        tmp_dict['start_positions'] = label
                        tmp_dict["end_positions"] = label
                    else:
                        # Start/end character index of the answer in the text.
                        start_char = answer_start
                        end_char = start_char + len(answer)
                        tmp_dict["start_positions"] = start_char
                        tmp_dict["end_positions"] = end_char
                    # Copy so each answer yields an independent example;
                    # tmp_dict is reused across the answers of this question.
                    new_examples.append(dict(tmp_dict))
    with open(savefile, mode="w", encoding="utf-8") as wfp:
        json.dump(new_examples, wfp)
    logger.info("Saved the processed dataset %s." % savefile)
    return new_examples
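The word_to_idx and chars_to_idx helpers are not shown in this example. A minimal sketch of what they might look like, assuming a PaddleNLP Vocab that exposes token_to_idx and unk_token as in Example #1:

def word_to_idx(words, vocab):
    # Map each jieba token to its vocab id, falling back to the UNK id.
    unk_id = vocab.token_to_idx[vocab.unk_token]
    return [vocab.token_to_idx.get(word, unk_id) for word in words]


def chars_to_idx(text, vocab):
    # Map every character of the raw text to its vocab id.
    return word_to_idx(list(text), vocab)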
Example #4
class TestJiebaTokenizer(CpuCommonTest):
    def setUp(self):
        test_data_file = create_test_data(__file__)
        self.vocab = Vocab.load_vocabulary(test_data_file, unk_token='[UNK]')
        self.tokenizer = JiebaTokenizer(self.vocab)

    def test_jieba(self):
        text = "一万一"
        token_arr = self.tokenizer.cut(text)
        idx_arr = self.tokenizer.encode(text)
        for i, token in enumerate(token_arr):
            self.check_output_equal(self.vocab(token), idx_arr[i])

        # Cut with the underlying jieba tokenizer directly (cut_all=False, HMM=True)
        # and check that it matches JiebaTokenizer.cut.
        jieba_tokenizer = self.tokenizer.get_tokenizer()
        jieba_token_arr = jieba_tokenizer.lcut(text, False, True)
        self.check_output_equal(token_arr, jieba_token_arr)

    def test_unk(self):
        text = "中国"
        idx_arr = self.tokenizer.encode(text)
        self.check_output_equal(self.vocab[self.vocab.unk_token] in idx_arr,
                                True)
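Outside the test harness, the same cut/encode relationship can be checked with a small hand-built vocabulary. This is only a sketch; the toy token list passed to Vocab.build_vocab is an assumption.

from paddlenlp.data import JiebaTokenizer, Vocab

# Build a tiny vocabulary and tokenize with it.
vocab = Vocab.build_vocab([["一万", "一", "万"]], unk_token='[UNK]')
tokenizer = JiebaTokenizer(vocab)

tokens = tokenizer.cut("一万一")
ids = tokenizer.encode("一万一")
# encode() is equivalent to cutting and then looking each token up in the vocab.
assert ids == [vocab(token) for token in tokens]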
Example #5
            idx = np.argmax(probs, axis=1)
            idx = idx.tolist()
            labels = [label_map[i] for i in idx]
            results.extend(labels)
        return results


if __name__ == "__main__":
    # Create the predictor used to run inference.
    predictor = Predictor(args.model_file, args.params_file, args.device,
                          args.max_seq_length)

    # First pre-process the prediction data, then run prediction.
    data = [
        '非常不错,服务很好,位于市中心区,交通方便,不过价格也高!',
        '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片',
        '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。',
    ]
    vocab = Vocab.from_json(args.vocab_path)
    tokenizer = JiebaTokenizer(vocab)
    label_map = {0: 'negative', 1: 'positive'}

    results = predictor.predict(data,
                                tokenizer,
                                label_map,
                                batch_size=args.batch_size,
                                network=args.network)
    for idx, text in enumerate(data):
        print('Data: {} \t Label: {}'.format(text, results[idx]))
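The body of Predictor.predict is truncated above. The preprocessing it performs with the JiebaTokenizer presumably resembles the following sketch; the function name, padding value, and sequence-length handling are assumptions, not the example's actual implementation.

from paddlenlp.data import Pad

def preprocess(texts, tokenizer, pad_token_id=0, max_seq_length=128):
    # Hypothetical preprocessing: jieba-cut each text, map tokens to vocab ids,
    # truncate to max_seq_length, and pad the batch to a uniform length.
    ids = [tokenizer.encode(text)[:max_seq_length] for text in texts]
    seq_lens = [len(x) for x in ids]
    padded_ids = Pad(axis=0, pad_val=pad_token_id)(ids)
    return padded_ids, seq_lens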
Example #6
def set_tokenizer(vocab):
    # Initialize the module-level tokenizer once a vocabulary is available.
    global tokenizer
    if vocab is not None:
        tokenizer = JiebaTokenizer(vocab=vocab)
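A sketch of how the module-level tokenizer might then be initialized and used; the vocabulary file path is a placeholder, and Vocab.from_json mirrors its use in Example #5.

from paddlenlp.data import JiebaTokenizer, Vocab

tokenizer = None  # module-level handle that set_tokenizer populates

vocab = Vocab.from_json("vocab.json")  # placeholder path
set_tokenizer(vocab)

if tokenizer is not None:
    print(tokenizer.cut("今天天气很好"))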