Example #1
def main(model_dir,
         gold_corpus_file,
         predicted_corpus_file,
         install_dependencies=True):
    gold_corpus = Corpus.read_from_file(gold_corpus_file)

    dm_model = dm.load(model_dir, install_dependencies=install_dependencies)

    docs = []
    docs_failed = []

    for gold_doc in gold_corpus:
        text = gold_doc.text
        id_ = gold_doc.id

        request = dm.make_request(query=[text])
        response = dm_model.inference(request)
        result = response.data[0]

        doc = result.sequence
        doc.id = id_

        if result.is_failed:
            doc.extra_attr["is_failed"] = True
            doc.extra_attr["exec_msg"] = result.exec_msg
            docs_failed.append(doc)
        else:
            docs.append(doc)

    predicted_corpus = Corpus(docs + docs_failed)
    predicted_corpus.write_to_file(predicted_corpus_file)

    print(len(docs_failed), len(docs))
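
A minimal invocation sketch for the example above; the paths are hypothetical placeholders, and dm and Corpus are assumed to come from the same library as the rest of these examples:

# hypothetical paths; point these at a real model directory and corpus files
main(model_dir="./models/ner",
     gold_corpus_file="./data/gold.conllx",
     predicted_corpus_file="./data/predicted.conllx",
     install_dependencies=False)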
Example #2
def main(
    corpus_file: str = None,
    datadir: str = None,
    test_size: Union[int, float] = 0.1,
    train_corpus: str = None,
    test_corpus: str = None,
):
    result_dir = Path(datadir)

    if corpus_file:
        corpus = Corpus.read_from_file(corpus_file)

        train, test = corpus.train_test_split(test_size=test_size)
        train.write_to_file(result_dir / "train.conllx")
        test.write_to_file(result_dir / "test.conllx")
    else:
        train = Corpus.read_from_file(train_corpus)
        test = Corpus.read_from_file(test_corpus)
        docs = list(train) + list(test)
        corpus = Corpus(docs)

        shutil.copy(train_corpus, datadir)
        shutil.copy(test_corpus, datadir)

    entities = {span.entity for doc in corpus for span in doc.span_set}

    with open(result_dir / "entity.txt", "wt") as fd:
        fd.write("\n".join(sorted(entities)))
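
A usage sketch for the split helper above; file and directory names are hypothetical:

# split one corpus file into train/test (10% test by default)
main(corpus_file="./data/all.conllx", datadir="./data/out")

# or reuse an existing split and only regenerate entity.txt
main(datadir="./data/out",
     train_corpus="./data/train.conllx",
     test_corpus="./data/test.conllx")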
Example #3
    def _inference(self, model, input_data: list):
        '''
        Run inference on the input data with the given model, i.e. use the
        model to tag entities in the input data, and save the result to the
        configured output path for manual review.

        :param model: the model to run inference with
        :param input_data: corpus data
        :return:
        '''
        output = []
        batch_size = 1
        batches = MtModelInference_Deliverable.generate_batch_input(
            input_data, batch_size)
        for batch in batches:
            request = Request(batch)
            response = model.inference(request)
            tmp_result = response['data'][0].sequence
            tmp_result.label = response['cls'][0][0]
            output.append(tmp_result)

        predict_result = Corpus(output)
        predict_result.write_to_file(
            os.path.join(self.config['output_filepath'],
                         'inference_out.conllx'))

        print('*** inference has been done, please check the result at the '
              'path below:')
        print('==>{}'.format(self.config['output_filepath']))

        return
Example #4
def create_new_corpus(data_dict, corpus_vol, **kwargs):
    new_corpus = Corpus([])
    sem_nums = kwargs['sem_nums']
    intents = data_dict.keys()
    if not corpus_vol:
        return
    elif sem_nums > len(intents):
        return
    else:
        for i in range(corpus_vol):
            intent_sam = set()
            while len(intent_sam) < sem_nums:
                intent_sam.add(random.choice(list(intents)))
            spanset = SpanSet()
            sentences = []
            start_position = 0
            for intent in list(intent_sam):
                txt = random.choice(list(data_dict[intent]))
                sentences.append(txt)
                # 'noise' segments contribute text but carry no entity span
                if intent != 'noise':
                    spanset.append(
                        Span(start=start_position,
                             end=start_position + len(txt),
                             entity=intent))
                start_position += len(txt)
            doc = Document(text=''.join(sentences),
                           label='|'.join(intent_sam),
                           span_set=spanset)
            new_corpus.append(doc)

    return new_corpus
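
A minimal sketch of driving create_new_corpus; the sample data_dict is hypothetical and maps each intent to a set of candidate texts:

import random  # create_new_corpus relies on random.choice

data_dict = {
    "weather": {"今天天气怎么样"},
    "music": {"放一首歌"},
    "noise": {"嗯嗯"},  # 'noise' text is mixed in without an entity span
}
# build 10 synthetic documents, each combining 2 distinct intents
corpus = create_new_corpus(data_dict, corpus_vol=10, sem_nums=2)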
Example #5
def test_union(datadir):
    corpus_one = Corpus.read_from_file(datadir / "corpus_one.conllx")
    corpus_two = Corpus.read_from_file(datadir / "corpus_two.conllx")

    result = len(corpus_one.union(corpus_two))
    expected = 4

    assert result == expected
Example #6
def test_symmetric_difference(datadir):
    corpus_one = Corpus.read_from_file(datadir / "corpus_one.conllx")
    corpus_two = Corpus.read_from_file(datadir / "corpus_two.conllx")

    result = len(corpus_one.symmetric_difference(corpus_two))
    expected = 2

    assert result == expected
Example #7
def test_set_document_compare_way(datadir):
    corpus_one = Corpus.read_from_file(datadir / "corpus_one.conllx")
    corpus_two = Corpus.read_from_file(datadir / "corpus_two.conllx")

    assert corpus_one != corpus_two

    corpus_one.set_document_compare_way(DocumentCompareWays.TEXT_ONLY)
    corpus_two.set_document_compare_way(DocumentCompareWays.TEXT_ONLY)

    assert corpus_one == corpus_two
Example #8
def test_contains__(datadir, tmpdir):
    corpus = Corpus()

    corpus.append(seq_one)
    corpus.append(seq_two)

    assert seq_one in corpus

    other_doc = Document("")

    assert other_doc not in corpus
Example #9
def test_corpus_diff(datadir):
    corpus_one = Corpus.read_from_file(datadir / "corpus_one.conllx")
    corpus_two = Corpus.read_from_file(datadir / "corpus_two.conllx")

    corpus_diff = CorpusDiff(corpus_one, corpus_two)
    corpus_diff_result = corpus_diff.compare()
    result = corpus_diff_result.render_to_md()
    expected = """# 3
- <D: None, F: None, S: None, I: None>    [王 小 明](PERSON) 在 [台 北 新 竹](GPE) 的 [清 华 大 学](ORG) 读 书 。
- <D: None, F: None, S: None, I: None>    [王 小 明](PERSON) 在 [台 北 新 竹](CITY) 的 [清 华 大 学](ORG) 读 书 。"""
    assert result == expected
Example #10
def two_add_link(map_data, file1, file2, link, domain):
    list1 = read_raw_data(file1)
    list2 = read_raw_data(file2)
    link_list = read_raw_data(link)
    len_all = max(len(list1), len(list2))
    path1 = os.path.basename(file1)
    path2 = os.path.basename(file2)
    doc_list = []
    dict_list = read_map(map_data)
    # generate as many samples as the larger of the two input lists
    for i in range(0, len_all):
        l1 = choice(list1)
        l2 = choice(list2)
        l3 = choice(link_list)
        l1end = line_end_remove(l1)
        l2end = line_end_remove(l2)

        l = l1 + l3 + l2
        doc1 = Document(l)
        doc1.domain = domain
        doc1.intent = "{}: {}||{}: {}".format(
            dict_list[path1[:-4]], path1[:-4],
            dict_list[path2[:-4]], path2[:-4])
        span_list1 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
            Span(start=len(l1 + l3),
                 end=len(l1 + l3 + l2end),
                 entity=path2[:-4]),
        ]
        doc1.entities = SpanSet(span_list1)
        doc_list.append(doc1)

        ll = l2 + l3 + l1
        doc2 = Document(ll)
        doc2.domain = domain
        doc2.intent = "{}: {}||{}: {}".format(
            dict_list[path1[:-4]], path1[:-4],
            dict_list[path2[:-4]], path2[:-4])
        span_list2 = [
            Span(start=0, end=len(l2end), entity=path2[:-4]),
            Span(start=len(l2 + l3),
                 end=len(l2 + l3 + l1end),
                 entity=path1[:-4]),
        ]
        doc2.entities = SpanSet(span_list2)
        doc_list.append(doc2)

    doc_list = list(set(doc_list))
    corpus = Corpus(doc_list)
    res_path = "./data/{}-{}-link.conllx".format(path1[:-4], path2[:-4])
    corpus.write_to_file(res_path)
Example #11
def main(gold: str, pred: str) -> dict:
    gold_corpus = Corpus.read_from_file(gold)
    pred_corpus = Corpus.read_from_file(pred)

    cm = CorpusMetric.create_from_corpus(gold_corpus, pred_corpus)

    return {
        "entity_f1_score": cm.entity_f1_score,
        "entity_accuracy_score": cm.entity_accuracy_score,
        "entity_precision_score": cm.entity_precision_score,
        "entity_recall_score": cm.entity_recall_score,
        "entity_classification_report": cm.entity_classification_report,
        "doc_entity_correctness": cm.doc_entity_correctness,
    }
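
A sketch of calling the metric helper above and printing the scores; the file names are hypothetical, and default=str hedges against report objects that are not directly JSON-serializable:

import json

metrics = main(gold="./data/gold.conllx", pred="./data/predicted.conllx")
print(json.dumps(metrics, indent=2, ensure_ascii=False, default=str))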
Example #12
def test_intersection(datadir):
    corpus = Corpus.read_from_file(datadir / "self.conllx")
    other_corpus = Corpus.read_from_file(datadir / "other.conllx")

    result = corpus.intersection(other_corpus)

    assert isinstance(result, Corpus)
    assert len(result) == 2

    second_corpus = Corpus.read_from_file(datadir / "second_other.conllx")
    result = corpus.intersection(other_corpus, second_corpus)

    assert isinstance(result, Corpus)
    assert len(result) == 1
Example #13
def group_by_domain(input_file, output_dir):
    output_dir = Path(output_dir)

    corpus = Corpus.read_from_file(input_file)

    domain_doc = collections.defaultdict(list)
    for doc in corpus:
        domain_doc[doc.domain].append(doc)

    for domain, doc_list in domain_doc.items():
        output_file = output_dir / "{}.conllx".format(domain)

        domain_corpus = Corpus(doc_list)
        domain_corpus.write_to_file(output_file)
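
A usage sketch with hypothetical paths; every domain found in the input corpus ends up in its own <domain>.conllx file:

group_by_domain("./data/all.conllx", "./data/by_domain")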
Example #14
def test_getitem__(datadir, tmpdir):
    corpus = Corpus()

    corpus.append(seq_one)
    corpus.append(seq_two)

    # test single element get item
    item = corpus[0]

    assert item == seq_one

    # test batch element get item
    other_corpus = corpus[[0, 1]]

    assert other_corpus == corpus
Example #15
def read_raw_data(filepath):
    corpus_raw = Corpus.read_from_file(filepath)
    out_dict = defaultdict(set)
    for sam in corpus_raw:
        out_dict[sam.label].add(''.join(sam.text))

    return out_dict
Example #16
    def __call__(self):
        corpus_test = Corpus.read_from_file(
            os.path.join(self.config['data_filepath'],
                         self.config['data_filename']))
        scores, differ_corpus_tuples = self._evaluation(corpus_test)
        self.save_result(scores, differ_corpus_tuples)
        print('Evaluation has been done.')
Example #17
def test_express_pattern(datadir):
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")

    express_pattern = ExpressPattern(corpus)
    result = express_pattern.compute()

    result_keys = [str(i) for i in result.keys()]

    expected_keys = ["<PERSON> 在 <GPE> 的 <ORG> 读 书 。", "来 一 首 <歌手名> 的 歌 。"]

    for r, e in zip(result_keys, expected_keys):
        assert e in r

    result_value = result.values()

    expected_value = [
        [
            "[王 小 明](PERSON) 在 [北 京](GPE) 的 [清 华 大 学](ORG) 读 书 。",
            "[王 小 明](PERSON) 在 [台 北 新 竹](GPE) 的 [清 华 大 学](ORG) 读 书 。",
        ],
        ["来 一 首 [蓝 泽 雨](歌手名) 的 歌 。"],
    ]

    for i, value in enumerate(result_value):
        for j, element in enumerate(value):
            assert expected_value[i][j] in str(element)
Example #18
def to_conllx(file_prefix):
    base_name, _ = os.path.splitext(file_prefix)

    log_file = './data/error/{}.error'.format(base_name)

    with open('./data/raw/{}'.format(file_prefix)) as fd, open(log_file,
                                                               'wt') as logger:
        seq_list = []
        for raw_line in fd:
            line = raw_line.strip()
            if not line:
                continue
            try:
                seq, _sentence = process_one_line(line, logger)
            except CheckFailedError:
                # skip lines that fail validation
                continue
            else:
                seq_list.append(seq)

        output_file = './data/domain/{}.conllx'.format(base_name)
        Corpus(seq_list).write_to_file(output_file)
Example #19
def render(doc_pattern, dictionary: Dict[str, List[str]]):
    doc_list = []

    for pattern in doc_pattern:
        placeholder_names = [i.entity for i in pattern.get_placeholders()]
        pattern_specific_dictionary = {
            i: dictionary[i]
            for i in placeholder_names
        }

        instance_list_variable = list(
            itertools.product(*pattern_specific_dictionary.values()))
        if len(instance_list_variable) > 200:
            random_instance_list_variable = sample(instance_list_variable, 200)
        else:
            random_instance_list_variable = instance_list_variable

        for instance_variable in random_instance_list_variable:
            instance_mapping = dict(
                zip(pattern_specific_dictionary.keys(), instance_variable))

            doc = pattern.render(instance_mapping)
            doc_list.append(doc)

    return Corpus(doc_list)
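
A sketch of feeding render with patterns built via CorpusPattern.create_from_corpus (shown in a later example); the corpus path is hypothetical, and the dictionary must cover every placeholder entity that appears in the patterns:

corpus = Corpus.read_from_file("./data/corpus.conllx")
corpus_pattern = CorpusPattern.create_from_corpus(corpus)

generated = render(corpus_pattern,
                   {"PERSON": ["小王", "小李"], "GPE": ["北京"]})
generated.write_to_file("./data/generated.conllx")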
Example #20
def test_eq__(datadir):
    corpus = Corpus.read_from_file(datadir / "data.conllx")

    corpus_statistics = CorpusStatistics.create_from_corpus(corpus)

    expected = CorpusStatistics(
        domain=Counter({"domain_one": 2, "domain_two": 2}),
        function=Counter({"function_one": 2, "function_two": 2}),
        sub_function=Counter({"sub_function_one": 2, "sub_function_two": 2}),
        intent=Counter({"intent_one": 2, "intent_two": 2}),
        entity_types={
            "PERSON": Counter({("王", "小", "明"): 2}),
            "GPE": Counter({("北", "京"): 2}),
            "ORG": Counter({("清", "华", "大", "学"): 2}),
            "歌手名": Counter({("蓝", "泽", "雨"): 2}),
        },
        entity_values={
            ("王", "小", "明"): Counter({"PERSON": 2}),
            ("北", "京"): Counter({"GPE": 2}),
            ("清", "华", "大", "学"): Counter({"ORG": 2}),
            ("蓝", "泽", "雨"): Counter({"歌手名": 2}),
        },
    )

    assert corpus_statistics == expected
Example #21
def extract_part_seq(data, all_seq, part_seq):
    # read data
    corpus = Corpus.read_from_file(data)

    # generate all sequence pattern
    doc_pattern = corpus.generate_pattern()
    doc_pattern.write_to_file(all_seq)

    part_list = []
    # extract only the patterns whose text contains the character '关'
    for doc in doc_pattern:
        if '关' in doc.text:
            part_doc = DocumentPattern(doc.text)
            part_doc.entities = doc.entities
            part_doc.domain = doc.domain
            part_doc.id = doc.id
            part_list.append(part_doc)

    part_pattern = CorpusPattern(set(part_list))
    part_pattern.write_to_file(part_seq)
    return part_pattern
Example #22
def test_render(datadir):
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")

    corpus_pattern = CorpusPattern.create_from_corpus(corpus)

    dictionary = {
        "PERSON": ["小王", "小李"],
        "GPE": ["北京"],
        "ORG": ["师范大学", "专科学校"],
        "歌手名": ["周杰伦", "孙燕姿"]
    }

    generated_corpus = corpus_pattern.render(dictionary)

    expected = sorted([
        "[小 王](PERSON) 在 [北 京](GPE) 的 [师 范 大 学](ORG) 读 书 。",
        "[小 王](PERSON) 在 [北 京](GPE) 的 [专 科 学 校](ORG) 读 书 。",
        "[小 李](PERSON) 在 [北 京](GPE) 的 [师 范 大 学](ORG) 读 书 。",
        "[小 李](PERSON) 在 [北 京](GPE) 的 [专 科 学 校](ORG) 读 书 。",
        "来 一 首 [周 杰 伦](歌手名) 的 歌 。", "来 一 首 [孙 燕 姿](歌手名) 的 歌 。"
    ])

    result = sorted([str(i) for i in generated_corpus])

    for e, r in zip(expected, result):
        assert e in r
Example #23
def test_attr_access(datadir):
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")
    doc = corpus[0]

    assert doc.domain == "domain"
    assert doc.function == "function"
    assert doc.intent == "intent"
    assert doc.sub_function == "sub_function"
Example #24
def test_fuzzy_search(datadir):
    corpus = Corpus.read_from_file(datadir / "output.conllx")

    result = corpus.fuzzy_search("北京 读书", limit=1)

    expected = seq_one

    assert result[0][0] == expected
Example #25
def test_difference(datadir):
    corpus_one = Corpus.read_from_file(datadir / "corpus_one.conllx")
    corpus_two = Corpus.read_from_file(datadir / "corpus_two.conllx")

    result = corpus_one.difference(corpus_two)
    expected = Corpus([
        Document(
            "王小明在台北新竹的清华大学读书。",
            span_set=SpanSet(
                [Span(0, 3, "PERSON"),
                 Span(4, 8, "GPE"),
                 Span(9, 13, "ORG")]),
            id="3",
        )
    ])

    assert result == expected
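
The set-style operations exercised in these tests compose with each other; a short sketch, assuming two corpus files like the ones used above:

corpus_one = Corpus.read_from_file("corpus_one.conllx")
corpus_two = Corpus.read_from_file("corpus_two.conllx")

in_both = corpus_one.intersection(corpus_two)      # documents in both
in_either = corpus_one.union(corpus_two)           # documents in either
only_in_one = corpus_one.difference(corpus_two)    # only in corpus_one
in_exactly_one = corpus_one.symmetric_difference(corpus_two)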
Example #26
def test_remove_duplicate(datadir):
    corpus = Corpus.read_from_file(datadir / "duplicate.conllx")

    assert len(corpus) == 4

    duplicate_free = corpus.remove_duplicate()

    assert isinstance(duplicate_free, Corpus)
    assert len(duplicate_free) == 2
Example #27
def generate_constraint_to_file(input_file: str,
                                output_file: str,
                                output_attr: str = "label"):
    corpus = Corpus.read_from_file(input_file)

    domain_mapping = generate_constraint(corpus, output_attr)

    with open(output_file, "wt") as fd:
        json.dump(domain_mapping, fd, indent=4, ensure_ascii=False)
Example #28
def test_collect_domain(datadir):
    corpus = Corpus.read_from_file(datadir / "data.conllx")

    corpus_statistics = CorpusStatistics.create_from_corpus(corpus)

    result = corpus_statistics.domain

    expected = Counter({"domain_one": 2, "domain_two": 2})

    assert result == expected
Example #29
def test_collect_intent(datadir):
    corpus = Corpus.read_from_file(datadir / "data.conllx")

    corpus_statistics = CorpusStatistics.create_from_corpus(corpus)

    result = corpus_statistics.intent

    expected = Counter({'intent_one': 2, 'intent_two': 2})

    assert result == expected
Example #30
def test_collect_sub_function(datadir):
    corpus = Corpus.read_from_file(datadir / "data.conllx")

    corpus_statistics = CorpusStatistics.create_from_corpus(corpus)

    result = corpus_statistics.sub_function

    expected = Counter({'sub_function_one': 2, 'sub_function_two': 2})

    assert result == expected
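
A sketch pulling several of the counters shown in these tests from a single statistics object; the file name is hypothetical:

corpus = Corpus.read_from_file("data.conllx")
stats = CorpusStatistics.create_from_corpus(corpus)

print(stats.domain)        # Counter of domain labels
print(stats.intent)        # Counter of intent labels
print(stats.sub_function)  # Counter of sub_function labels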