Ejemplo n.º 1
1
 def upload_task(self, passage):
     """Upload *passage* and drive it through tokenization and annotation.

     Creates a public passage on the server, creates and submits a
     tokenization task for it, then creates a dependent annotation task
     and submits that too.

     :param passage: core.Passage to upload
     :return: the submitted annotation user task
     """
     text = to_text(passage, sentences=False)[0]
     passage_out = self.create_passage(text=text, type="PUBLIC",
                                       source=self.source)
     # Shared field template, reused for both task types.
     task_fields = dict(
         type="TOKENIZATION", status="SUBMITTED", project=self.project,
         user=self.user, passage=passage_out, manager_comment=passage.ID,
         user_comment=passage.ID, parent=None, is_demo=False, is_active=True)
     tok_task = self.create_tokenization_task(**task_fields)
     tok_user_task = dict(tok_task)
     tok_user_task.update(to_json(passage, return_dict=True, tok_task=True))
     submitted_tok = self.submit_tokenization_task(**tok_user_task)
     # The annotation task is parented on the tokenization task.
     task_fields.update(parent=tok_task, type="ANNOTATION")
     ann_task = self.create_annotation_task(**task_fields)
     ann_task.update(to_json(passage, return_dict=True,
                             tok_task=submitted_tok,
                             all_categories=self.layer["categories"]))
     return self.submit_annotation_task(**ann_task)
Ejemplo n.º 2
1
 def __init__(self, enabled=True):
     """Initialize the client, reading overrides from the environment.

     :param enabled: whether this component is active
     """
     self.enabled = enabled
     # Endpoint and confidence threshold may be overridden via env vars.
     self.address = os.environ.get(
         "SPOTLIGHT_ADDRESS", "http://model.dbpedia-spotlight.org/en/annotate")
     self.confidence = float(os.environ.get("SPOTLIGHT_CONFIDENCE", 0.3))
     self.text = None
     self.spots = ()
     # Lazy cache mapping each passage to its plain-text rendering.
     self.passage_texts = keydefaultdict(
         lambda passage: to_text(passage, sentences=False)[0])
Ejemplo n.º 3
0
def test_parser(config, model_type, formats, default_setting, text=True):
    """Train and reload a parser on the given formats, checking consistency.

    Runs two passes: "train" (train for two iterations and save) and "load"
    (reload the saved model), asserting that model parameters, finalization
    state and approximate F1 scores agree between the two.

    :param config: parser configuration, updated in place
    :param model_type: classifier type passed to the Parser
    :param formats: input formats whose test passages are loaded
    :param default_setting: setting providing a filename suffix and extra
        config entries
    :param text: also round-trip passages through plain text and re-parse
    """
    filename = "test_files/models/%s_%s%s" % ("_".join(formats), model_type,
                                              default_setting.suffix())
    remove_existing(filename)
    config.update(default_setting.dict())
    scores = []
    params = []
    passages = list(map(load_passage, passage_files(*formats)))
    # Skip score evaluation when AMR is among the formats.
    evaluate = ("amr" not in formats)
    for mode in "train", "load":
        print("-- %sing %s" % (mode, model_type))
        config.update(dict(classifier=model_type, copy_shared=None))
        p = Parser(model_files=filename, config=config)
        p.save_init = True
        # In "load" mode, train() gets no passages: it just loads the model.
        list(
            p.train(passages if mode == "train" else None,
                    dev=passages,
                    test=True,
                    iterations=2))
        assert p.model.is_finalized, "Model should be finalized after %sing" % mode
        # Node dropout must be off in a finalized model.
        assert not getattr(p.model.feature_extractor, "node_dropout",
                           0), p.model.feature_extractor.node_dropout
        all_params = p.model.all_params()
        params.append(all_params)
        param1, param2 = [
            d.get("W") for d in (all_params, p.model.feature_extractor.params)
        ]
        # Unless word vectors are trainable, the "W" embedding should still
        # equal its initialization, up to the model's weight decay factor.
        if param1 is not None and param2 and param2.init is not None and not config.args.update_word_vectors:
            assert_allclose(param1,
                            weight_decay(p.model) * param2.init,
                            rtol=1e-6)
        text_results = results = list(p.parse(passages, evaluate=evaluate))
        if text:
            print("Converting to text and parsing...")
            # Round-trip: passage -> plain text -> passage(s), then re-parse.
            text_results = list(
                p.parse([
                    p3 for p1 in passages
                    for p2 in convert.to_text(p1, sentences=False)
                    for p3 in convert.from_text(
                        p2, p1.ID, extra_format=p1.extra.get("format"))
                ]))
            assert len(results) == len(text_results)
        if evaluate:
            # With evaluate=True, parse() yields (passage, score) pairs.
            scores.append(Scores(tuple(zip(*results))[1]).average_f1())
            if text:
                for t, (r, s) in zip(text_results, results):
                    print("  %s F1=%.3f" % (r.ID, s.average_f1()))
        assert not list(p.parse(()))  # parsing nothing returns nothing
        print()
    assert_all_params_equal(*params)
    if evaluate:
        print("-- average f1: %.3f, %.3f\n" % tuple(scores))
        assert scores[0] == pytest.approx(scores[1], 0.1)
Ejemplo n.º 4
0
 def upload_task(self,
                 passage,
                 log=None,
                 submit=True,
                 ids=None,
                 upload=True):
     """Upload *passage* with its tokenization and annotation tasks.

     :param passage: core.Passage to upload
     :param log: optional file object to append a tab-separated log line to
     :param submit: whether to submit the annotation task to the server
     :param ids: optional mapping from passage ID to existing
         (passage_id, tok_id, ann_id) server IDs to reuse instead of creating
     :param upload: actually contact the server; if False, only build and
         return the request dicts (dry run)
     :return: the submitted annotation user task (or its dict in a dry run)
     """
     if ids:
         # Reuse previously created server-side objects.
         passage_id, tok_id, ann_id = ids[passage.ID]
         passage_out = self.get_passage(passage_id)
         tok_user_task_out = tok_task_out = self.get_user_task(tok_id)
         ann_user_task_in = self.get_user_task(ann_id)
     else:
         passage_out = self.create_passage(
             text=to_text(passage, sentences=False)[0],
             type="PUBLIC",
             source=self.source,
             external_id=passage.ID) if upload else passage
         task_in = dict(type="TOKENIZATION",
                        status="ONGOING",
                        project=self.project,
                        user=self.user,
                        passage=passage_out,
                        manager_comment=passage.ID,
                        user_comment=passage.ID,
                        parent=None,
                        is_demo=False,
                        is_active=True)
         tok_task_out = self.create_task(**task_in) if upload else task_in
         tok_user_task_in = dict(tok_task_out)
         tok_user_task_in.update(
             to_json(passage, return_dict=True, tok_task=True))
         tok_user_task_out = self.submit_task(
             **tok_user_task_in) if upload else tok_user_task_in
         # The annotation task is a child of the tokenization task.
         task_in.update(parent=tok_task_out, type="ANNOTATION")
         ann_user_task_in = self.create_task(
             **task_in) if upload else task_in
     ann_user_task_in.update(
         to_json(passage,
                 return_dict=True,
                 tok_task=tok_user_task_out,
                 all_categories=self.layer["categories"]))
     ann_user_task_out = self.submit_task(
         **ann_user_task_in, submit=submit) if upload else ann_user_task_in
     if log:
         # One tab-separated line per passage: original and server-side IDs.
         print(passage.ID,
               passage_out["id"],
               tok_task_out["id"],
               ann_user_task_out["id"],
               file=log,
               sep="\t",
               flush=True)
     return ann_user_task_out
Ejemplo n.º 5
0
 def upload_task(self, passage):
     """Create the passage remotely, submit its tokenization task, then
     create and submit the dependent annotation task.

     :param passage: core.Passage to upload
     :return: the submitted annotation user task
     """
     passage_out = self.create_passage(
         text=to_text(passage, sentences=False)[0],
         type="PUBLIC",
         source=self.source)
     common = dict(type="TOKENIZATION",
                   status="SUBMITTED",
                   project=self.project,
                   user=self.user,
                   passage=passage_out,
                   manager_comment=passage.ID,
                   user_comment=passage.ID,
                   parent=None,
                   is_demo=False,
                   is_active=True)
     tokenization = self.create_tokenization_task(**common)
     tok_submission = dict(tokenization)
     tok_submission.update(to_json(passage, return_dict=True, tok_task=True))
     tok_result = self.submit_tokenization_task(**tok_submission)
     # Reuse the same fields for the annotation task, parented on the
     # tokenization task.
     common.update(parent=tokenization, type="ANNOTATION")
     annotation = self.create_annotation_task(**common)
     annotation.update(
         to_json(passage,
                 return_dict=True,
                 tok_task=tok_result,
                 all_categories=self.layer["categories"]))
     return self.submit_annotation_task(**annotation)
Ejemplo n.º 6
0
 def train_test(self, model_type, compare=True):
     """Train a parser, then reload it, comparing labeled F1 across runs.

     :param model_type: classifier type to instantiate the Parser with
     :param compare: assert the train/load scores are almost equal
     """
     f1_scores = []
     parser = None
     for mode in ("train", "load"):
         print("-- %sing %s" % (mode, model_type))
         parser = Parser(model_file="test_files/models/%s" % model_type,
                         model_type=model_type)
         # On the "load" pass, pass no passages so the saved model is loaded.
         parser.train(self.load_passages() if mode == "train" else None,
                      iterations=200)
         aggregated = evaluation.Scores.aggregate(
             [s for _, s in parser.parse(self.load_passages(), evaluate=True)])
         f1_scores.append(aggregated.average_f1())
         print()
     print("-- average labeled f1: %.3f, %.3f\n" % tuple(f1_scores))
     if compare:
         self.assertAlmostEqual(*f1_scores)
     parser.parse(convert.to_text(self.load_passages()[0]))
     self.assertFalse(list(parser.parse(())))  # parsing nothing returns nothing
Ejemplo n.º 7
0
def main(args):
    """Print the path of every terminal in each sentence of each passage.

    Terminals whose parent edge is tagged 'C' (compound) are reported once
    per parent unit, joining all the children's texts; other terminals are
    reported individually via find_path.
    """
    for passage in get_passages_with_progress_bar(args.passages):
        for index, sentence in enumerate(split2sentences(passage)):
            print('sentence %d\n\n%s\n' % (index, convert.to_text(sentence)))
            seen_compounds = []
            for node_id in sentence.nodes:
                terminal = sentence.nodes[node_id]
                if terminal.layer.ID != '0':
                    continue  # only terminal-layer nodes are of interest
                parent = terminal.parents[0]
                if parent.ftag == 'C':
                    if parent.ID in seen_compounds:
                        continue  # already printed this compound unit
                    seen_compounds.append(parent.ID)
                    words = [child.text for child in parent.children]
                    path = find_path(sentence.nodes[parent.ID],
                                     [' '.join(words)])
                else:
                    path = find_path(sentence.nodes[terminal.ID], [])
                print(' '.join(path))
                print('-------')
            print(
                '------------------------------------------------------------------'
            )
Ejemplo n.º 8
0
def show_query(fnode):
    """Print the query node's passage text, its top scene, and the node."""
    lines = convert.to_text(fnode.root)
    print('\n'.join(lines))
    print(str(fnode.get_top_scene()))
    print(str(fnode))
Ejemplo n.º 9
0
def write_text(passage, f, sentences, lang, prepend_id=False):
    """Write the plain text of *passage* to file object *f*.

    :param sentences: split output per sentence rather than per paragraph
    :param lang: language passed through to the text converter
    :param prepend_id: prefix each line with the passage ID and a tab
    """
    for line in to_text(passage, sentences=sentences, lang=lang):
        if prepend_id:
            print(passage.ID, line, file=f, sep="\t")
        else:
            print(line, file=f, sep="\t")
Ejemplo n.º 10
0
 def test_to_text(self):
     """Text conversion yields the full string, or one string per sentence."""
     passage = convert.from_standard(
         TestUtil.load_xml("test_files/standard3.xml"))
     whole = convert.to_text(passage, False)[0]
     self.assertEqual(whole, "1 2 3 4 . 6 7 8 9 10 . 12 13 14 15")
     by_sentence = convert.to_text(passage, True)
     self.assertSequenceEqual(
         by_sentence, ["1 2 3 4 .", "6 7 8 9 10 .", "12 13 14 15"])
Ejemplo n.º 11
0
def test_to_text():
    """to_text joins all terminals, or splits per sentence when asked."""
    passage = loaded()
    whole_text = convert.to_text(passage, False)
    assert whole_text[0] == "1 2 3 4 . 6 7 8 9 10 . 12 13 14 15"
    per_sentence = convert.to_text(passage, True)
    assert per_sentence == ["1 2 3 4 .", "6 7 8 9 10 .", "12 13 14 15"]
Ejemplo n.º 12
0
def show_query(fnode):
    """Display the query: full passage text, then top scene, then the node."""
    print('\n'.join(convert.to_text(fnode.root)))
    top_scene = fnode.get_top_scene()
    print(str(top_scene))
    print(str(fnode))
Ejemplo n.º 13
0
 def test_to_text(self):
     """Converted text matches expected tokens, whole and per sentence."""
     passage = convert.from_standard(self._load_xml('./standard3.xml'))
     as_one = convert.to_text(passage, False)
     self.assertEqual(as_one[0], '1 2 3 4 . 6 7 8 9 10 . 12 13 14 15')
     as_sentences = convert.to_text(passage, True)
     self.assertSequenceEqual(as_sentences,
                              ['1 2 3 4 .', '6 7 8 9 10 .', '12 13 14 15'])
Ejemplo n.º 14
0
def main(args):
    """Convert all input passages to plain text in a single output file."""
    with open(args.outfile, "w", encoding="utf-8") as f:
        # Process files in numeric order so output lines are deterministic.
        passages = get_passages_with_progress_bar(
            sorted(args.filenames, key=numeric), desc="Converting to text")
        for passage in passages:
            lines = to_text(passage, lang=args.lang)
            for line in lines:
                print(line, file=f)
    print("Wrote '%s'." % args.outfile)
Ejemplo n.º 15
0
def write_text(passage, f, lang):
    """Dump the plain text of *passage*, line by line, into file object *f*."""
    lines = to_text(passage, lang=lang)
    for line in lines:
        print(line, file=f)
Ejemplo n.º 16
0
def write_text(passage, f, sentences, lang, prepend_id=False):
    """Write *passage* as text lines to *f*, optionally prefixed with its ID.

    :param sentences: split output per sentence rather than per paragraph
    :param lang: language passed through to the text converter
    :param prepend_id: emit the passage ID as a leading tab-separated field
    """
    prefix = [passage.ID] if prepend_id else []
    for line in to_text(passage, sentences=sentences, lang=lang):
        print(*(prefix + [line]), file=f, sep="\t")
Ejemplo n.º 17
-1
 def test_to_text(self):
     """Check conversion of a standard passage to text, in both modes."""
     passage = convert.from_standard(
         TestUtil.load_xml('test_files/standard3.xml'))
     flat = convert.to_text(passage, False)
     self.assertEqual(flat[0], '1 2 3 4 . 6 7 8 9 10 . 12 13 14 15')
     split = convert.to_text(passage, True)
     self.assertSequenceEqual(
         split, ['1 2 3 4 .', '6 7 8 9 10 .', '12 13 14 15'])
Ejemplo n.º 18
-1
 def test_to_text(self):
     """Text conversion yields one joined string, or a list per sentence."""
     passage = convert.from_standard(
         TestUtil.load_xml("test_files/standard3.xml"))
     joined = convert.to_text(passage, False)[0]
     self.assertEqual(joined, "1 2 3 4 . 6 7 8 9 10 . 12 13 14 15")
     self.assertSequenceEqual(
         convert.to_text(passage, True),
         ["1 2 3 4 .", "6 7 8 9 10 .", "12 13 14 15"])
Ejemplo n.º 19
-1
def main(args):
    """Interactively look up the UCCA path for words typed by the user.

    For each passage: convert it to CoNLL, print each sentence's text, then
    loop forever prompting for a word and printing the path from the first
    terminal whose text starts with that word (case-insensitive, whole word).

    :param args: parsed arguments; ``args.passages`` lists passage files
    """
    for passage in get_passages_with_progress_bar(args.passages):
        xmltoconll(passage)
        for i, sen in enumerate(split2sentences(passage)):
            print('sentence %d\n\n%s\n' % (i, convert.to_text(sen)))

        while True:  # interactive loop; exits only on EOF/interrupt
            word = input('\nType the word below\n\n')
            # BUG FIX: `path` was previously assigned only inside the match
            # branch, raising NameError when no node matched the input.
            path = []
            # re.escape prevents regex metacharacters in user input from
            # raising re.error (or matching unintentionally).
            pattern = re.compile(r'\b%s\b' % re.escape(word), re.IGNORECASE)
            for node_id in passage.nodes:
                node = passage.nodes[node_id]
                if pattern.match(node.text):
                    path = find_path(passage.nodes[node.ID], path)
                    break
            print(' '.join(path))
Ejemplo n.º 20
-1
def main(args):
    """Pretty-print each sentence's UCCA graph as an ASCII edge tree.

    Walks the children of the root node and prints each branch as a chain of
    ``|-->(tag)-->node`` links, aligning sibling branches under their parent
    using dots and spaces based on the printed width at each depth.
    """
    for passage in get_passages_with_progress_bar(args.passages):
        t = split2sentences(passage)
        sen_no = 0
        for sen in t:
            #print('sentence %d\n\n%s\n%s' %(i,convert.to_text(sen), convert.to_sequence(sen)))
            print('sentence %d\n\n%s\n' % (sen_no, convert.to_text(sen)))

            # NOTE(review): '1.1' is assumed to be the root node ID of the
            # sentence graph — confirm against the passage format.
            root = sen.nodes['1.1']
            first = 1
            # tab_len[level] remembers the printed width up to that depth,
            # used to align continuation lines under their parent link.
            tab_len = {}
            tab_len[0] = len('1.1')
            for i in root.children:
                print('\n')
                path = []
                level = 1
                # Each path entry is (edge tag, node ID, depth, is-remote);
                # find_children appends descendants and 'End' branch markers.
                path.append((i.ftag, i.ID, level, False))
                path = find_children(i, path, level)
                end = 0
                if (first):
                    # The very first branch starts with the root's own ID.
                    pstr = root.ID
                    first = 0
                else:
                    # Later branches are indented under the root's width.
                    for k in range(0, tab_len[0]):
                        pstr = pstr + ' '
                for j in path:
                    if (j == 'End'):
                        # End of a branch: flush the accumulated line.
                        print(pstr)
                        pstr = ''
                        end = 1
                        continue
                    rel = j[0]
                    nd = j[1]
                    tab = int(j[2])
                    remote = j[3]
                    if (end):
                        # Starting a fresh line after a flush: pad with a dot
                        # at each ancestor level's column, spaces elsewhere.
                        q_mark = 0
                        for k in range(0, tab_len[tab - 1]):
                            if (k == tab_len[q_mark]):
                                pstr = pstr + '.'
                                q_mark += 1
                            else:
                                pstr = pstr + ' '
                            end = 0
                    # Expand the edge tag with its description when known.
                    if (rel in descr):
                        rel_desc = rel + ':' + descr[rel]
                    else:
                        rel_desc = rel
                    if (remote):
                        pstr = pstr + '|-->Remote(' + rel_desc + ')-->' + nd
                    else:
                        pstr = pstr + '|-->(' + rel_desc + ')-->' + nd
                    # Record the printed width at this depth for alignment.
                    tab_len[tab] = len(pstr)

            print('-----------------------------------\n')
            sen_no += 1