Example #1
def main(args):
    if args.out_dir:
        os.makedirs(args.out_dir, exist_ok=True)
        if not args.tikz:
            import matplotlib
            matplotlib.use('Agg')  # non-interactive backend, since we only save to files
    to_stdout = (args.tikz or args.standoff) and not args.out_dir
    t = args.passages
    t = get_passages(t) if to_stdout else get_passages_with_progress_bar(
        t, desc="Visualizing")
    if args.sentences:
        t = (sentence for passage in t
             for sentence in split2sentences(passage))
    for passage in t:
        if args.tikz:
            print_text(args, visualization.tikz(passage),
                       passage.ID + ".tikz.txt")
        elif args.standoff:
            print_text(args, visualization.standoff(passage),
                       passage.ID + ".ann")
        else:
            import matplotlib.pyplot as plt
            # Heuristic figure size: width grows with the number of terminals,
            # keeping a fixed 19:10 aspect ratio
            width = len(passage.layer(layer0.LAYER_ID).all) * 19 / 27
            plt.figure(passage.ID, figsize=(width, width * 10 / 19))
            visualization.draw(passage, node_ids=args.node_ids)
            if args.out_dir:
                plt.savefig(
                    os.path.join(args.out_dir, passage.ID + "." + args.format))
                plt.close()
            else:
                plt.show()
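A minimal, hypothetical driver for the main above; the flag names are inferred from the args attributes the function reads and may not match the original CLI:

# Hypothetical driver; flag names inferred from the attributes main() reads.
import argparse

def parse_args():
    p = argparse.ArgumentParser(description="Visualize UCCA passages")
    p.add_argument("passages", nargs="+", help="passage files to visualize")
    p.add_argument("--out-dir", help="directory to write output files to")
    p.add_argument("--tikz", action="store_true", help="print TikZ code")
    p.add_argument("--standoff", action="store_true", help="print standoff annotations")
    p.add_argument("--sentences", action="store_true", help="split passages to sentences first")
    p.add_argument("--node-ids", action="store_true", help="print node IDs in the plot")
    p.add_argument("--format", default="png", help="image format for saved figures")
    return p.parse_args()

if __name__ == "__main__":
    main(parse_args())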
Example #2
 def test_split(self):
     """Test that splitting a single-sentence SDP graph converted to UCCA returns the same SDP graph"""
     for passage, ref, _ in read_test_sdp():
         sentences = split2sentences(passage)
         self.assertEqual(len(sentences), 1, "Should be one sentence: %s" % passage)
         sentence = sentences[0]
         self.convert_and_evaluate(sentence, ref)
Example #3
def filter_nodes(categories=(),
                 tokens=(),
                 tokens_mode=CONSECUTIVE,
                 case_insensitive=False,
                 comment=False,
                 sentence_level=False,
                 **kwargs):
    for passage, task_id, user_id in TaskDownloader(**kwargs).download_tasks(
            **kwargs):
        nodes = ([p.layer(layer1.LAYER_ID).heads[0] for p in convert.split2sentences(passage)]
                 if sentence_level else passage.layer(layer1.LAYER_ID).all)
        for node in nodes:
            if comment and node.extra.get("remarks"):
                yield "comment", node, task_id, user_id
            if tokens and not node.attrib.get("implicit"):
                unit_tokens = [t.text for t in node.get_terminals(punct=True)]
                if case_insensitive:
                    unit_tokens = [x.lower() for x in unit_tokens]
                    tokens = [x.lower() for x in tokens]
                if tokens_match(unit_tokens, tokens, tokens_mode):
                    yield 'TOKENS', node, task_id, user_id
            elif categories:
                intersection = set(categories).intersection(
                    c.tag for e in node for c in e.categories)
                if intersection:
                    yield str(intersection), node, task_id, user_id
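A sketch of how this generator might be consumed; whatever connection options TaskDownloader requires would be passed as extra keyword arguments, omitted here:

# Hypothetical consumption of filter_nodes(); TaskDownloader options omitted.
for kind, node, task_id, user_id in filter_nodes(
        tokens=("not", "only"), tokens_mode=CONSECUTIVE, case_insensitive=True):
    print(kind, task_id, user_id, node.ID)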
Example #4
def main(args):
    splitter = Splitter.read_file(args.sentences,
                                  enum=args.enumerate,
                                  suffix_format=args.suffix_format,
                                  suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(
                passage) if splitter else split2sentences(
                    passage,
                    remarks=args.remarks,
                    lang=args.lang,
                    ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(
                args.outdir, args.prefix + sentence.ID +
                (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():
                    print("Writing passage file for sentence '%s'..." %
                          outfile,
                          file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        print("Unmatched sentences:",
              *[
                  s for i, s in enumerate(splitter.sentences)
                  if i not in splitter.matched_indices
              ],
              sep="\n")
Example #5
def main(output=None, comment=False, sentence_level=False, categories=(), tokens=(), tokens_mode=CONSECUTIVE,
         case_insensitive=False, write=False, **kwargs):
    filtered_nodes = []
    for passage, task_id, user_id in TaskDownloader(**kwargs).download_tasks(write=False, **kwargs):
        if sentence_level:
            cur_passages = convert.split2sentences(passage)
            all_nodes = [p.layer(layer1.LAYER_ID).heads[0] for p in cur_passages]
        else:
            all_nodes = list(passage.layer(layer1.LAYER_ID).all)
        for node in all_nodes:
            if comment and node.extra.get("remarks"):
                filtered_nodes.append(("comment",node,task_id,user_id))
            if tokens and not node.attrib.get("implicit"):
                unit_tokens = [t.text for t in node.get_terminals(punct=True)]
                if case_insensitive:
                    unit_tokens = [x.lower() for x in unit_tokens]
                    tokens = [x.lower() for x in tokens]
                if tokens_match(unit_tokens, tokens, tokens_mode):
                    filtered_nodes.append(('TOKENS', node, task_id, user_id))
            else:
                all_tags = []
                for edge in node:
                    all_tags.extend([c.tag for c in edge.categories])
                if all_tags:
                    intersection = set(categories) & set(all_tags)
                    if intersection:
                        filtered_nodes.append((str(list(intersection)), node, task_id, user_id))

    if output:
        with open(output, 'w') as f:
            for filter_type, node, task_id, user_id in filtered_nodes:
                ancestor = get_top_level_ancestor(node)
                print(filter_type, task_id, user_id, node.extra.get("tree_id"), node.to_text(),
                      ancestor, str(node.extra.get("remarks")).replace("\n", "|"), file=f, sep="\t")
Example #6
 def test_split(self):
     """Test that splitting a single-sentence Universal Dependencies tree converted to UCCA returns the same tree"""
     for passage, ref, _ in read_test_conllu():
         sentences = split2sentences(passage)
         self.assertEqual(len(sentences), 1,
                          "Should be one sentence: %s" % passage)
         sentence = sentences[0]
         self.convert_and_evaluate(sentence, ref)
Example #7
 def test_split(self):
     """Test that splitting a single-sentence AMR converted to UCCA returns the same AMR"""
     for passage, ref, amr_id in read_test_amr():
         sentences = split2sentences(passage)
         self.assertEqual(len(sentences), 1,
                          "Should be one sentence: %s" % passage)
         sentence = sentences[0]
         converted = "\n".join(to_amr(sentence, metadata=False))
         scores = evaluate(converted, ref, amr_id)
         self.assertAlmostEqual(scores.f1, 1, msg=converted)
Example #8
def convert_passage(filename, converter, args):
    """Opens a passage file and returns a string after conversion
    :param filename: input passage file
    :param converter: function to use for conversion
    :param args: ArgumentParser object
    """
    passage = file2passage(filename)
    passages = convert.split2sentences(passage) if args.sentences else [passage]
    output = "\n".join(line for p in passages for line in
                       converter(p, args.test, args.tree, args.markaux))
    return output, passage.ID
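A hypothetical call site, with a stub converter that only illustrates the (passage, test, tree, markaux) calling convention assumed above:

# Hypothetical usage; "stub_converter" and "120.xml" are placeholders.
from argparse import Namespace

def stub_converter(passage, test, tree, markaux):
    # Stub with the expected signature; a real converter yields output lines.
    return [str(passage)]

args = Namespace(sentences=True, test=False, tree=False, markaux=False)
output, passage_id = convert_passage("120.xml", stub_converter, args)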
Example #9
def convert_passage(filename, converter, args):
    """Opens a passage file and returns a string after conversion
    :param filename: input passage file
    :param converter: function to use for conversion
    :param args: ArgumentParser object
    """
    passage = file2passage(filename)
    passages = convert.split2sentences(passage) if args.sentences else [
        passage
    ]
    output = "\n".join(
        line for p in passages
        for line in converter(p, args.test, args.tree, args.markaux))
    return output, passage.ID
Example #10
def main(args):
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(passage) if splitter else split2sentences(
                passage, remarks=args.remarks, lang=args.lang, ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir, args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            with external_write_mode():
                print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
Example #11
def test_split2sentences():
    """Tests splitting a passage by sentence ends.
    """
    p = multi_sent()
    split = convert.split2sentences(p)
    assert len(split) == 3
    terms = [[t.text for t in s.layer(layer0.LAYER_ID).all] for s in split]
    assert terms[0] == ["1", "2", "3", "."]
    assert terms[1] == ["5", "6", "."]
    assert terms[2] == ["8", ".", "10", "."]
    assert all(t.paragraph == 1 for s in split for t in s.layer(layer0.LAYER_ID).all)
    top_scenes = [s.layer(layer1.LAYER_ID).top_scenes for s in split]
    for t in top_scenes:
        assert len(t) == 1
        assert t[0].incoming[0].tag == layer1.EdgeTags.ParallelScene
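Outside a test, the same split can be inspected directly; a minimal sketch using the I/O helpers that appear in the other examples (the file name is a placeholder):

# Minimal sketch: load one passage file, split it, and print each sentence,
# mirroring the printing pattern used in these examples.
from ucca import convert
from ucca.ioutil import file2passage

passage = file2passage("120.xml")
for i, sentence in enumerate(convert.split2sentences(passage)):
    print('sentence %d\n\n%s\n' % (i, convert.to_text(sentence)))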
Example #12
def main(args):
    order = None
    if args.sentences:
        with open(args.sentences, encoding="utf-8") as f:
            # Map each stripped sentence line to its index in the file
            order = dict(map(reversed, enumerate(map(str.strip, f))))
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in split(passage, order) if order else split2sentences(
                passage, remarks=args.remarks, lang=args.lang):
            outfile = os.path.join(
                args.outdir, args.prefix + sentence.ID +
                (".pickle" if args.binary else ".xml"))
            with tqdm.external_write_mode():
                print("Writing passage file for sentence '%s'..." % outfile,
                      file=sys.stderr)
            passage2file(sentence, outfile, args.binary)
Example #13
def write_passage(passage, args):
    ext = {
        None: UCCA_EXT[args.binary],
        "amr": ".txt"
    }.get(args.output_format) or "." + args.output_format
    outfile = os.path.join(args.outdir, args.prefix + passage.ID + ext)
    sys.stderr.write("Writing '%s'...\n" % outfile)
    if args.output_format is None:  # UCCA output
        ioutil.passage2file(passage, outfile, args.binary)
    else:
        converter = CONVERTERS[args.output_format][1]
        output = "\n".join(converter(passage)) if args.output_format == "amr" else \
            "\n".join(line for p in (convert.split2sentences(passage) if args.split else [passage]) for line in
                      converter(p, test=args.test, tree=args.tree, mark_aux=args.mark_aux))
        with open(outfile, "w", encoding="utf-8") as f:
            print(output, file=f)
Example #14
def main(args):
    for passage in get_passages_with_progress_bar(args.passages):
        for i, sen in enumerate(split2sentences(passage)):
            print('sentence %d\n\n%s\n' % (i, convert.to_text(sen)))
            compounds = []  # IDs of compound ("C") parents already handled
            for node in sen.nodes:
                terminal = sen.nodes[node]
                if terminal.layer.ID != '0':  # only terminals carry text
                    continue
                parent = terminal.parents[0]
                if parent.ftag == 'C':  # terminal belongs to a compound unit
                    if parent.ID not in compounds:
                        compounds.append(parent.ID)
                        words = ' '.join(n.text for n in parent.children)
                        path = find_path(sen.nodes[parent.ID], [words])
                        print(' '.join(path))
                        print('-------')
                else:
                    path = find_path(sen.nodes[terminal.ID], [])
                    print(' '.join(path))
                    print('-------')
            print('------------------------------------------------------------------')
Example #15
def filter_nodes(categories=(), tokens=(), tokens_mode=CONSECUTIVE, case_insensitive=False, comment=False,
                 sentence_level=False, **kwargs):
    for passage, task_id, user_id in TaskDownloader(**kwargs).download_tasks(**kwargs):
        nodes = ([p.layer(layer1.LAYER_ID).heads[0] for p in convert.split2sentences(passage)]
                 if sentence_level else passage.layer(layer1.LAYER_ID).all)
        for node in nodes:
            if comment and node.extra.get("remarks"):
                yield "comment", node, task_id, user_id
            if tokens and not node.attrib.get("implicit"):
                unit_tokens = [t.text for t in node.get_terminals(punct=True)]
                if case_insensitive:
                    unit_tokens = [x.lower() for x in unit_tokens]
                    tokens = [x.lower() for x in tokens]
                if tokens_match(unit_tokens, tokens, tokens_mode):
                    yield 'TOKENS', node, task_id, user_id
            elif categories:
                intersection = set(categories).intersection(c.tag for e in node for c in e.categories)
                if intersection:
                    yield str(intersection), node, task_id, user_id
Example #16
 def test_split2sentences(self):
     """Tests splitting a passage by sentence ends.
     """
     p = TestUtil.create_multi_passage()
     split = convert.split2sentences(p)
     self.assertEqual(len(split), 3)
     terms = [[t.text for t in s.layer(layer0.LAYER_ID).all] for s in split]
     self.assertSequenceEqual(terms[0], ["1", "2", "3", "."])
     self.assertSequenceEqual(terms[1], ["5", "6", "."])
     self.assertSequenceEqual(terms[2], ["8", ".", "10", "."])
     self.assertTrue(all(t.paragraph == 1 for s in split[0:2]
                         for t in s.layer(layer0.LAYER_ID).all))
     self.assertTrue(all(t.paragraph == 2
                         for t in split[2].layer(layer0.LAYER_ID).all))
     top_scenes = [s.layer(layer1.LAYER_ID).top_scenes for s in split]
     for t in top_scenes:
         self.assertEqual(len(t), 1)
         self.assertEqual(t[0].incoming[0].tag, layer1.EdgeTags.ParallelScene)
Example #17
 def test_split2sentences(self):
     """Tests splitting a passage by sentence ends.
     """
     p = TestUtil.create_multi_passage()
     split = convert.split2sentences(p)
     self.assertEqual(len(split), 3)
     terms = [[t.text for t in s.layer(layer0.LAYER_ID).all] for s in split]
     self.assertSequenceEqual(terms[0], ["1", "2", "3", "."])
     self.assertSequenceEqual(terms[1], ["5", "6", "."])
     self.assertSequenceEqual(terms[2], ["8", ".", "10", "."])
     self.assertTrue(
         all(t.paragraph == 1 for s in split
             for t in s.layer(layer0.LAYER_ID).all))
     top_scenes = [s.layer(layer1.LAYER_ID).top_scenes for s in split]
     for t in top_scenes:
         self.assertEqual(len(t), 1)
         self.assertEqual(t[0].incoming[0].tag,
                          layer1.EdgeTags.ParallelScene)
Example #18
def main(args):
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate,
                                  suffix_format=args.suffix_format, suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(passage) if splitter else split2sentences(
                passage, remarks=args.remarks, lang=args.lang, ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir, args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():
                    print(sentence, file=sys.stderr)
                    print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        print("", "Unmatched sentences:", *[s for i, s in enumerate(splitter.sentences)
                                            if i not in splitter.matched_indices], sep="\n")
Example #19
def test_split_join_sentences(create):
    p = create()
    split = convert.split2sentences(p, remarks=True)
    copy = convert.join_passages(split)
    diffutil.diff_passages(p, copy)
    assert p.equals(copy)
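The same round-trip works outside the test harness; as in the tests, remarks=True is passed before re-joining (the file name is a placeholder):

# Sketch of the split/join round-trip on a passage loaded from disk.
from ucca import convert
from ucca.ioutil import file2passage

passage = file2passage("120.xml")
sentences = convert.split2sentences(passage, remarks=True)  # remarks=True, as in the tests
restored = convert.join_passages(sentences)
assert passage.equals(restored)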
Example #20
 def test_split_join_sentences(self):
     p = TestUtil.create_multi_passage()
     split = convert.split2sentences(p, remarks=True)
     copy = convert.join_passages(split)
     diffutil.diff_passages(p, copy)
     self.assertTrue(p.equals(copy))
Example #22
def main(args):
    for passage in get_passages_with_progress_bar(args.passages):
        xmltoconll(passage)  # helper defined elsewhere
        for i, sen in enumerate(split2sentences(passage)):
            print('sentence %d\n\n%s\n' % (i, convert.to_text(sen)))

        while True:
            word = input('\nType the word below\n\n')
            path = []  # stays empty if the word is not found
            for node in passage.nodes:
                t = passage.nodes[node]
                if t.layer.ID != '0':  # only terminals carry text
                    continue
                if re.match(rf'\b{re.escape(word)}\b', t.text, re.IGNORECASE):
                    path = find_path(passage.nodes[t.ID], path)
                    break
            print(' '.join(path))
Example #23
def main(args):
    for passage in get_passages_with_progress_bar(args.passages):
        for sen_no, sen in enumerate(split2sentences(passage)):
            print('sentence %d\n\n%s\n' % (sen_no, convert.to_text(sen)))

            root = sen.nodes['1.1']
            first = True
            tab_len = {0: len('1.1')}  # column width reached at each depth
            for child in root.children:
                print('\n')
                level = 1
                path = [(child.ftag, child.ID, level, False)]
                path = find_children(child, path, level)
                end = False
                if first:
                    pstr = root.ID
                    first = False
                else:
                    pstr += ' ' * tab_len[0]  # align under the root ID
                for step in path:
                    if step == 'End':  # branch terminator emitted by find_children
                        print(pstr)
                        pstr = ''
                        end = True
                        continue
                    rel, nd, tab, remote = step
                    tab = int(tab)
                    if end:  # starting a new branch: pad up to the parent's column
                        q_mark = 0
                        for k in range(tab_len[tab - 1]):
                            if k == tab_len[q_mark]:
                                pstr += '.'
                                q_mark += 1
                            else:
                                pstr += ' '
                        end = False
                    rel_desc = rel + ':' + descr[rel] if rel in descr else rel
                    fmt = '|-->Remote(%s)-->%s' if remote else '|-->(%s)-->%s'
                    pstr += fmt % (rel_desc, nd)
                    tab_len[tab] = len(pstr)

            print('-----------------------------------\n')