Beispiel #1
0
def annotate_udpipe(passages, model_name, as_array=True, verbose=False):
    """Yield the given passages, annotated by UDPipe when a model name is supplied.

    When ``model_name`` is falsy the passages are yielded unchanged. Otherwise
    each passage is split into paragraphs, the paragraphs are sent through
    ``parse_udpipe`` and, for every parsed paragraph that has terminals, the
    terminals array of the matching paragraph in the passage is overwritten
    with the parsed one. A passage is yielded once all of its paragraphs have
    been consumed.

    :param passages: iterable of passages to annotate
    :param model_name: UDPipe model name; falsy to skip annotation
    :param as_array: must be truthy when ``model_name`` is given
    :param verbose: forwarded to ``parse_udpipe``
    :raises ValueError: if ``model_name`` is given but ``as_array`` is falsy
        (raised on first iteration, since this function is a generator)
    """
    if not model_name:
        # Nothing to annotate with: hand the input straight back.
        yield from passages
        return
    if not as_array:
        raise ValueError(
            "Annotating with UDPipe and as_array=False are currently not supported; use --as-array"
        )

    # One lazy stream of (paragraph, owning passage) pairs, duplicated so the
    # paragraphs can feed the parser while the owners are zipped with its output.
    for_parser, for_owners = tee(
        (part, whole) for whole in passages for part in split2paragraphs(whole))
    paragraphs = (pair[0] for pair in for_parser)
    owners = (pair[1] for pair in for_owners)
    parsed = parse_udpipe(paragraphs, model_name, verbose, annotate=True)

    # Consecutive results that belong to the same passage form one group.
    for owner, members in groupby(zip(owners, parsed), key=itemgetter(0)):
        passage = owner
        for passage, (paragraph, annotated) in members:
            # noinspection PyUnresolvedReferences
            parsed_terminals = annotated.layer(layer0.LAYER_ID)
            if not parsed_terminals.all:
                continue
            # The first terminal's "orig_paragraph" extra selects which doc of
            # the passage to overwrite (presumably its paragraph number in the
            # unsplit passage -- confirm against split2paragraphs).
            target = next(
                t.extra["orig_paragraph"]
                for t in paragraph.layer(layer0.LAYER_ID).all)
            passage.layer(layer0.LAYER_ID).doc(target)[:] = parsed_terminals.doc(1)
        yield passage
Beispiel #2
0
def test_split2paragraphs():
    """Check the result of splitting the ``multi_sent()`` passage into paragraphs.

    Asserts that two parts are produced, that their terminal texts match the
    expected token lists, that every terminal in every part reports
    ``paragraph == 1``, that the parts have 2 and 1 top scenes respectively,
    and that the first incoming edge of each top scene is tagged ParallelScene.
    """
    parts = convert.split2paragraphs(multi_sent())
    assert len(parts) == 2

    texts_per_part = []
    for part in parts:
        texts_per_part.append(
            [terminal.text for terminal in part.layer(layer0.LAYER_ID).all])
    assert texts_per_part[0] == ["1", "2", "3", ".", "5", "6", "."]
    assert texts_per_part[1] == ["8", ".", "10", "."]

    every_terminal_in_first_paragraph = all(
        terminal.paragraph == 1
        for part in parts
        for terminal in part.layer(layer0.LAYER_ID).all)
    assert every_terminal_in_first_paragraph

    scenes_per_part = [part.layer(layer1.LAYER_ID).top_scenes for part in parts]
    assert len(scenes_per_part[0]) == 2
    assert len(scenes_per_part[1]) == 1
    for scenes in scenes_per_part:
        for scene in scenes:
            assert scene.incoming[0].tag == layer1.EdgeTags.ParallelScene
Beispiel #3
0
 def test_split2paragraphs(self):
     """Check the result of splitting the multi-passage fixture into paragraphs.

     Expects two parts with the listed terminal texts, every terminal in
     every part reporting ``paragraph == 1``, 2 and 1 top scenes
     respectively, and a ParallelScene tag on each top scene's first
     incoming edge.
     """
     parts = convert.split2paragraphs(TestUtil.create_multi_passage())
     self.assertEqual(len(parts), 2)

     texts_per_part = [
         [terminal.text for terminal in part.layer(layer0.LAYER_ID).all]
         for part in parts
     ]
     self.assertSequenceEqual(texts_per_part[0],
                              ["1", "2", "3", ".", "5", "6", "."])
     self.assertSequenceEqual(texts_per_part[1], ["8", ".", "10", "."])

     paragraph_is_one = all(
         terminal.paragraph == 1
         for part in parts
         for terminal in part.layer(layer0.LAYER_ID).all)
     self.assertTrue(paragraph_is_one)

     scenes_per_part = [part.layer(layer1.LAYER_ID).top_scenes for part in parts]
     self.assertEqual(len(scenes_per_part[0]), 2)
     self.assertEqual(len(scenes_per_part[1]), 1)
     expected_tag_owner = layer1.EdgeTags
     for scenes in scenes_per_part:
         for scene in scenes:
             self.assertEqual(scene.incoming[0].tag,
                              expected_tag_owner.ParallelScene)
Beispiel #4
0
 def test_split2paragraphs(self):
     """Check the result of splitting the multi-passage fixture into paragraphs.

     Expects two parts with the listed terminal texts, terminals of the
     first part reporting ``paragraph == 1`` and those of the second part
     ``paragraph == 2``, 2 and 1 top scenes respectively, and a
     ParallelScene tag on each top scene's first incoming edge.
     """
     source = TestUtil.create_multi_passage()
     parts = convert.split2paragraphs(source)
     self.assertEqual(len(parts), 2)

     texts_per_part = [
         [terminal.text for terminal in part.layer(layer0.LAYER_ID).all]
         for part in parts
     ]
     self.assertSequenceEqual(texts_per_part[0],
                              ["1", "2", "3", ".", "5", "6", "."])
     self.assertSequenceEqual(texts_per_part[1], ["8", ".", "10", "."])

     # Part at index 0 must carry paragraph number 1, part at index 1 number 2.
     for index, expected_paragraph in ((0, 1), (1, 2)):
         terminals = parts[index].layer(layer0.LAYER_ID).all
         self.assertTrue(
             all(terminal.paragraph == expected_paragraph
                 for terminal in terminals))

     scenes_per_part = [part.layer(layer1.LAYER_ID).top_scenes for part in parts]
     self.assertEqual(len(scenes_per_part[0]), 2)
     self.assertEqual(len(scenes_per_part[1]), 1)
     for scenes in scenes_per_part:
         for scene in scenes:
             first_edge = scene.incoming[0]
             self.assertEqual(first_edge.tag, layer1.EdgeTags.ParallelScene)
def main(args):
    """Split every input passage into paragraphs and store each one under ``args.outdir``.

    The output directory is created if missing. Each paragraph's file name is
    ``args.prefix`` + the paragraph ID + ``.pickle`` (when ``args.binary``) or
    ``.xml``. With ``args.enumerate`` the paragraphs receive IDs from a running
    counter that continues across passages; with ``args.normalize`` each
    paragraph is passed to ``normalize`` before being handed to ``passage2file``.

    :param args: options object providing ``outdir``, ``filenames``, ``remarks``,
        ``lang``, ``enumerate``, ``prefix``, ``binary``, ``verbose`` and ``normalize``
    """
    os.makedirs(args.outdir, exist_ok=True)
    written = 0  # paragraphs handled so far; seeds the ID counter for the next passage
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        paragraphs = split2paragraphs(
            passage,
            remarks=args.remarks,
            lang=args.lang,
            ids=(map(str, count(written)) if args.enumerate else None))
        for paragraph in paragraphs:
            written += 1
            if args.binary:
                extension = ".pickle"
            else:
                extension = ".xml"
            outfile = os.path.join(args.outdir,
                                   args.prefix + paragraph.ID + extension)
            if args.verbose:
                # Keep diagnostic output from clashing with the progress bar.
                with external_write_mode():
                    print(paragraph, file=sys.stderr)
                    print("Writing passage file for paragraph '%s'..." % outfile,
                          file=sys.stderr)
            if args.normalize:
                normalize(paragraph)
            passage2file(paragraph, outfile, binary=args.binary)
Beispiel #6
0
def split(passage):
    """Return ``split2paragraphs(passage)``, reporting a KeyError as a RuntimeError.

    :param passage: passage to split; its ``ID`` is used in the error message
    :raises RuntimeError: if ``split2paragraphs`` raises KeyError (chained as the cause)
    """
    try:
        paragraphs = split2paragraphs(passage)
    except KeyError as err:
        message = "Failed splitting passage " + passage.ID
        raise RuntimeError(message) from err
    return paragraphs
Beispiel #7
0
def test_split_join_paragraphs(create):
    """Round trip: split the passage built by ``create`` into paragraphs, join, compare.

    :param create: zero-argument callable returning the passage under test
    """
    original = create()
    rejoined = convert.join_passages(
        convert.split2paragraphs(original, remarks=True))
    # Invoked ahead of the assertion; its return value is not used here.
    diffutil.diff_passages(original, rejoined)
    assert original.equals(rejoined)
Beispiel #8
0
 def test_split_join_paragraphs(self):
     """Round trip: split the multi-passage fixture into paragraphs, join, compare."""
     original = TestUtil.create_multi_passage()
     paragraphs = convert.split2paragraphs(original, remarks=True)
     rejoined = convert.join_passages(paragraphs)
     # Invoked ahead of the assertion; its return value is not used here.
     diffutil.diff_passages(original, rejoined)
     passages_match = original.equals(rejoined)
     self.assertTrue(passages_match)
Beispiel #9
0
 def test_split_join_paragraphs(self):
     """Splitting the fixture passage with remarks and joining it back must give an equal passage."""
     source = TestUtil.create_multi_passage()
     restored = convert.join_passages(
         convert.split2paragraphs(source, remarks=True))
     # The diff call runs first; only the equality check below is asserted.
     diffutil.diff_passages(source, restored)
     self.assertTrue(source.equals(restored))