def annotate_udpipe(passages, model_name, as_array=True, verbose=False):
    """Yield the given passages, annotated with UDPipe when a model is given.

    When ``model_name`` is falsy the passages are yielded untouched.
    Otherwise every passage is split with ``split2paragraphs``, the
    paragraphs are sent through ``parse_udpipe`` and, for each parsed
    paragraph with a non-empty layer 0, its ``doc(1)`` is copied into the
    slot of the passage's layer-0 doc selected by the ``"orig_paragraph"``
    value stored on the paragraph's first terminal.

    :param passages: iterable of passages to annotate
    :param model_name: UDPipe model name; falsy disables annotation
    :param as_array: must be true whenever ``model_name`` is given
    :param verbose: forwarded to ``parse_udpipe``
    :return: generator of passages, one per group of consecutive paragraphs
    :raises ValueError: on first iteration, if ``model_name`` is given and
        ``as_array`` is false
    """
    if not model_name:
        yield from passages
        return
    if not as_array:
        raise ValueError(
            "Annotating with UDPipe and as_array=False are currently not supported; use --as-array"
        )

    # Two synchronized views over the same (paragraph, passage) stream: one
    # feeds the parser, the other remembers which passage each paragraph
    # came from.
    for_parser, for_owner = tee(
        (paragraph, passage)
        for passage in passages
        for paragraph in split2paragraphs(passage)
    )
    paragraph_stream = map(itemgetter(0), for_parser)
    owner_stream = map(itemgetter(1), for_owner)
    parsed_stream = parse_udpipe(paragraph_stream, model_name, verbose, annotate=True)

    # Consecutive paragraphs with the same owning passage form one group.
    for owner, members in groupby(zip(owner_stream, parsed_stream), key=itemgetter(0)):
        current = owner
        for current, (piece, parsed) in members:
            # noinspection PyUnresolvedReferences
            parsed_l0 = parsed.layer(layer0.LAYER_ID)
            if not parsed_l0.all:
                continue
            # The first terminal records which paragraph slot to overwrite.
            slot = next(
                terminal.extra["orig_paragraph"]
                for terminal in piece.layer(layer0.LAYER_ID).all
            )
            current.layer(layer0.LAYER_ID).doc(slot)[:] = parsed_l0.doc(1)
        yield current
def test_split2paragraphs():
    """Check that split2paragraphs cuts a passage at its paragraph ends."""
    passage = multi_sent()
    paragraphs = convert.split2paragraphs(passage)
    assert len(paragraphs) == 2

    # Terminal texts, one list per resulting paragraph.
    texts = [
        [terminal.text for terminal in part.layer(layer0.LAYER_ID).all]
        for part in paragraphs
    ]
    assert texts[0] == ["1", "2", "3", ".", "5", "6", "."]
    assert texts[1] == ["8", ".", "10", "."]

    # Every terminal of every resulting paragraph is expected in paragraph 1.
    paragraph_numbers = (
        terminal.paragraph
        for part in paragraphs
        for terminal in part.layer(layer0.LAYER_ID).all
    )
    assert all(number == 1 for number in paragraph_numbers)

    scenes_per_part = [part.layer(layer1.LAYER_ID).top_scenes for part in paragraphs]
    assert len(scenes_per_part[0]) == 2
    assert len(scenes_per_part[1]) == 1
    for scenes in scenes_per_part:
        for scene in scenes:
            assert scene.incoming[0].tag == layer1.EdgeTags.ParallelScene
def test_split2paragraphs(self):
    """Check that split2paragraphs cuts a passage at its paragraph ends."""
    passage = TestUtil.create_multi_passage()
    paragraphs = convert.split2paragraphs(passage)
    self.assertEqual(len(paragraphs), 2)

    # Terminal texts, one list per resulting paragraph.
    texts = [
        [terminal.text for terminal in part.layer(layer0.LAYER_ID).all]
        for part in paragraphs
    ]
    self.assertSequenceEqual(texts[0], ["1", "2", "3", ".", "5", "6", "."])
    self.assertSequenceEqual(texts[1], ["8", ".", "10", "."])

    # Every terminal of every resulting paragraph is expected in paragraph 1.
    paragraph_numbers = (
        terminal.paragraph
        for part in paragraphs
        for terminal in part.layer(layer0.LAYER_ID).all
    )
    self.assertTrue(all(number == 1 for number in paragraph_numbers))

    scenes_per_part = [part.layer(layer1.LAYER_ID).top_scenes for part in paragraphs]
    self.assertEqual(len(scenes_per_part[0]), 2)
    self.assertEqual(len(scenes_per_part[1]), 1)
    for scenes in scenes_per_part:
        for scene in scenes:
            self.assertEqual(scene.incoming[0].tag, layer1.EdgeTags.ParallelScene)
def test_split2paragraphs(self):
    """Check that split2paragraphs cuts a passage at its paragraph ends."""
    passage = TestUtil.create_multi_passage()
    paragraphs = convert.split2paragraphs(passage)
    self.assertEqual(len(paragraphs), 2)

    # Terminal texts, one list per resulting paragraph.
    texts = [
        [terminal.text for terminal in part.layer(layer0.LAYER_ID).all]
        for part in paragraphs
    ]
    self.assertSequenceEqual(texts[0], ["1", "2", "3", ".", "5", "6", "."])
    self.assertSequenceEqual(texts[1], ["8", ".", "10", "."])

    # The first part keeps paragraph number 1, the second paragraph number 2.
    first_terminals = paragraphs[0].layer(layer0.LAYER_ID).all
    self.assertTrue(all(terminal.paragraph == 1 for terminal in first_terminals))
    second_terminals = paragraphs[1].layer(layer0.LAYER_ID).all
    self.assertTrue(all(terminal.paragraph == 2 for terminal in second_terminals))

    scenes_per_part = [part.layer(layer1.LAYER_ID).top_scenes for part in paragraphs]
    self.assertEqual(len(scenes_per_part[0]), 2)
    self.assertEqual(len(scenes_per_part[1]), 1)
    for scenes in scenes_per_part:
        for scene in scenes:
            self.assertEqual(scene.incoming[0].tag, layer1.EdgeTags.ParallelScene)
def main(args):
    """Split every input passage into paragraphs and write each one to a file.

    Output files are placed in ``args.outdir`` (created if missing) and
    named ``args.prefix + paragraph.ID`` with a ``.pickle`` or ``.xml``
    extension depending on ``args.binary``.

    :param args: parsed command-line namespace; reads ``outdir``,
        ``filenames``, ``remarks``, ``lang``, ``enumerate``, ``prefix``,
        ``binary``, ``verbose`` and ``normalize``
    """
    os.makedirs(args.outdir, exist_ok=True)
    written = 0  # running paragraph count across all passages
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        paragraphs = split2paragraphs(
            passage,
            remarks=args.remarks,
            lang=args.lang,
            # With --enumerate, IDs continue counting from the paragraphs
            # already written for earlier passages.
            ids=map(str, count(written)) if args.enumerate else None,
        )
        for paragraph in paragraphs:
            written += 1
            extension = ".pickle" if args.binary else ".xml"
            outfile = os.path.join(args.outdir, args.prefix + paragraph.ID + extension)
            if args.verbose:
                with external_write_mode():
                    print(paragraph, file=sys.stderr)
                    print("Writing passage file for paragraph '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(paragraph)
            passage2file(paragraph, outfile, binary=args.binary)
def split(passage):
    """Split a passage into paragraphs, reporting which passage failed.

    :param passage: passage to hand to ``split2paragraphs``
    :return: whatever ``split2paragraphs`` returns for the passage
    :raises RuntimeError: if ``split2paragraphs`` raises ``KeyError``; the
        original error is chained as the cause
    """
    try:
        paragraphs = split2paragraphs(passage)
    except KeyError as err:
        # Re-raise with the passage ID so the failing input can be identified.
        raise RuntimeError("Failed splitting passage " + passage.ID) from err
    return paragraphs
def test_split_join_paragraphs(create):
    """Splitting into paragraphs and joining again must reproduce the passage."""
    original = create()
    rejoined = convert.join_passages(
        convert.split2paragraphs(original, remarks=True)
    )
    # Runs before the assertion; presumably reports differences to help
    # debug a failure - confirm against diffutil.
    diffutil.diff_passages(original, rejoined)
    assert original.equals(rejoined)
def test_split_join_paragraphs(self):
    """Splitting into paragraphs and joining again must reproduce the passage."""
    original = TestUtil.create_multi_passage()
    rejoined = convert.join_passages(
        convert.split2paragraphs(original, remarks=True)
    )
    # Runs before the assertion; presumably reports differences to help
    # debug a failure - confirm against diffutil.
    diffutil.diff_passages(original, rejoined)
    self.assertTrue(original.equals(rejoined))