def main(args):
    """Visualize passages as TikZ text, standoff annotations, or matplotlib figures.

    :param args: ArgumentParser namespace with passages, tikz, standoff, sentences,
        out_dir, node_ids and format attributes
    """
    if args.out_dir:
        os.makedirs(args.out_dir, exist_ok=True)
        if not args.tikz:
            # Writing image files only: switch to a non-interactive backend
            # before pyplot is imported below
            import matplotlib
            matplotlib.use('Agg')
    # Text output with no output directory goes to stdout, so suppress the progress bar
    to_stdout = (args.tikz or args.standoff) and not args.out_dir
    t = args.passages
    t = get_passages(t) if to_stdout else get_passages_with_progress_bar(
        t, desc="Visualizing")
    if args.sentences:
        # Lazily expand each passage into its sentences
        t = (sentence for passage in t for sentence in split2sentences(passage))
    for passage in t:
        if args.tikz:
            print_text(args, visualization.tikz(passage), passage.ID + ".tikz.txt")
        elif args.standoff:
            print_text(args, visualization.standoff(passage), passage.ID + ".ann")
        else:
            import matplotlib.pyplot as plt
            # Figure size scales with the number of terminals (empirical 19:27 ratio)
            width = len(passage.layer(layer0.LAYER_ID).all) * 19 / 27
            plt.figure(passage.ID, figsize=(width, width * 10 / 19))
            visualization.draw(passage, node_ids=args.node_ids)
            if args.out_dir:
                plt.savefig(
                    os.path.join(args.out_dir, passage.ID + "." + args.format))
                plt.close()  # free the figure; we may draw many passages
            else:
                plt.show()
def test_split(self):
    """Splitting a single-sentence SDP graph converted to UCCA must yield exactly
    one sentence, which round-trips to the same SDP graph."""
    for passage, ref, _ in read_test_sdp():
        split = split2sentences(passage)
        self.assertEqual(len(split), 1, "Should be one sentence: %s" % passage)
        self.convert_and_evaluate(split[0], ref)
def filter_nodes(categories=(), tokens=(), tokens_mode=CONSECUTIVE,
                 case_insensitive=False, comment=False, sentence_level=False, **kwargs):
    """Download annotation tasks and yield the nodes matching the given filters.

    :param categories: edge category tags to match (checked only when the token
        filter did not apply to the node)
    :param tokens: token texts that must appear in a unit for it to match
    :param tokens_mode: how the tokens must appear within the unit (e.g. CONSECUTIVE)
    :param case_insensitive: lowercase both unit tokens and query tokens before matching
    :param comment: also yield nodes that carry "remarks" in their extra dict
    :param sentence_level: iterate over sentence head nodes instead of all layer-1 nodes
    :param kwargs: forwarded to TaskDownloader and to its download_tasks method
    :return: generator of (filter type, node, task id, user id) tuples
    """
    for passage, task_id, user_id in TaskDownloader(**kwargs).download_tasks(**kwargs):
        # Sentence level: only each sentence's head node; otherwise every layer-1 node
        for node in [p.layer(layer1.LAYER_ID).heads[0]
                     for p in convert.split2sentences(passage)] if sentence_level \
                else passage.layer(layer1.LAYER_ID).all:
            if comment and node.extra.get("remarks"):
                yield "comment", node, task_id, user_id
            if tokens and not node.attrib.get("implicit"):
                unit_tokens = [t.text for t in node.get_terminals(punct=True)]
                if case_insensitive:
                    unit_tokens = [x.lower() for x in unit_tokens]
                    # Rebinds the parameter; lowercasing again on later nodes is idempotent
                    tokens = [x.lower() for x in tokens]
                if tokens_match(unit_tokens, tokens, tokens_mode):
                    yield 'TOKENS', node, task_id, user_id
            elif categories:
                intersection = set(categories).intersection(
                    c.tag for e in node for c in e.categories)
                if intersection:
                    yield str(intersection), node, task_id, user_id
def main(args):
    """Split passages into sentences and write each sentence to its own file.

    :param args: ArgumentParser namespace with sentences, enumerate, suffix_format,
        suffix_start, outdir, filenames, remarks, lang, prefix, binary, verbose
        and normalize attributes
    """
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate,
                                  suffix_format=args.suffix_format,
                                  suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0  # running sentence counter, used for enumerated IDs across passages
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(
                passage) if splitter else split2sentences(
                passage, remarks=args.remarks, lang=args.lang,
                ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(
                args.outdir,
                args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():  # avoid clobbering the progress bar
                    print("Writing passage file for sentence '%s'..." % outfile,
                          file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    # Report reference sentences that never matched any passage text
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        print("Unmatched sentences:", *[
            s for i, s in enumerate(splitter.sentences)
            if i not in splitter.matched_indices
        ], sep="\n")
def main(output=None, comment=False, sentence_level=False, categories=(), tokens=(),
         tokens_mode=CONSECUTIVE, case_insensitive=False, write=False, **kwargs):
    """Download annotation tasks, collect nodes matching the given filters, and
    optionally write the matches to a TSV file.

    :param output: path of a TSV file to write matches to (no file written if falsy)
    :param comment: also collect nodes that carry "remarks" in their extra dict
    :param sentence_level: iterate over sentence head nodes instead of all layer-1 nodes
    :param categories: edge category tags to match when the token filter did not apply
    :param tokens: token texts that must appear in a unit for it to match
    :param tokens_mode: how the tokens must appear within the unit (e.g. CONSECUTIVE)
    :param case_insensitive: lowercase both unit tokens and query tokens before matching
    :param write: accepted for interface compatibility; download_tasks is always
        called with write=False here
    :param kwargs: forwarded to TaskDownloader and to its download_tasks method
    """
    filtered_nodes = []
    for passage, task_id, user_id in TaskDownloader(**kwargs).download_tasks(write=False, **kwargs):
        if sentence_level:
            cur_passages = convert.split2sentences(passage)
            all_nodes = [P.layer('1').heads[0] for P in cur_passages]
        else:
            all_nodes = list(passage.layer(layer1.LAYER_ID).all)
        for node in all_nodes:
            if comment and node.extra.get("remarks"):
                filtered_nodes.append(("comment", node, task_id, user_id))
            if tokens and not node.attrib.get("implicit"):
                unit_tokens = [t.text for t in node.get_terminals(punct=True)]
                if case_insensitive:
                    unit_tokens = [x.lower() for x in unit_tokens]
                    # Rebinds the parameter; lowercasing again later is idempotent
                    tokens = [x.lower() for x in tokens]
                if tokens_match(unit_tokens, tokens, tokens_mode):
                    filtered_nodes.append(('TOKENS', node, task_id, user_id))
            else:
                # Category filter: gather every tag on every outgoing edge
                all_tags = []
                for edge in node:
                    all_tags.extend([c.tag for c in edge.categories])
                if all_tags:
                    intersection = set(categories) & set(all_tags)
                    if intersection:
                        filtered_nodes.append((str(list(intersection)), node, task_id, user_id))
    if output:
        with open(output, 'w') as f:
            for filter_type, node, task_id, user_id in filtered_nodes:
                ancestor = get_top_level_ancestor(node)
                # Newlines in remarks are flattened so each match stays on one TSV row
                print(filter_type, task_id, user_id, node.extra.get("tree_id"),
                      node.to_text(), ancestor,
                      str(node.extra.get("remarks")).replace("\n", "|"),
                      file=f, sep="\t")
def test_split(self):
    """Splitting a single-sentence Universal Dependencies tree converted to UCCA
    must yield exactly one sentence, which round-trips to the same tree."""
    for passage, ref, _ in read_test_conllu():
        split = split2sentences(passage)
        self.assertEqual(len(split), 1, "Should be one sentence: %s" % passage)
        self.convert_and_evaluate(split[0], ref)
def test_split(self):
    """Splitting a single-sentence AMR converted to UCCA must yield exactly one
    sentence whose AMR conversion evaluates to a perfect F1 against the reference."""
    for passage, ref, amr_id in read_test_amr():
        split = split2sentences(passage)
        self.assertEqual(len(split), 1, "Should be one sentence: %s" % passage)
        converted = "\n".join(to_amr(split[0], metadata=False))
        scores = evaluate(converted, ref, amr_id)
        self.assertAlmostEqual(scores.f1, 1, msg=converted)
def convert_passage(filename, converter, args):
    """Opens a passage file and returns a string after conversion

    :param filename: input passage file
    :param converter: function to use for conversion
    :param args: ArgumentParser object
    """
    passage = file2passage(filename)
    if args.sentences:
        passages = convert.split2sentences(passage)
    else:
        passages = [passage]
    lines = []
    for p in passages:
        lines.extend(converter(p, args.test, args.tree, args.markaux))
    return "\n".join(lines), passage.ID
def convert_passage(filename, converter, args):
    """Opens a passage file and returns a string after conversion

    :param filename: input passage file
    :param converter: function to use for conversion
    :param args: ArgumentParser object
    """
    passage = file2passage(filename)
    split = convert.split2sentences(passage) if args.sentences else [passage]
    converted = (converter(p, args.test, args.tree, args.markaux) for p in split)
    output = "\n".join(line for lines in converted for line in lines)
    return output, passage.ID
def main(args):
    """Split passages into sentences and write each sentence to its own file.

    :param args: ArgumentParser namespace with sentences, enumerate, outdir,
        filenames, remarks, lang, prefix, binary and normalize attributes
    """
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0  # running sentence counter, used for enumerated IDs across passages
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(passage) if splitter else split2sentences(
                passage, remarks=args.remarks, lang=args.lang,
                ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID +
                                   (".pickle" if args.binary else ".xml"))
            with external_write_mode():  # avoid clobbering the progress bar
                print("Writing passage file for sentence '%s'..." % outfile,
                      file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
def test_split2sentences():
    """Tests splitting a passage by sentence ends. """
    passage = multi_sent()
    sentences = convert.split2sentences(passage)
    assert len(sentences) == 3
    texts = [[terminal.text for terminal in s.layer(layer0.LAYER_ID).all]
             for s in sentences]
    assert texts[0] == ["1", "2", "3", "."]
    assert texts[1] == ["5", "6", "."]
    assert texts[2] == ["8", ".", "10", "."]
    # Every terminal of every sentence stays in paragraph 1
    for s in sentences:
        for terminal in s.layer(layer0.LAYER_ID).all:
            assert terminal.paragraph == 1
    # Each sentence has a single top scene, attached as a parallel scene
    for s in sentences:
        scenes = s.layer(layer1.LAYER_ID).top_scenes
        assert len(scenes) == 1
        assert scenes[0].incoming[0].tag == layer1.EdgeTags.ParallelScene
def main(args):
    """Split passages into sentences and write each sentence to its own file.

    :param args: ArgumentParser namespace with sentences, filenames, remarks,
        lang, outdir, prefix and binary attributes
    """
    order = None
    if args.sentences:
        with open(args.sentences, encoding="utf-8") as f:
            # Map each reference sentence (stripped) to its line index
            order = dict(map(reversed, enumerate(map(str.strip, f))))
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in split(passage, order) if order else split2sentences(
                passage, remarks=args.remarks, lang=args.lang):
            outfile = os.path.join(
                args.outdir,
                args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            with tqdm.external_write_mode():  # avoid clobbering the progress bar
                print("Writing passage file for sentence '%s'..." % outfile,
                      file=sys.stderr)
            passage2file(sentence, outfile, args.binary)
def write_passage(passage, args):
    """Write a single passage to a file in the requested output format.

    :param passage: UCCA Passage object to write
    :param args: ArgumentParser namespace with outdir, prefix, binary, output_format,
        split, test, tree and mark_aux attributes
    """
    # UCCA native output has no entry for its format name; AMR is plain text
    ext = {None: UCCA_EXT[args.binary], "amr": ".txt"}.get(args.output_format) \
        or "." + args.output_format
    # os.path.join handles separators portably, unlike manual "+ os.path.sep +"
    outfile = os.path.join(args.outdir, args.prefix + passage.ID + ext)
    sys.stderr.write("Writing '%s'...\n" % outfile)
    if args.output_format is None:  # UCCA output
        ioutil.passage2file(passage, outfile, args.binary)
    else:
        converter = CONVERTERS[args.output_format][1]
        if args.output_format == "amr":
            output = "\n".join(converter(passage))
        else:
            # Optionally split into sentences before converting each piece
            output = "\n".join(
                line for p in (convert.split2sentences(passage) if args.split else [passage])
                for line in converter(p, test=args.test, tree=args.tree, mark_aux=args.mark_aux))
        with open(outfile, "w", encoding="utf-8") as f:
            print(output, file=f)
def main(args):
    """For every sentence of every passage, print the sentence text followed by
    the path from each terminal (or its compound parent) to the root.

    :param args: ArgumentParser namespace with a passages attribute
    """
    for passage in get_passages_with_progress_bar(args.passages):
        for i, sen in enumerate(split2sentences(passage)):
            print('sentence %d\n\n%s\n' % (i, convert.to_text(sen)))
            compounds = []  # compound ("C") parent IDs already printed, to avoid duplicates
            for node in sen.nodes:
                if sen.nodes[node].layer.ID == '0':  # terminals live in layer 0
                    terminal = sen.nodes[node]
                    parent = terminal.parents[0]
                    if parent.ftag == 'C':
                        # Print the whole compound once, when its first terminal is seen
                        if parent.ID not in compounds:
                            compounds.append(parent.ID)
                            path = [' '.join(child.text for child in parent.children)]
                            path = find_path(sen.nodes[parent.ID], path)
                            print(' '.join(path))
                            print('-------')
                    else:
                        path = find_path(sen.nodes[terminal.ID], [])
                        print(' '.join(path))
                        print('-------')
            print(
                '------------------------------------------------------------------'
            )
def filter_nodes(categories=(), tokens=(), tokens_mode=CONSECUTIVE,
                 case_insensitive=False, comment=False, sentence_level=False, **kwargs):
    """Download annotation tasks and yield the nodes matching the given filters.

    :param categories: edge category tags to match (checked only when the token
        filter did not apply to the node)
    :param tokens: token texts that must appear in a unit for it to match
    :param tokens_mode: how the tokens must appear within the unit (e.g. CONSECUTIVE)
    :param case_insensitive: lowercase both unit tokens and query tokens before matching
    :param comment: also yield nodes that carry "remarks" in their extra dict
    :param sentence_level: iterate over sentence head nodes instead of all layer-1 nodes
    :param kwargs: forwarded to TaskDownloader and to its download_tasks method
    :return: generator of (filter type, node, task id, user id) tuples
    """
    for passage, task_id, user_id in TaskDownloader(**kwargs).download_tasks(**kwargs):
        # Sentence level: only each sentence's head node; otherwise every layer-1 node
        for node in [p.layer(layer1.LAYER_ID).heads[0]
                     for p in convert.split2sentences(passage)] if sentence_level \
                else passage.layer(layer1.LAYER_ID).all:
            if comment and node.extra.get("remarks"):
                yield "comment", node, task_id, user_id
            if tokens and not node.attrib.get("implicit"):
                unit_tokens = [t.text for t in node.get_terminals(punct=True)]
                if case_insensitive:
                    unit_tokens = [x.lower() for x in unit_tokens]
                    # Rebinds the parameter; lowercasing again on later nodes is idempotent
                    tokens = [x.lower() for x in tokens]
                if tokens_match(unit_tokens, tokens, tokens_mode):
                    yield 'TOKENS', node, task_id, user_id
            elif categories:
                intersection = set(categories).intersection(c.tag for e in node
                                                            for c in e.categories)
                if intersection:
                    yield str(intersection), node, task_id, user_id
def test_split2sentences(self):
    """Tests splitting a passage by sentence ends. """
    passage = TestUtil.create_multi_passage()
    sentences = convert.split2sentences(passage)
    self.assertEqual(len(sentences), 3)
    texts = [[terminal.text for terminal in s.layer(layer0.LAYER_ID).all]
             for s in sentences]
    self.assertSequenceEqual(texts[0], ["1", "2", "3", "."])
    self.assertSequenceEqual(texts[1], ["5", "6", "."])
    self.assertSequenceEqual(texts[2], ["8", ".", "10", "."])
    # First two sentences belong to paragraph 1, the last one to paragraph 2
    self.assertTrue(all(terminal.paragraph == 1
                        for s in sentences[0:2]
                        for terminal in s.layer(layer0.LAYER_ID).all))
    self.assertTrue(all(terminal.paragraph == 2
                        for terminal in sentences[2].layer(layer0.LAYER_ID).all))
    # Each sentence has a single top scene, attached as a parallel scene
    for s in sentences:
        scenes = s.layer(layer1.LAYER_ID).top_scenes
        self.assertEqual(len(scenes), 1)
        self.assertEqual(scenes[0].incoming[0].tag, layer1.EdgeTags.ParallelScene)
def test_split2sentences(self):
    """Tests splitting a passage by sentence ends. """
    passage = TestUtil.create_multi_passage()
    sentences = convert.split2sentences(passage)
    self.assertEqual(len(sentences), 3)
    texts = [[terminal.text for terminal in s.layer(layer0.LAYER_ID).all]
             for s in sentences]
    self.assertSequenceEqual(texts[0], ["1", "2", "3", "."])
    self.assertSequenceEqual(texts[1], ["5", "6", "."])
    self.assertSequenceEqual(texts[2], ["8", ".", "10", "."])
    # All terminals of all sentences stay in paragraph 1
    self.assertTrue(all(terminal.paragraph == 1
                        for s in sentences
                        for terminal in s.layer(layer0.LAYER_ID).all))
    # Each sentence has a single top scene, attached as a parallel scene
    for s in sentences:
        scenes = s.layer(layer1.LAYER_ID).top_scenes
        self.assertEqual(len(scenes), 1)
        self.assertEqual(scenes[0].incoming[0].tag, layer1.EdgeTags.ParallelScene)
def main(args):
    """Split passages into sentences and write each sentence to its own file.

    :param args: ArgumentParser namespace with sentences, enumerate, suffix_format,
        suffix_start, outdir, filenames, remarks, lang, prefix, binary, verbose
        and normalize attributes
    """
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate,
                                  suffix_format=args.suffix_format,
                                  suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0  # running sentence counter, used for enumerated IDs across passages
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(passage) if splitter else split2sentences(
                passage, remarks=args.remarks, lang=args.lang,
                ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID +
                                   (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():  # avoid clobbering the progress bar
                    print(sentence, file=sys.stderr)
                    print("Writing passage file for sentence '%s'..." % outfile,
                          file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    # Report reference sentences that never matched any passage text
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        print("", "Unmatched sentences:",
              *[s for i, s in enumerate(splitter.sentences)
                if i not in splitter.matched_indices], sep="\n")
def test_split_join_sentences(create):
    """Splitting a passage into sentences and joining them back must reproduce it."""
    passage = create()
    sentences = convert.split2sentences(passage, remarks=True)
    rejoined = convert.join_passages(sentences)
    diffutil.diff_passages(passage, rejoined)
    assert passage.equals(rejoined)
def test_split_join_sentences(self):
    """Splitting a passage into sentences and joining them back must reproduce it."""
    passage = TestUtil.create_multi_passage()
    sentences = convert.split2sentences(passage, remarks=True)
    rejoined = convert.join_passages(sentences)
    diffutil.diff_passages(passage, rejoined)
    self.assertTrue(passage.equals(rejoined))
def main(args):
    """Print each sentence of each passage, then interactively look up a word typed
    by the user and print the path from that word's node to the root.

    :param args: ArgumentParser namespace with a passages attribute
    """
    for passage in get_passages_with_progress_bar(args.passages):
        xmltoconll(passage)
        for i, sen in enumerate(split2sentences(passage)):
            print('sentence %d\n\n%s\n' % (i, convert.to_text(sen)))
        while True:  # prompt repeatedly until the user interrupts
            word = input('\nType the word below\n\n')
            # Reset each round: previously the path from the last query leaked
            # through (or raised NameError the first time) when no node matched
            path = []
            for node in passage.nodes:
                t = passage.nodes[node]
                # re.escape keeps regex metacharacters in user input literal
                if re.match(rf'\b{re.escape(word)}\b', t.text, re.IGNORECASE):
                    path = find_path(passage.nodes[t.ID], path)
                    break
            print(' '.join(path))
def main(args):
    """Print, for every sentence of every passage, an ASCII tree of the edges
    below the root node, aligning child branches under their parents.

    :param args: ArgumentParser namespace with a passages attribute
    """
    for passage in get_passages_with_progress_bar(args.passages):
        t = split2sentences(passage)
        sen_no = 0
        for sen in t:
            print('sentence %d\n\n%s\n' % (sen_no, convert.to_text(sen)))
            root = sen.nodes['1.1']  # assumes the sentence root has ID "1.1" -- TODO confirm
            first = 1
            # tab_len maps nesting level -> printed-line length at that level,
            # used to indent continuation lines under their parent branch
            tab_len = {}
            tab_len[0] = len('1.1')
            for i in root.children:
                print('\n')
                path = []
                level = 1
                # Each path entry looks like (edge tag, node ID, level, is_remote)
                path.append((i.ftag, i.ID, level, False))
                path = find_children(i, path, level)
                end = 0
                if (first):
                    pstr = root.ID  # first branch starts with the root ID itself
                    first = 0
                else:
                    # Later branches are padded to line up under the root ID
                    for k in range(0, tab_len[0]):
                        pstr = pstr + ' '
                for j in path:
                    if (j == 'End'):
                        # 'End' marks the end of a branch: flush the line and
                        # start building the next one
                        print(pstr)
                        pstr = ''
                        end = 1
                        continue
                    rel = j[0]
                    nd = j[1]
                    tab = int(j[2])
                    remote = j[3]
                    if (end):
                        # After a flush, pad up to the parent level, dropping
                        # '.' markers at each recorded column to show alignment
                        q_mark = 0
                        for k in range(0, tab_len[tab - 1]):
                            if (k == tab_len[q_mark]):
                                pstr = pstr + '.'
                                q_mark += 1
                            else:
                                pstr = pstr + ' '
                        end = 0
                    # Expand the edge tag with its description when available
                    if (rel in descr):
                        rel_desc = rel + ':' + descr[rel]
                    else:
                        rel_desc = rel
                    if (remote):
                        pstr = pstr + '|-->Remote(' + rel_desc + ')-->' + nd
                    else:
                        pstr = pstr + '|-->(' + rel_desc + ')-->' + nd
                    tab_len[tab] = len(pstr)  # remember column for this level
            print('-----------------------------------\n')
            sen_no += 1