def test_site_terminals():
    elem = load_xml("test_files/site1.xml")
    passage = convert.from_site(elem)
    terms = passage.layer(layer0.LAYER_ID).all
    assert passage.ID == "118"
    assert len(terms) == 15

    # There are two punctuation signs (dots, positions 5 and 11), which
    # also serve as paragraph end points. All others are words whose text
    # is their positions, so test that both text, punctuation (yes/no)
    # and paragraphs are converted correctly
    for i, t in enumerate(terms):
        # i starts at 0, positions at 1, hence 5,11 ==> 4,10
        if i in (4, 10):
            assert t.text == "." and t.punct
        else:
            assert t.text == str(i + 1) and not t.punct
        if i < 5:
            par = 1
        elif i < 11:
            par = 2
        else:
            par = 3
        assert t.paragraph == par

def test_site_terminals(self):
    elem = self._load_xml('./site1.xml')
    passage = convert.from_site(elem)
    terms = passage.layer(layer0.LAYER_ID).all
    self.assertEqual(passage.ID, '118')
    self.assertEqual(len(terms), 15)

    # There are two punctuation signs (dots, positions 5 and 11), which
    # also serve as paragraph end points. All others are words whose text
    # is their positions, so test that both text, punctuation (yes/no)
    # and paragraphs are converted correctly
    for i, t in enumerate(terms):
        # i starts at 0, positions at 1, hence 5,11 ==> 4,10
        if i in (4, 10):
            self.assertTrue(t.text == '.' and t.punct is True)
        else:
            self.assertTrue(t.text == str(i + 1) and t.punct is False)
        if i < 5:
            par = 1
        elif i < 11:
            par = 2
        else:
            par = 3
        self.assertEqual(t.paragraph, par)

def test_site_simple(self):
    elem = self._load_xml('./site2.xml')
    passage = convert.from_site(elem)
    terms = passage.layer(layer0.LAYER_ID).all
    l1 = passage.layer('1')

    # The Terminals in the passage are just like in test_site_terminals,
    # with this layer1 hierarchy: [[1 C] [2 E] L] [3 4 . H]
    # with the linker having a remark and the parallel scene being uncertain
    head = l1.heads[0]
    self.assertEqual(len(head), 12)  # including all 'unused' terminals
    self.assertEqual(head[9].tag, layer1.EdgeTags.Linker)
    self.assertEqual(head[10].tag, layer1.EdgeTags.ParallelScene)
    linker = head.children[9]
    self._test_edges(linker, [layer1.EdgeTags.Center,
                              layer1.EdgeTags.Elaborator])
    self.assertEqual(linker.extra['remarks'], '"remark"')
    center = linker.children[0]
    elab = linker.children[1]
    self._test_terms(center, terms[0:1])
    self._test_terms(elab, terms[1:2])
    ps = head.children[10]
    self._test_edges(ps, [layer1.EdgeTags.Terminal,
                          layer1.EdgeTags.Terminal,
                          layer1.EdgeTags.Punctuation])
    self.assertTrue(ps.attrib.get('uncertain'))
    self.assertEqual(ps.children[0], terms[2])
    self.assertEqual(ps.children[1], terms[3])
    self.assertEqual(ps.children[2].children[0], terms[4])

def test_to_standard():
    passage = convert.from_site(load_xml("test_files/site3.xml"))
    ref = load_xml("test_files/standard3.xml")  # old format of xml
    new_ref = convert.to_standard(convert.from_standard(ref))  # converting to the new xml format
    root = convert.to_standard(passage)
    assert (textutil.indent_xml(ETree.tostring(new_ref)).splitlines() ==
            textutil.indent_xml(ETree.tostring(root)).splitlines())

def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.filename, encoding="utf-8") as f:
        t = list(map(str.split, f))
    if not args.verbose:
        t = tqdm(t, desc="Downloading", unit=" passages")
    for passage_id, id_field in t:
        if not args.verbose:
            t.set_postfix({"passage_id": passage_id, args.method: id_field})
        if args.verbose:
            with external_write_mode():
                print("Getting passage " + passage_id + " with " + args.method + "=" + id_field, end="\t")
        xml_root = get_by_method(id_field=id_field.split(","), passage_id=passage_id, **vars(args))
        if xml_root is None:
            continue
        if args.write_site:
            site_filename = passage_id + "_site_download.xml"
            with open(site_filename, "w", encoding="utf-8") as fsite:
                print(tostring(xml_root).decode(), file=fsite)
            if args.verbose:
                with external_write_mode():
                    print("Wrote '%s'" % site_filename)
        if args.write:
            write_passage(convert.from_site(xml_root), outdir=args.outdir, verbose=args.verbose)

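A minimal sketch of the command-line setup that `main(args)` above assumes; the flag names are hypothetical, chosen only to mirror the attributes the function reads (`filename`, `outdir`, `method`, `write`, `write_site`, `verbose`):

import argparse

def parse_args():
    # Hypothetical parser; flag names mirror the attributes main(args) reads.
    ap = argparse.ArgumentParser(description="download passages and convert them from site XML")
    ap.add_argument("filename", help="file with one '<passage ID> <ID field>' pair per line")
    ap.add_argument("-o", "--outdir", default=".", help="directory for converted passages")
    ap.add_argument("--method", default="xid", help="lookup method, also used as the postfix key")
    ap.add_argument("--write", action="store_true", help="write converted passages")
    ap.add_argument("--write-site", action="store_true", help="also dump the raw site XML")
    ap.add_argument("-v", "--verbose", action="store_true")
    return ap.parse_args()

if __name__ == "__main__":
    main(parse_args())
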
def test_site_simple(self):
    elem = TestUtil.load_xml("test_files/site2.xml")
    passage = convert.from_site(elem)
    terms = passage.layer(layer0.LAYER_ID).all
    l1 = passage.layer("1")

    # The Terminals in the passage are just like in test_site_terminals,
    # with this layer1 hierarchy: [[1 C] [2 E] L] [3 4 . H]
    # with the linker having a remark and the parallel scene being uncertain
    head = l1.heads[0]
    self.assertEqual(len(head), 12)  # including all "unused" terminals
    self.assertEqual(head[9].tag, layer1.EdgeTags.Linker)
    self.assertEqual(head[10].tag, layer1.EdgeTags.ParallelScene)
    linker = head.children[9]
    self._test_edges(linker, [layer1.EdgeTags.Center,
                              layer1.EdgeTags.Elaborator])
    self.assertEqual(linker.extra["remarks"], '"remark"')
    center = linker.children[0]
    elab = linker.children[1]
    self._test_terms(center, terms[0:1])
    self._test_terms(elab, terms[1:2])
    ps = head.children[10]
    self._test_edges(ps, [layer1.EdgeTags.Terminal,
                          layer1.EdgeTags.Terminal,
                          layer1.EdgeTags.Punctuation])
    self.assertTrue(ps.attrib.get("uncertain"))
    self.assertEqual(ps.children[0], terms[2])
    self.assertEqual(ps.children[1], terms[3])
    self.assertEqual(ps.children[2].children[0], terms[4])

def fix_tokenization(passage, words_set, lang, cw):
    tokenizer = get_tokenizer(lang=lang)
    elem = to_site(passage)
    state = State()
    ever_changed = False
    for paragraph in elem.iterfind(SiteCfg.Paths.Paragraphs):
        while True:
            changed = False
            terminals = list(paragraph.iter(SiteCfg.Tags.Terminal))
            preterminals = get_parents(paragraph, terminals)
            preterminal_parents = get_parents(paragraph, preterminals)
            is_puncts = [p.get(SiteCfg.Attr.ElemTag) == SiteCfg.Types.Punct
                         for p in preterminals]
            for i in false_indices(is_puncts):
                start, end = expand_to_neighboring_punct(i, is_puncts)
                if retokenize(i, start, end, terminals, preterminals, preterminal_parents,
                              passage.ID, tokenizer, state, cw, words_set):
                    ever_changed = changed = True
                    break
            if not changed:
                break
    return from_site(elem) if ever_changed else None

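A usage sketch for `fix_tokenization`, assuming `passage` is a `ucca.core.Passage` and `vocab` is a set of known word forms; the `cw` argument is forwarded unchanged to `retokenize`, so `False` below is only a placeholder:

fixed = fix_tokenization(passage, words_set=vocab, lang="en", cw=False)
if fixed is not None:  # None means no paragraph needed retokenizing
    passage = fixed
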
def test_site_terminals(self):
    elem = TestUtil.load_xml("test_files/site1.xml")
    passage = convert.from_site(elem)
    terms = passage.layer(layer0.LAYER_ID).all
    self.assertEqual(passage.ID, "118")
    self.assertEqual(len(terms), 15)

    # There are two punctuation signs (dots, positions 5 and 11), which
    # also serve as paragraph end points. All others are words whose text
    # is their positions, so test that both text, punctuation (yes/no)
    # and paragraphs are converted correctly
    for i, t in enumerate(terms):
        # i starts at 0, positions at 1, hence 5,11 ==> 4,10
        if i in (4, 10):
            self.assertTrue(t.text == "." and t.punct is True)
        else:
            self.assertTrue(t.text == str(i + 1) and t.punct is False)
        if i < 5:
            par = 1
        elif i < 11:
            par = 2
        else:
            par = 3
        self.assertEqual(t.paragraph, par)

def test_site_simple():
    elem = load_xml("test_files/site2.xml")
    passage = convert.from_site(elem)
    terms = passage.layer(layer0.LAYER_ID).all
    l1 = passage.layer("1")

    # The Terminals in the passage are just like in test_site_terminals,
    # with this layer1 hierarchy: [[1 C] [2 E] L] [3 4 . H]
    # with the linker having a remark and the parallel scene being uncertain
    head = l1.heads[0]
    assert len(head) == 12  # including all "unused" terminals
    assert head[9].tag == layer1.EdgeTags.Linker
    assert head[10].tag == layer1.EdgeTags.ParallelScene
    linker = head.children[9]
    _test_edges(linker, [layer1.EdgeTags.Center, layer1.EdgeTags.Elaborator])
    assert linker.extra["remarks"] == '"remark"'
    center = linker.children[0]
    elab = linker.children[1]
    _test_terms(center, terms[0:1])
    _test_terms(elab, terms[1:2])
    ps = head.children[10]
    _test_edges(ps, [layer1.EdgeTags.Terminal,
                     layer1.EdgeTags.Terminal,
                     layer1.EdgeTags.Punctuation])
    assert ps.attrib.get("uncertain")
    assert ps.children[0] == terms[2]
    assert ps.children[1] == terms[3]
    assert ps.children[2].children[0] == terms[4]

def main():
    opt_parser = cmd_line_parser()
    (options, args) = opt_parser.parse_args()
    if len(args) > 0:
        opt_parser.error("all arguments must be flagged")
    if (options.guessed is None) or (options.ref is None) or (options.db_filename is None):
        opt_parser.error("missing arguments. type --help for help.")
    if options.pid is not None and options.from_xids is not None:
        opt_parser.error("inconsistent parameters. you can't have both pid and from_xids parameters.")
    keys = [options.guessed, options.ref]
    if options.from_xids:
        xmls = ucca_db.get_by_xids(options.db_filename, options.host, keys)
    else:
        xmls = ucca_db.get_xml_trees(options.db_filename, options.host, options.pid, keys)
    guessed, ref = [convert.from_site(x) for x in xmls]
    if options.units or options.fscore or options.errors:
        evaluate(guessed, ref, units=options.units, fscore=options.fscore,
                 errors=options.errors, verbose=True)

def main(args):
    keys = [args.guessed, args.ref]
    xmls = api.get_by_xids(db_name=args.db_filename, host_name=args.host, xids=keys) if args.from_xids else \
        api.get_xml_trees(db_name=args.db_filename, host_name=args.host, pid=args.pid, usernames=keys)
    guessed, ref = [convert.from_site(x) for x in xmls]
    if args.units or args.fscore or args.errors:
        evaluate(guessed, ref, units=args.units, fscore=args.fscore, errors=args.errors,
                 constructions=args.constructions, verbose=True)

def main(argv):
    t = tqdm(globals()[argv[1]]("pgserver", "work", *argv[2:]),
             unit=" passages", desc="Downloading XMLs")
    for xml in t:
        p = convert.from_site(xml)
        t.set_postfix(ID=p.ID)
        convert.passage2file(p, p.ID + ".xml")

def test_site_discontiguous_with_remote():
    elem = load_xml("test_files/site4.xml")
    passage = convert.from_site(elem)
    s1 = passage.layer(layer1.LAYER_ID).heads[0].state
    remote_a1 = [e.child for e in s1
                 if e.attrib.get("remote") and e.tag == layer1.EdgeTags.Participant]
    assert len(remote_a1) == 1
    a1 = remote_a1[0]
    remote_a2 = [e.child for e in a1
                 if e.attrib.get("remote") and e.tag == layer1.EdgeTags.Participant]
    assert len(remote_a2) == 1

def test_site_discontiguous_with_implicit():
    elem = load_xml("test_files/site5.xml")
    passage = convert.from_site(elem)
    s1 = passage.layer(layer1.LAYER_ID).heads[0].state
    remote_t1 = [e.child for e in s1
                 if e.child.attrib.get("implicit") and e.tag == layer1.EdgeTags.Time]
    assert len(remote_t1) == 1

def print_passages_to_file(host_name, db_name, paids, write_xml=False,
                           write_site_xml=False, prefix='', start_index=0):
    """
    Prints, for each passage ID in paids, a tab-separated line with the
    passage ID, annotating user, source, and the most recent submitted xid
    with its timestamp.

    write_xml: whether to also write each passage in standard format to a
        file named <prefix><passage ID>.xml
    write_site_xml: whether to also write the raw site XML to a file named
        <prefix><passage ID>_site.xml
    start_index: the index of the passage where it should start looking
        (the ones before are skipped)
    """
    c = get_cursor(host_name, db_name)
    for paid in paids:
        if paid < start_index:  # skipping training passages
            continue
        c.execute("SELECT passage,source FROM passages WHERE id=%s", (paid,))
        r = c.fetchone()
        if r is not None:
            source = r[1]
            c.execute("SELECT id, xml,uid,ts FROM xmls WHERE paid=%s ORDER BY ts DESC", (paid,))
            r = c.fetchone()
            if r is not None:
                xid = r[0]
                uid = r[2]
                ts = r[3]
                print('\t'.join([str(paid), str(uid), str(source), str(xid), str(ts)]))
                if write_site_xml:
                    f = open(prefix + str(paid) + '_site.xml', 'w', encoding='utf-8')
                    f.write(r[1] + '\n')
                    f.close()
                # noinspection PyBroadException
                try:
                    ucca_dag = convert.from_site(fromstring(r[1]))
                except Exception:
                    sys.stderr.write("Skipped xid,paid " + str((xid, paid)) + "\n")
                    continue
                if write_xml:
                    f = open(prefix + str(paid) + '.xml', 'w')
                    f.write(tostring(convert.to_standard(ucca_dag)).decode())
                    f.close()

def main():
    db_name = sys.argv[1]
    with open(db_name + '.xids') as f:
        xids = tuple(int(x.strip()) for x in f.readlines())
    conn = sqlite3.connect(db_name + '.db')
    c = conn.cursor()
    print("SELECT xml FROM xmls WHERE id IN " + str(xids))
    c.execute("SELECT xml FROM xmls WHERE id IN " + str(xids))
    passages = [convert.from_site(ETree.fromstring(x[0])) for x in c]
    print(set(p.ID for p in passages))
    with open(db_name + '.pickle', 'wb') as f:
        pickle.dump(passages, f)

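Note that interpolating `str(xids)` breaks for a single-element tuple (Python renders it as `(5,)`, which is not valid SQL) and invites injection; a parameterized variant using standard sqlite3 placeholders would be:

placeholders = ", ".join("?" * len(xids))
c.execute("SELECT xml FROM xmls WHERE id IN (%s)" % placeholders, xids)
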
def get_predicates(host_name, db_name, only_complex=True, start_index=100):
    """
    Returns a list of all the predicates in the UCCA corpus.
    only_complex -- if True, only the multi-word predicates will be returned.
    start_index -- the minimal passage number to be taken into account.
    """
    def _complex(u):
        """Returns True if u is complex, i.e., if it has more than one child
        which is not an F or punct"""
        if u is None or u.tag != 'FN':
            return False
        non_function_count = 0
        non_function_u = None
        for e in u.outgoing:
            if e.child.tag == 'FN' and e.tag != 'F':
                non_function_count += 1
                non_function_u = e.child
        return True if non_function_count > 1 else _complex(non_function_u)

    predicate_distribution = collections.Counter()
    c = get_cursor(host_name, db_name)
    # uid = get_uid(host_name, db_name, username)
    # get all the completed xmls
    c.execute("SELECT id, xml FROM xmls WHERE status=%s AND reviewOf<>%s ORDER BY ts DESC", (1, -1))
    L = c.fetchall()
    wspace = re.compile("\\s+")
    with open('preds', 'w') as f:
        for r in tqdm(L):
            xid = r[0]
            try:
                ucca_dag = convert.from_site(fromstring(r[1]))
            except Exception:
                print("Skipped.", file=sys.stderr)
                continue
            # gathering statistics
            scenes = [x for x in ucca_dag.layer("1").all
                      if x.tag == "FN" and x.is_scene()]
            temp = []
            for sc in scenes:
                main_relation = sc.process if sc.process is not None else sc.state
                if only_complex and not _complex(main_relation):
                    continue
                try:
                    print(main_relation.to_text(), file=f)
                except UnicodeEncodeError:
                    print("Skipped (encoding issue).", file=sys.stderr)
                    continue

def test_site_advanced(self):
    elem = self._load_xml('./site3.xml')
    passage = convert.from_site(elem)
    terms = passage.layer(layer0.LAYER_ID).all
    l1 = passage.layer('1')

    # This passage has the same terminals as the simple and terminals tests,
    # and has the same layer1 units for the first paragraph as the simple
    # test. In addition, it has the following annotation:
    # [6 7 8 9 H] [10 F] .
    # the 6-9 H has a remote D which is [10 F]. Inside 6-9, we have [8 S]
    # and [6 7 .. 9 A], where [6 E] and [7 .. 9 C].
    # [12 H] [13 H] [14 H] [15 L], where linkage 15 links 12, 13 and 14 and
    # [15 L] has an implicit Center unit
    head, lkg = l1.heads
    self._test_edges(head, [layer1.EdgeTags.Linker,
                            layer1.EdgeTags.ParallelScene,
                            layer1.EdgeTags.ParallelScene,
                            layer1.EdgeTags.Function,
                            layer1.EdgeTags.Punctuation,
                            layer1.EdgeTags.ParallelScene,
                            layer1.EdgeTags.ParallelScene,
                            layer1.EdgeTags.ParallelScene,
                            layer1.EdgeTags.Linker])

    # we only take what we haven't checked already
    ps1, func, punct, ps2, ps3, ps4, link = head.children[2:]
    self._test_edges(ps1, [layer1.EdgeTags.Participant,
                           layer1.EdgeTags.Process,
                           layer1.EdgeTags.Adverbial])
    self.assertTrue(ps1[2].attrib.get('remote'))
    ps1_a, ps1_p, ps1_d = ps1.children
    self._test_edges(ps1_a, [layer1.EdgeTags.Elaborator, layer1.EdgeTags.Center])
    self._test_terms(ps1_a.children[0], terms[5:6])
    self._test_terms(ps1_a.children[1], terms[6:9:2])
    self._test_terms(ps1_p, terms[7:8])
    self.assertEqual(ps1_d, func)
    self._test_terms(func, terms[9:10])
    self._test_terms(punct, terms[10:11])
    self._test_terms(ps2, terms[11:12])
    self._test_terms(ps3, terms[12:13])
    self._test_terms(ps4, terms[13:14])
    self.assertEqual(len(link), 2)
    self.assertEqual(link[0].tag, layer1.EdgeTags.Center)
    self.assertTrue(link.children[0].attrib.get('implicit'))
    self.assertEqual(link[1].tag, layer1.EdgeTags.Elaborator)
    self.assertEqual(link.children[1][0].tag, layer1.EdgeTags.Terminal)
    self.assertEqual(link.children[1][0].child, terms[14])
    self.assertEqual(lkg.relation, link)
    self.assertSequenceEqual(lkg.arguments, [ps2, ps3, ps4])

def test_site_advanced():
    elem = load_xml("test_files/site3.xml")
    passage = convert.from_site(elem)
    terms = passage.layer(layer0.LAYER_ID).all
    l1 = passage.layer("1")

    # This passage has the same terminals as the simple and terminals tests,
    # and has the same layer1 units for the first paragraph as the simple
    # test. In addition, it has the following annotation:
    # [6 7 8 9 H] [10 F] .
    # the 6-9 H has a remote D which is [10 F]. Inside 6-9, we have [8 S]
    # and [6 7 ... 9 A], where [6 E] and [7 ... 9 C].
    # [12 H] [13 H] [14 H] [15 L], where linkage 15 links 12, 13 and 14 and
    # [15 L] has an implicit Center unit
    head, lkg = l1.heads
    _test_edges(head, [layer1.EdgeTags.Linker,
                       layer1.EdgeTags.ParallelScene,
                       layer1.EdgeTags.ParallelScene,
                       layer1.EdgeTags.Function,
                       layer1.EdgeTags.Punctuation,
                       layer1.EdgeTags.ParallelScene,
                       layer1.EdgeTags.ParallelScene,
                       layer1.EdgeTags.ParallelScene,
                       layer1.EdgeTags.Linker])

    # we only take what we haven't checked already
    ps1, func, punct, ps2, ps3, ps4, link = head.children[2:]
    _test_edges(ps1, [layer1.EdgeTags.Participant,
                      layer1.EdgeTags.Process,
                      layer1.EdgeTags.Adverbial])
    assert ps1[2].attrib.get("remote")
    ps1_a, ps1_p, ps1_d = ps1.children
    _test_edges(ps1_a, [layer1.EdgeTags.Elaborator, layer1.EdgeTags.Center])
    _test_terms(ps1_a.children[0], terms[5:6])
    _test_terms(ps1_a.children[1], terms[6:9:2])
    _test_terms(ps1_p, terms[7:8])
    assert ps1_d == func
    _test_terms(func, terms[9:10])
    _test_terms(punct, terms[10:11])
    _test_terms(ps2, terms[11:12])
    _test_terms(ps3, terms[12:13])
    _test_terms(ps4, terms[13:14])
    assert len(link) == 2
    assert link[0].tag == layer1.EdgeTags.Center
    assert link.children[0].attrib.get("implicit")
    assert link[1].tag == layer1.EdgeTags.Elaborator
    assert link.children[1][0].tag == layer1.EdgeTags.Terminal
    assert link.children[1][0].child == terms[14]
    assert lkg.relation == link
    assert lkg.arguments == [ps2, ps3, ps4]

def test_site_advanced(self):
    elem = TestUtil.load_xml('test_files/site3.xml')
    passage = convert.from_site(elem)
    terms = passage.layer(layer0.LAYER_ID).all
    l1 = passage.layer('1')

    # This passage has the same terminals as the simple and terminals tests,
    # and has the same layer1 units for the first paragraph as the simple
    # test. In addition, it has the following annotation:
    # [6 7 8 9 H] [10 F] .
    # the 6-9 H has a remote D which is [10 F]. Inside 6-9, we have [8 S]
    # and [6 7 ... 9 A], where [6 E] and [7 ... 9 C].
    # [12 H] [13 H] [14 H] [15 L], where linkage 15 links 12, 13 and 14 and
    # [15 L] has an implicit Center unit
    head, lkg = l1.heads
    self._test_edges(head, [layer1.EdgeTags.Linker,
                            layer1.EdgeTags.ParallelScene,
                            layer1.EdgeTags.ParallelScene,
                            layer1.EdgeTags.Function,
                            layer1.EdgeTags.Punctuation,
                            layer1.EdgeTags.ParallelScene,
                            layer1.EdgeTags.ParallelScene,
                            layer1.EdgeTags.ParallelScene,
                            layer1.EdgeTags.Linker])

    # we only take what we haven't checked already
    ps1, func, punct, ps2, ps3, ps4, link = head.children[2:]
    self._test_edges(ps1, [layer1.EdgeTags.Participant,
                           layer1.EdgeTags.Process,
                           layer1.EdgeTags.Adverbial])
    self.assertTrue(ps1[2].attrib.get('remote'))
    ps1_a, ps1_p, ps1_d = ps1.children
    self._test_edges(ps1_a, [layer1.EdgeTags.Elaborator, layer1.EdgeTags.Center])
    self._test_terms(ps1_a.children[0], terms[5:6])
    self._test_terms(ps1_a.children[1], terms[6:9:2])
    self._test_terms(ps1_p, terms[7:8])
    self.assertEqual(ps1_d, func)
    self._test_terms(func, terms[9:10])
    self._test_terms(punct, terms[10:11])
    self._test_terms(ps2, terms[11:12])
    self._test_terms(ps3, terms[12:13])
    self._test_terms(ps4, terms[13:14])
    self.assertEqual(len(link), 2)
    self.assertEqual(link[0].tag, layer1.EdgeTags.Center)
    self.assertTrue(link.children[0].attrib.get('implicit'))
    self.assertEqual(link[1].tag, layer1.EdgeTags.Elaborator)
    self.assertEqual(link.children[1][0].tag, layer1.EdgeTags.Terminal)
    self.assertEqual(link.children[1][0].child, terms[14])
    self.assertEqual(lkg.relation, link)
    self.assertSequenceEqual(lkg.arguments, [ps2, ps3, ps4])

def run_file(path, eng):
    """Site XML file ==> prints list of sceneness results"""
    with open(path) as f:
        root = ETree.ElementTree().parse(f)
    passage = convert.from_site(root)
    words = [x.text for x in passage.layer('0').words]
    print(' '.join(words))
    for word in words:
        all_tagsets = eng.get_forms(word)
        all_postags = set()
        for tagset in all_tagsets.values():
            all_postags.update(tagset)
        print('{}\t{}'.format(word, all_postags))
        if eng.is_dual_vn(word):
            print(all_tagsets)
    print('========')

def run_file(path, eng, stats):
    """Site XML file ==> prints list of sceneness results"""
    with open(path) as f:
        root = ETree.ElementTree().parse(f)
    passage = convert.from_site(root)
    sc = scenes.extract_possible_scenes(passage)
    heads = [scenes.extract_head(x) for x in sc]
    for s, h in zip(sc, heads):
        if h is None:
            stats.heads.append(Result(s))
            continue
        out = eng.get_categories(s, h)
        if out == 'implicit':
            stats.heads.append(Result(s))
        elif out == 'no base form':
            stats.lemmas.append(Result(s, h))
        elif out[2]:
            stats.fulls.append(Result(s, h, *out))
        else:
            stats.no_cats.append(Result(s, h, *out))

def test_to_standard():
    passage = convert.from_site(load_xml("test_files/site3.xml"))
    ref = load_xml("test_files/standard3.xml")  # old format of xml
    new_ref = convert.to_standard(convert.from_standard(ref))  # converting to the new xml format
    root = convert.to_standard(passage)
    assert ETree.tostring(new_ref) == ETree.tostring(root)

def test_to_standard(self):
    passage = convert.from_site(TestUtil.load_xml("test_files/site3.xml"))
    ref = TestUtil.load_xml("test_files/standard3.xml")
    root = convert.to_standard(passage)
    self.assertEqual(ETree.tostring(ref), ETree.tostring(root))

def test_to_site():
    passage = loaded()
    root = convert.to_site(passage)
    copy = convert.from_site(root)
    assert passage.equals(copy)

def test_from_standard():
    passage = loaded()
    ref = convert.from_site(load_xml("test_files/site3.xml"))
    assert passage.equals(ref, ordered=True)

def test_from_standard(self):
    passage = convert.from_standard(self._load_xml('./standard3.xml'))
    ref = convert.from_site(self._load_xml('./site3.xml'))
    self.assertTrue(passage.equals(ref, ordered=True))

def get_tasks(db, host, username):
    """
    Returns for that user a list of submitted passages and a list of
    assigned but not submitted passages. Each passage is given in the format:
    (<passage ID>, <source>, <recent submitted xid or -1 if not submitted>,
     <number of tokens in the passage>, <number of units in the passage>,
     <number of scenes in the passage>, <average length of a scene>).
    It also returns a distribution of the categories.
    """
    output_submitted = []
    category_distribution = Counter()
    # the categories of scenes. can be A, E or H
    scene_distribution = Counter()
    uid = get_uid(db, username)
    cur = get_cursor(db, username)
    cur.execute("SELECT pid,status FROM tasks WHERE uid=" + PLACE_HOLDER, (uid,))
    r = cur.fetchall()
    submitted_paids = [x[0] for x in r if x[1] == 1]
    incomplete_paids = [x[0] for x in r if x[1] == 0]
    wspace = re.compile("\\s+")
    for paid in submitted_paids:
        sum_scene_length = 0
        if paid < 100:  # skipping training passages
            continue
        cur.execute("SELECT passage,source FROM passages WHERE id=" + PLACE_HOLDER, (paid,))
        r = cur.fetchone()
        if r:
            num_tokens = len(wspace.split(r[0])) - 1
            source = r[1]
            cur.execute("SELECT id, xml FROM xmls WHERE paid=" + PLACE_HOLDER +
                        " AND uid=" + PLACE_HOLDER + " AND status=" + PLACE_HOLDER +
                        " ORDER BY ts DESC", (paid, uid, 1))
            r = cur.fetchone()
            if r:
                xid = r[0]
                # noinspection PyBroadException
                try:
                    ucca_dag = convert.from_site(fromstring(r[1]))
                except Exception:
                    sys.stderr.write("Skipped.\n")
                    continue
                num_units = len([x for x in ucca_dag.layer(layer1.LAYER_ID).all
                                 if x.tag == NT.Foundational]) - 1
                for node in ucca_dag.layer(layer1.LAYER_ID).all:
                    category_distribution.update(
                        [e.tag for e in node
                         if e.tag not in [ET.Punctuation, ET.LinkArgument,
                                          ET.LinkRelation, ET.Terminal]])
                # getting the scene categories
                scenes = [x for x in ucca_dag.layer(layer1.LAYER_ID).all
                          if x.tag == NT.Foundational and x.is_scene()]
                scene_distribution.update([linkage_type(sc) for sc in scenes])
                sum_scene_length += sum([unit_length(x) for x in scenes])
                output_submitted.append((paid, source, xid, num_tokens, num_units,
                                         len(scenes), 1.0 * sum_scene_length / len(scenes)))
    return output_submitted, category_distribution, scene_distribution

def test_from_standard(self):
    passage = convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))
    ref = convert.from_site(TestUtil.load_xml("test_files/site3.xml"))
    self.assertTrue(passage.equals(ref, ordered=True))

def test_to_site(self):
    passage = convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))
    root = convert.to_site(passage)
    copy = convert.from_site(root)
    self.assertTrue(passage.equals(copy))

    output.append(output2)
    return output


def get_sentences(P):
    """
    P is the output of the simplification system.
    Return all the sentences in each passage
    """
    dirpath = '/Mypath/System_output'
    folder = nltk.data.find(dirpath)
    corpusReader = nltk.corpus.PlaintextCorpusReader(folder, P)
    d = len(corpusReader.sents())
    return corpusReader.sents()[:d]


index = list(range(0, 100))
for t in index:
    f1 = open('UCCAannotated_source/%s.xml' % t)
    xml_string1 = f1.read()
    f1.close()
    xml_object1 = fromstring(xml_string1)
    P1 = convert.from_site(xml_object1)
    L1 = get_scenes(P1)
    L2 = get_sentences('%s.txt' % t)
    s = open('s%s.txt' % t, 'w')
    s.write('%s\n' % L1)
    s.write('%s\n' % L2)
    s.close()

def test_possible_scenes(self):
    """Tests that the API isn't broken, not the validity of the result."""
    elem = ConversionTests._load_xml('./site3.xml')
    passage = convert.from_site(elem)
    scenes.extract_possible_scenes(passage)

def test_to_site(self):
    passage = convert.from_standard(self._load_xml('./standard3.xml'))
    root = convert.to_site(passage)
    copy = convert.from_site(root)
    self.assertTrue(passage.equals(copy))

# return predicate_distribution


def get_cursor(host_name, db_name):
    con = get_connection(db_name, host_name)
    c = con.cursor()
    c.execute("SET search_path TO oabend")
    return c


def get_connection(db_name, host_name):
    global CONNECTION
    CONNECTION = psycopg2.connect(host=host_name, database=db_name)
    return CONNECTION


# with open("ids.txt") as f_ids:
#     for i in tqdm(list(f_ids), unit=" passages", desc="Downloading XMLs"):
#         for xml in get_xml_trees("pgserver", "work", i):
#             p = convert.from_site(xml)
#             convert.passage2file(p, "downloaded/" + p.ID + ".xml")

if __name__ == "__main__":
    t = tqdm(globals()[sys.argv[1]]("pgserver", "work", *sys.argv[2:]),
             unit=" passages", desc="Downloading XMLs")
    for xml in t:
        p = convert.from_site(xml)
        t.set_postfix(ID=p.ID)
        convert.passage2file(p, p.ID + ".xml")

def test_to_standard():
    passage = convert.from_site(load_xml("test_files/site3.xml"))
    ref = load_xml("test_files/standard3.xml")
    root = convert.to_standard(passage)
    assert ETree.tostring(ref) == ETree.tostring(root)

def main():
    print(align.align("what has is by the meaning of the word is",
                      "what is the men for the wk is are be"))

    # read xml files
    print("reading db xmls")
    p = []
    for filename in filenames:
        with open(add_path(filename), "rb") as fl:
            p += pickle.load(fl)[0]
        print("read ", filename, " it starts with ",
              tuple(term.text for term in
                    textutil.extract_terminals(convert.from_site(p[-1]))[:6]))
    # convert xml to passages
    p = list(map(convert.from_site, p))

    print("reading passage xmls")
    # read passage files
    for filename in passage_filenames:
        print("reading " + filename)
        if os.path.isfile(add_path(os.path.splitext(filename)[0] + ".pkl")):
            with open(add_path(os.path.splitext(filename)[0] + ".pkl"), "rb") as fl:
                p.append(pickle.load(fl))
        else:
            p.append(file2passage(add_path(filename)))
            with open(add_path(os.path.splitext(filename)[0] + ".pkl"), "wb") as fl:
                pickle.dump(p[-1], fl)
            print("dumping", add_path(os.path.splitext(filename)[0] + ".pkl"))
    all_filenames = filenames + passage_filenames
    print("read ", all_filenames)

    word2word = align.align_yields(p[0], p[1])
    assert align.reverse_mapping(word2word) == align.align_yields(p[1], p[0]), \
        "align_yields asymmetrical"

    # create similarity matrix
    sources = []
    goals = []
    names = []
    i = 0
    while i < len(p):
        names.append(all_filenames[i])
        sources.append(p[i])
        i += 1
        goals.append(p[i])
        i += 1
    chunksize = 1
    if len(goals) > 100:
        chunksize = int(len(goals) / POOL_SIZE / 10)
    print("multiprocessing with chunksize", chunksize)
    pool = Pool(POOL_SIZE)
    if r2s:
        results = pool.starmap(distances, zip(goals, sources, names), chunksize)
    else:
        results = pool.starmap(distances, zip(sources, goals, names), chunksize)
    print(results)
    pool.close()
    pool.join()

    sym_mat = []
    keys = []
    for row, key in results:
        keys.append(key)
        sym_mat.append(row)
    print("functions and matrix")
    print(funcs + keys)
    for item in sym_mat:
        print(item)
    print("overall token analysis")
    print(align.token_level_analysis(p))

    output_path = trial_name + "output.csv"
    with open(output_path, "w") as f:
        print("writing output to " + output_path)
        writer = csv.writer(f)
        writer.writerows(sym_mat)
    send_mail("*****@*****.**", "finished", os.path.abspath(output_path))
    return

def test_to_standard(self):
    passage = convert.from_site(self._load_xml('./site3.xml'))
    ref = self._load_xml('./standard3.xml')
    root = convert.to_standard(passage)
    self.assertEqual(ETree.tostring(ref), ETree.tostring(root))

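Pulling the recurring calls together, a minimal end-to-end sketch (the file name is illustrative): parse a site-format XML file, convert it to a Passage with convert.from_site, and write it back out in standard format with convert.passage2file:

import xml.etree.ElementTree as ETree
from ucca import convert

# "example_site.xml" is a placeholder for any site-format annotation file.
with open("example_site.xml", encoding="utf-8") as f:
    passage = convert.from_site(ETree.fromstring(f.read()))
convert.passage2file(passage, passage.ID + ".xml")  # standard-format XML
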