def fix_tokenization(passage, words_set, lang, cw):
    """Repeatedly retokenize each paragraph of a passage's site XML.

    The passage is converted with ``to_site``; for every paragraph found
    under ``SiteCfg.Paths.Paragraphs`` the non-punctuation positions are
    offered to ``retokenize`` one at a time.  As soon as one call reports
    a change, the paragraph's terminal lists are rebuilt and scanning
    restarts; a paragraph is finished once a full scan reports no change.

    :param passage: passage to process (its ``ID`` is passed to ``retokenize``)
    :param words_set: forwarded unchanged to ``retokenize``
    :param lang: language passed to ``get_tokenizer``
    :param cw: forwarded unchanged to ``retokenize``
    :return: ``from_site`` of the processed XML if any ``retokenize`` call
             reported a change, otherwise None
    """
    tokenizer = get_tokenizer(lang=lang)
    site_root = to_site(passage)
    state = State()
    modified = False
    for paragraph in site_root.iterfind(SiteCfg.Paths.Paragraphs):
        retry = True
        while retry:
            retry = False
            # Rebuilt on every pass: presumably retokenize edits the
            # paragraph in place, invalidating earlier lists (TODO confirm).
            terminals = list(paragraph.iter(SiteCfg.Tags.Terminal))
            preterminals = get_parents(paragraph, terminals)
            preterminal_parents = get_parents(paragraph, preterminals)
            is_puncts = []
            for preterminal in preterminals:
                is_puncts.append(
                    preterminal.get(SiteCfg.Attr.ElemTag) == SiteCfg.Types.Punct)
            for index in false_indices(is_puncts):
                start, end = expand_to_neighboring_punct(index, is_puncts)
                if not retokenize(index, start, end, terminals, preterminals,
                                  preterminal_parents, passage.ID, tokenizer,
                                  state, cw, words_set):
                    continue
                # A change was made: stop scanning and start over.
                modified = retry = True
                break
    if not modified:
        return None
    return from_site(site_root)
def main(args):
    """Write every input passage as a site-format XML file in ``args.outdir``.

    The output directory is created if missing.  Each passage is written
    to ``<outdir>/<passage.ID>.xml``; with ``args.verbose`` set, the path
    of each written file is reported.
    """
    out_dir = args.outdir
    os.makedirs(out_dir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames):
        site_filename = os.path.join(out_dir, passage.ID + ".xml")
        with open(site_filename, "w", encoding="utf-8") as out_file:
            site_xml = tostring(convert.to_site(passage))
            print(site_xml.decode(), file=out_file)
        if not args.verbose:
            continue
        # Print inside external_write_mode(), as the original does.
        with external_write_mode():
            print("Wrote '%s'" % site_filename)
def main(args):
    """Convert passages to site XML, one ``<passage.ID>.xml`` file per passage.

    Files are placed in ``args.outdir`` (created when absent).  When
    ``args.verbose`` is truthy, a "Wrote ..." line is printed per file.
    """
    os.makedirs(args.outdir, exist_ok=True)
    passages = get_passages_with_progress_bar(args.filenames)
    for passage in passages:
        basename = passage.ID + ".xml"
        site_filename = os.path.join(args.outdir, basename)
        with open(site_filename, "w", encoding="utf-8") as handle:
            serialized = tostring(convert.to_site(passage)).decode()
            print(serialized, file=handle)
        if args.verbose:
            # Report only after the file has been closed.
            with external_write_mode():
                print("Wrote '%s'" % site_filename)
def main(args):
    """Upload passages in site format and log the returned ids to ``args.out``.

    Passage filenames come from ``args.passages`` plus, when
    ``args.filenames`` is given, the non-blank stripped lines of that file.
    For each passage one line ``<passage.ID> <upload result>`` is written
    to ``args.out``.  If the module-level ``CONNECTION`` is set, it is
    committed once after all uploads.
    """
    filenames = list(args.passages)
    if args.filenames:
        with open(args.filenames, encoding="utf-8") as list_file:
            # Keep only lines that are non-empty after stripping.
            filenames.extend(name for name in map(str.strip, list_file) if name)
    with open(args.out, "w", encoding="utf-8") as log_file:
        for passage in get_passages_with_progress_bar(filenames):
            site_root = convert.to_site(passage)
            if args.write_site:
                site_filename = passage.ID + "_site_upload.xml"
            else:
                site_filename = None
            out = upload_passage(
                site_root,
                verbose=args.verbose,
                site_filename=site_filename,
                db_name=args.db_name,
                host_name=args.host_name,
                new_pid=passage.ID,
                new_prid=args.project_id,
                username=args.username,
            )
            print(passage.ID, out, file=log_file)
            if args.verbose:
                print("Uploaded passage %s with xid=%s" % (passage.ID, out))
    if CONNECTION is not None:
        CONNECTION.commit()
    print("Wrote '%s'" % args.out)
def fix_tokenization(passage, words_set, lang, cw):
    """Apply ``retokenize`` across a passage's site XML until it is stable.

    Each paragraph under ``SiteCfg.Paths.Paragraphs`` is scanned at its
    non-punctuation positions.  After any call to ``retokenize`` that
    reports a change, the scan of that paragraph restarts from freshly
    collected terminals.

    :param passage: passage to convert with ``to_site`` and process
    :param words_set: passed through to ``retokenize``
    :param lang: language handed to ``get_tokenizer``
    :param cw: passed through to ``retokenize``
    :return: the result of ``from_site`` on the processed XML when at least
             one change was reported; None when nothing changed
    """
    tokenizer = get_tokenizer(lang=lang)
    root = to_site(passage)
    state = State()
    any_change = False
    for paragraph in root.iterfind(SiteCfg.Paths.Paragraphs):
        while True:
            terminals = list(paragraph.iter(SiteCfg.Tags.Terminal))
            preterminals = get_parents(paragraph, terminals)
            preterminal_parents = get_parents(paragraph, preterminals)
            punct_tag = SiteCfg.Types.Punct
            is_puncts = [node.get(SiteCfg.Attr.ElemTag) == punct_tag
                         for node in preterminals]
            for position in false_indices(is_puncts):
                start, end = expand_to_neighboring_punct(position, is_puncts)
                if retokenize(position, start, end, terminals, preterminals,
                              preterminal_parents, passage.ID, tokenizer,
                              state, cw, words_set):
                    # Changed: leave the scan and redo this paragraph.
                    any_change = True
                    break
            else:
                # Full scan without a change: this paragraph is done.
                break
    return from_site(root) if any_change else None
def test_to_site():
    """A passage must equal itself after a to_site/from_site round trip."""
    original = loaded()
    site_root = convert.to_site(original)
    restored = convert.from_site(site_root)
    assert original.equals(restored)
def test_to_site(self):
    """Round-trip standard3.xml through the site format and compare."""
    standard_xml = self._load_xml('./standard3.xml')
    original = convert.from_standard(standard_xml)
    restored = convert.from_site(convert.to_site(original))
    self.assertTrue(original.equals(restored))
def test_to_site(self):
    """Converting to site format and back must yield an equal passage."""
    standard_xml = TestUtil.load_xml("test_files/standard3.xml")
    original = convert.from_standard(standard_xml)
    site_root = convert.to_site(original)
    restored = convert.from_site(site_root)
    self.assertTrue(original.equals(restored))
#! /usr/bin/python3
"""Convert a standard-format XML passage file to site-format XML.

Usage: convert_standard_to_site <input filename> <output filename>
"""
from ucca import convert
import sys
from xml.etree.ElementTree import ElementTree, tostring, fromstring


def file2passage(filename):
    """Open a standard XML file and return its parsed Passage object.

    The file is opened in binary mode so the XML parser determines the
    encoding itself, rather than the bytes being decoded with the
    platform's default text encoding first.
    """
    with open(filename, "rb") as f:
        etree = ElementTree().parse(f)
    return convert.from_standard(etree)


# Exactly two arguments are required: the input and the output filename.
if len(sys.argv) != 3:
    print('Usage: convert_standard_to_site <input filename> <output filename>')
    sys.exit(-1)

P = file2passage(sys.argv[1])
output = tostring(convert.to_site(P)).decode()
# Explicit encoding, so the written file does not depend on the locale.
with open(sys.argv[2], 'w', encoding='utf-8') as outf:
    outf.write(output)

sys.exit(0)
def test_to_site(self):
    """Check that from_site(to_site(p)) reproduces the loaded passage."""
    original = convert.from_standard(
        TestUtil.load_xml("test_files/standard3.xml"))
    round_tripped = convert.from_site(convert.to_site(original))
    is_same = original.equals(round_tripped)
    self.assertTrue(is_same)