コード例 #1
0
def fix_tokenization(passage, words_set, lang, cw):
    """Retokenize the terminals of *passage* and return the updated passage.

    The passage is converted to its site XML form, and each paragraph is
    scanned repeatedly: whenever ``retokenize`` reports a change, the
    paragraph's terminal/parent lists are rebuilt and the scan starts over.
    A paragraph is done once a complete scan yields no change.

    :param passage: passage to process; its ``ID`` is forwarded to ``retokenize``
    :param words_set: forwarded unchanged to ``retokenize``
    :param lang: language passed to ``get_tokenizer``
    :param cw: forwarded unchanged to ``retokenize``
    :return: passage rebuilt from the modified site XML, or None if
             nothing was changed
    """
    tokenizer = get_tokenizer(lang=lang)
    elem = to_site(passage)
    state = State()
    ever_changed = False
    for paragraph in elem.iterfind(SiteCfg.Paths.Paragraphs):
        while True:
            # Rebuilt on every pass, since a successful retokenize edits the tree.
            terminals = list(paragraph.iter(SiteCfg.Tags.Terminal))
            preterminals = get_parents(paragraph, terminals)
            preterminal_parents = get_parents(paragraph, preterminals)
            is_puncts = [
                node.get(SiteCfg.Attr.ElemTag) == SiteCfg.Types.Punct
                for node in preterminals
            ]
            for index in false_indices(is_puncts):
                start, end = expand_to_neighboring_punct(index, is_puncts)
                modified = retokenize(index, start, end, terminals, preterminals,
                                      preterminal_parents, passage.ID, tokenizer,
                                      state, cw, words_set)
                if modified:
                    ever_changed = True
                    break  # restart the scan of this paragraph
            else:
                # Full scan without any modification: paragraph is stable.
                break
    if ever_changed:
        return from_site(elem)
    return None
コード例 #2
0
ファイル: standard_to_site.py プロジェクト: danielhers/ucca
def main(args):
    """Write every input passage as a site-format XML file in ``args.outdir``.

    The output directory is created if missing. Each passage is written to
    ``<passage.ID>.xml``; when ``args.verbose`` is set, the written path is
    printed inside ``external_write_mode()``.

    :param args: object providing ``outdir``, ``filenames`` and ``verbose``
    """
    os.makedirs(args.outdir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames):
        site_filename = os.path.join(args.outdir, passage.ID + ".xml")
        with open(site_filename, "w", encoding="utf-8") as f:
            site_root = convert.to_site(passage)
            xml_text = tostring(site_root).decode()
            print(xml_text, file=f)
        if not args.verbose:
            continue
        with external_write_mode():
            print("Wrote '%s'" % site_filename)
コード例 #3
0
ファイル: standard_to_site.py プロジェクト: shachardon/ucca
def main(args):
    """Convert the passages named in ``args.filenames`` to site XML files.

    Files are written to ``args.outdir`` (created when absent), one per
    passage, named after the passage ID with an ``.xml`` suffix. With
    ``args.verbose`` set, each written path is reported.

    :param args: object providing ``outdir``, ``filenames`` and ``verbose``
    """
    out_dir = args.outdir
    os.makedirs(out_dir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames):
        site_filename = os.path.join(out_dir, passage.ID + ".xml")
        with open(site_filename, "w", encoding="utf-8") as f:
            serialized = tostring(convert.to_site(passage))
            print(serialized.decode(), file=f)
        if args.verbose:
            # Report progress via external_write_mode(), as the original does.
            with external_write_mode():
                print("Wrote '%s'" % site_filename)
コード例 #4
0
ファイル: upload.py プロジェクト: ruixiangcui/ucca
def main(args):
    """Upload passages in site format and record the returned identifiers.

    Passage filenames come from ``args.passages`` plus, when
    ``args.filenames`` is given, the non-empty stripped lines of that file.
    For every passage one line ``<passage.ID> <upload result>`` is written to
    ``args.out``. If the module-level ``CONNECTION`` is set, it is committed
    once all passages have been processed.

    :param args: object providing ``passages``, ``filenames``, ``out``,
                 ``verbose``, ``write_site``, ``db_name``, ``host_name``,
                 ``project_id`` and ``username``
    """
    filenames = list(args.passages)
    if args.filenames:
        with open(args.filenames, encoding="utf-8") as f:
            stripped = (line.strip() for line in f)
            filenames.extend([name for name in stripped if name])
    with open(args.out, "w", encoding="utf-8") as f:
        for passage in get_passages_with_progress_bar(filenames):
            site_root = convert.to_site(passage)
            if args.write_site:
                site_filename = passage.ID + "_site_upload.xml"
            else:
                site_filename = None
            out = upload_passage(site_root, verbose=args.verbose,
                                 site_filename=site_filename,
                                 db_name=args.db_name, host_name=args.host_name,
                                 new_pid=passage.ID, new_prid=args.project_id,
                                 username=args.username)
            print(passage.ID, out, file=f)
            if args.verbose:
                print("Uploaded passage %s with xid=%s" % (passage.ID, out))
    if CONNECTION is not None:
        CONNECTION.commit()
    print("Wrote '%s'" % args.out)
コード例 #5
0
ファイル: fix_tokenization.py プロジェクト: danielhers/ucca
def fix_tokenization(passage, words_set, lang, cw):
    """Apply ``retokenize`` across *passage* until no paragraph changes.

    Works on the site XML form of the passage. Each paragraph is scanned
    pass after pass; a pass stops at the first position where ``retokenize``
    returns a true value, and another pass is then started on fresh
    terminal/parent lists.

    :param passage: passage to process; its ``ID`` is forwarded to ``retokenize``
    :param words_set: forwarded unchanged to ``retokenize``
    :param lang: language passed to ``get_tokenizer``
    :param cw: forwarded unchanged to ``retokenize``
    :return: passage built by ``from_site`` from the modified XML, or None
             when no pass changed anything
    """
    tokenizer = get_tokenizer(lang=lang)
    elem = to_site(passage)
    state = State()

    def _retokenize_once(paragraph):
        """Run one scan over *paragraph*; return True as soon as it is modified."""
        terminals = list(paragraph.iter(SiteCfg.Tags.Terminal))
        preterminals = get_parents(paragraph, terminals)
        preterminal_parents = get_parents(paragraph, preterminals)
        is_puncts = [parent.get(SiteCfg.Attr.ElemTag) == SiteCfg.Types.Punct
                     for parent in preterminals]
        for position in false_indices(is_puncts):
            start, end = expand_to_neighboring_punct(position, is_puncts)
            if retokenize(position, start, end, terminals, preterminals,
                          preterminal_parents, passage.ID, tokenizer,
                          state, cw, words_set):
                return True
        return False

    ever_changed = False
    for paragraph in elem.iterfind(SiteCfg.Paths.Paragraphs):
        # Keep rescanning while the previous scan modified the paragraph.
        while _retokenize_once(paragraph):
            ever_changed = True
    return from_site(elem) if ever_changed else None
コード例 #6
0
def test_to_site():
    """A passage converted to site XML and back must equal the original."""
    original = loaded()
    restored = convert.from_site(convert.to_site(original))
    assert original.equals(restored)
コード例 #7
0
ファイル: test_ucca_ut.py プロジェクト: amitbeka/ucca
 def test_to_site(self):
     """A passage converted to site XML and back must equal the original."""
     source_xml = self._load_xml('./standard3.xml')
     original = convert.from_standard(source_xml)
     restored = convert.from_site(convert.to_site(original))
     self.assertTrue(original.equals(restored))
コード例 #8
0
ファイル: test_ucca.py プロジェクト: macleginn/ucca
 def test_to_site(self):
     """Check that to_site followed by from_site reproduces the passage."""
     standard_xml = TestUtil.load_xml("test_files/standard3.xml")
     original = convert.from_standard(standard_xml)
     site_root = convert.to_site(original)
     self.assertTrue(original.equals(convert.from_site(site_root)))
コード例 #9
0
#! /usr/bin/python3
from ucca import convert
import sys
from xml.etree.ElementTree import ElementTree, tostring, fromstring

def file2passage(filename):
    """Open a standard-format XML file and return its parsed Passage object.

    The file is opened in binary mode so that the XML parser determines the
    character encoding itself (from the XML declaration, or its default)
    rather than relying on the platform's locale encoding, which could
    mis-decode non-ASCII text.

    :param filename: path of the standard-format XML file
    :return: the result of ``convert.from_standard`` on the parsed root element
    """
    with open(filename, "rb") as f:
        etree = ElementTree().parse(f)
    return convert.from_standard(etree)

# Script body: expects exactly two arguments, the input (standard XML) path
# and the output (site XML) path; otherwise prints usage and exits with -1.
if len(sys.argv) != 3:
    print('Usage: convert_standard_to_site <input filename> <output filename>')
    sys.exit(-1)

_, in_path, out_path = sys.argv
P = file2passage(in_path)
site_root = convert.to_site(P)
# tostring() returns bytes; decode them to text before writing.
output = tostring(site_root).decode()
with open(out_path, 'w') as outf:
    outf.write(output)

sys.exit(0)
コード例 #10
0
ファイル: test_ucca.py プロジェクト: aiedward/nn4nlp-code
 def test_to_site(self):
     """Round-trip a passage through site XML and expect an equal passage."""
     original = convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))
     round_tripped = convert.from_site(convert.to_site(original))
     self.assertTrue(original.equals(round_tripped))