Example #1
def corenlp_to_ttl(cli, args):
    print("Core NLP output file: {}".format(args.input))
    print("TTL file: {}".format(args.output))
    print("Source (raw) file: {}".format(args.raw))
    cn_sents = json.loads(chio.read_file(args.input))['sentences']
    print("Found {} core-nlp sents".format(len(cn_sents)))
    raw_sents = chio.read_file(args.raw).splitlines()
    _writer = get_ttl_writer(args.output, ttl_format=args.ttl_format, id_seed=args.seed)
    for sent_text, cn_sent in zip(raw_sents, cn_sents):
        ttl_sent = ttl.Sentence(sent_text)
        ttl_sent.tokens = (cn_tk['originalText'] for cn_tk in cn_sent['tokens'])
        for ttl_tk, cn_tk in zip(ttl_sent, cn_sent['tokens']):
            if 'lemma' in cn_tk:
                ttl_tk.lemma = cn_tk['lemma']
            if 'pos' in cn_tk:
                ttl_tk.pos = cn_tk['pos']
        _writer.write_sent(ttl_sent)
    print("{} sentences was written to {}".format(len(raw_sents), args.output))
Example #2
def _ensure_config(config_path='~/.jamdict/config.json', mkdir=True):
    _path = Path(os.path.expanduser(config_path))
    # auto create config dir
    if mkdir:
        _path.parent.mkdir(parents=True, exist_ok=True)
    if not _path.exists():
        default_config = read_file(CONFIG_TEMPLATE)
        logging.getLogger(__name__).warning(
            f"Jamdict configuration file could not be found. A new configuration file will be generated at {_path}"
        )
        logging.getLogger(__name__).debug(f"Default config: {default_config}")
        write_file(_path, default_config)
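A possible call site for this helper (a sketch; _ensure_config is internal to jamdict, and the custom path below is hypothetical):

# Ensure the default config exists, creating ~/.jamdict/ if needed
_ensure_config()
# Or point it at a non-default location (hypothetical path)
_ensure_config('~/.config/jamdict/config.json')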
Example #3
def test_io_with_pathlib(self):
    print("Make sure that IO functions work with pathlib.Path")
    # test read & write text (JSON)
    data = [['name', 'foo'], ['age', '18']]
    json_path = Path(TEST_DATA) / 'temp.json'
    chio.write_file(json_path, json.dumps(data))
    json_data = json.loads(chio.read_file(json_path))
    self.assertEqual(json_data, data)
    # test read & write TSV
    filepath = Path(TEST_DATA) / 'temp.csv'
    chio.write_tsv(filepath, data)
    actual = chio.read_tsv(filepath)
    self.assertEqual(actual, data)
Example #4
def txt_to_ttl(cli, args):
    print("Input file: {}".format(args.input))
    print("TTL/{} output: {}".format(args.ttl_format, args.output))
    print("With ID column: {}".format(args.with_idcolumn))
    raw_sents = chio.read_file(args.input).splitlines()
    _writer = get_ttl_writer(args.output, ttl_format=args.ttl_format, id_seed=args.seed)
    for sent in raw_sents:
        if args.with_idcolumn:
            sid, text = sent.split('\t', maxsplit=1)
            _writer.write_sent(ttl.Sentence(text=text, ID=sid))
        else:
            _writer.write_sent(ttl.Sentence(text=sent))
    print("Written {} sentences to {}".format(len(raw_sents), args.output))
Example #5
def read_config():
    if not __app_config.config and not __app_config.locate_config():
        # need to create a config
        config_dir = os.path.expanduser('~/.jamdict/')
        if not os.path.exists(config_dir):
            os.makedirs(config_dir)
        cfg_loc = os.path.join(config_dir, 'config.json')
        default_config = read_file(CONFIG_TEMPLATE)
        getLogger().warning("Jamdict configuration file could not be found. A new configuration file will be generated at {}".format(cfg_loc))
        getLogger().debug("Default config: {}".format(default_config))
        write_file(cfg_loc, default_config)
    # read config
    config = __app_config.config
    return config
Example #6
def _build_krad_map(self):
    with self.lock:
        lines = chio.read_file(KRADFILE, mode='rt').splitlines()
        # build the krad map
        self.__krad_map = {}
        self.__radk_map = dd(set)
        for line in lines:
            if line.startswith("#"):
                continue
            parts = line.split(':', maxsplit=1)
            if len(parts) == 2:
                rads = [r.strip() for r in parts[1].split()]
                char_literal = parts[0].strip()
                self.__krad_map[char_literal] = rads
                for rad in rads:
                    self.__radk_map[rad].add(char_literal)
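For context, a sketch of the KRADFILE entry format this parser expects: comment lines start with #, and each remaining line maps one kanji to its space-separated component radicals after a colon (illustrative lines; consult the actual KRADFILE for real data):

# lines like this one are skipped
亜 : ｜ 一 口
唖 : ｜ 一 口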
Example #7
def patch_sids(cli, args):
    # rp = TextReport(args.output) if args.output else TextReport()
    if args.gold:
        print("Gold MRS file: {}".format(args.gold))
        sent_ids = []
        if args.idfile:
            print("ID file: {}".format(args.idfile))
            idlines = chio.read_file(args.idfile).splitlines()
            for line in idlines:
                idx, text = line.split('\t', maxsplit=1)
                sent_ids.append((idx, text))
            print("Found {} sentences in ID file".format(len(sent_ids)))
        sents = Document.from_file(args.gold)
        if sent_ids:
            if len(sent_ids) != len(sents):
                print("Wrong sent ID files - Found ID: {} | Found MRS: {}".format(len(sent_ids), len(sents)))
            print("Verifying sentences' text")
            for ((sid, stext), mrs_sent) in zip(sent_ids, sents):
                if stext and stext != mrs_sent.text:
                    print("Invalid sentence text: sentID: {} | {} <> {}".format(sid, stext, mrs_sent.text))
                    exit()
            print("Sentences are verified, proceed to patch sent idents")
            for ((sid, stext), mrs_sent) in zip(sent_ids, sents):
                mrs_sent.ident = sid
                if args.both:
                    mrs_sent.ID = sid
        else:
            patch_gold_sid(sents)

        if args.output:
            print("Sentence idents are patched, writing to output XML file to: {}...".format(args.output))
            chio.write_file(args.output, sents.to_xml_str())
        else:
            print(sents.to_xml_str())
        print("Done")
    else:
        print("No document to patch")
Example #8
def test_file_rw(self):
    tmpfile = os.path.join(TEST_DATA, 'test.txt')
    tmpgzfile = os.path.join(TEST_DATA, 'test.txt.gz')
    txt = 'ユニコード大丈夫だよ。'
    txtz = 'This is a zipped text file.'
    chio.write_file(content=txt, mode='wb', path=tmpfile)  # write content as bytes
    chio.write_file(tmpgzfile, content=txtz)
    # ensure that tmpgzfile is actually a gzip file
    with gzip.open(tmpgzfile, mode='rt') as infile:
        self.assertEqual(infile.read(), txtz)
    # verify written content
    self.assertTrue(chio.is_file(tmpfile))
    self.assertTrue(chio.is_file(tmpgzfile))
    self.assertEqual(chio.read_file(tmpfile), txt)
    self.assertEqual(chio.read_file(tmpgzfile), txtz)
    self.assertEqual(chio.read_file(tmpfile, mode='r'), txt)
    self.assertEqual(chio.read_file(tmpgzfile, mode='r'), txtz)
    self.assertIsInstance(chio.read_file(tmpfile, mode='rb'), bytes)
    self.assertIsInstance(chio.read_file(tmpgzfile, mode='rb'), bytes)
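As the assertions above demonstrate, chio.write_file and chio.read_file dispatch on the .gz extension, so the same calls transparently handle plain and gzipped files:

chio.write_file('notes.txt.gz', content='stored gzip-compressed')  # gzipped on disk
chio.read_file('notes.txt.gz')  # returns the decompressed str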
Example #9
def ukb_to_ttl(cli, args):
    ''' Convert UKB output to TTL '''
    doc = read_ttl(args.ttl, ttl_format=args.ttl_format)
    print("Source TTL file: {} | Sentences: {}".format(args.ttl, len(doc)))
    token_map = {}
    if args.tokens:
        # token file is provided
        tokens = [[int(x) for x in row] for row in chio.read_tsv(args.tokens)]
        for sid, wid, cfrom, cto in tokens:
            token_map[(sid, wid)] = (cfrom, cto)
        print("Found tokens: {}".format(len(token_map)))
    c = Counter()
    sids = set()
    input_sids = {int(s.ID) for s in doc}
    for line_idx, line in enumerate(chio.read_file(args.input).splitlines()):
        if line.startswith('!! '):
            continue
        parts = line.split()
        if len(parts) != 5:
            print("WARNING: Invalid line -> {}: {}".format(line_idx, line))
            continue
        sid_text, wid_text, synsetid, unknown, lemma = parts
        sid = int(sid_text)
        wid = int(wid_text[1:])
        sent_obj = doc.get(sid, default=None)
        if sent_obj is None:
            print("SID #{} could not be found".format(sid))
        elif not token_map and wid >= len(sent_obj):
            print("Invalid wid: line#{} - sent#{} - wid#{}".format(line_idx, sid, wid))
        else:
            # now can tag ...
            # remove current concepts if needed
            # if args.removetags:
            #     cids = list(c.cidx for c in sent_obj.concepts)
            #     for cid in cids:
            #         sent_obj.pop_concept(cid)
            if not token_map:
                token = sent_obj[wid]
                # double check token text
                if lemma != token.text.lower() and lemma != token.lemma.lower():
                    print("Invalid token text: {} <> {}/{}".format(lemma, token.text.lower(), token.lemma.lower()))
                sent_obj.new_concept(synsetid, lemma, tokens=[wid])
            else:
                # create sentence-level tag instead
                cfrom, cto = token_map[(sid, wid)]
                sent_obj.new_tag(synsetid, cfrom, cto, tagtype='WN')
            c.count("Tokens")
            sids.add(sid)
    print("UKB sentences: {}".format(len(sids)))
    print("Not found: {}".format(input_sids.difference(sids)))
    c.summarise()
    # removetags if needed
    if args.removetags:
        for sent_obj in doc:
            sent_obj.tags.clear()
    print("Sent #1 tags: {}".format(len(doc[0].tags)))
    # baking
    if not args.tokens:
        print("Now baking to tags ...")
        bake_doc(doc)
    else:
        print("WARNING: Because token file was provided, no auto-baking will be done")
    print("Sent #1 tags after baking: {}".format(len(doc[0].tags)))
    # Now output ...
    if args.output:
        print("Output to file ...")
        _writer = get_ttl_writer(args.output, ttl_format=args.ttl_format, id_seed=args.seed)
        for sent in doc:
            _writer.write_sent(sent)
        print("Written {} sentences to {}".format(len(doc), args.output))
    print("Done")