Ejemplo n.º 1
0
def ttl_to_txt(cli, args):
    """Export the text of every sentence in a TTL document, one per line.

    The document at ``args.path`` is loaded with ``read_ttl`` using
    ``args.ttl_format``.  When ``args.output`` is set, the sentence texts are
    joined with newlines and written there through ``chio.write_file``;
    otherwise nothing is written and only the progress messages are printed.

    ``cli`` is accepted but not used in this function.
    """
    document = read_ttl(args.path, ttl_format=args.ttl_format)
    print("In doc: {} | Sentences: {}".format(args.path, len(document)))
    # Collect the raw text of each sentence in document order.
    texts = []
    for sentence in document:
        texts.append(sentence.text)
    destination = args.output
    if destination:
        joined = '\n'.join(texts)
        chio.write_file(destination, joined)
        print("Written {} lines to {}".format(len(texts), destination))
    print("Done")
Ejemplo n.º 2
0
def _ensure_config(config_path='~/.jamdict/config.json', mkdir=True):
    """Create a default configuration file at ``config_path`` if none exists.

    Args:
        config_path: Location of the configuration file; a leading ``~`` is
            expanded to the user's home directory.
        mkdir: When true, the directory that should contain the file is
            created first (including any missing intermediate directories).

    Returns:
        None.  If the file already exists it is left untouched; otherwise the
        content of ``CONFIG_TEMPLATE`` is read and written to ``config_path``.
    """
    _path = Path(os.path.expanduser(config_path))
    if mkdir:
        # parents=True: without it a nested location whose intermediate
        # directories are missing raises FileNotFoundError instead of being
        # created as the ``mkdir`` flag promises.
        _path.parent.mkdir(parents=True, exist_ok=True)
    if _path.exists():
        return
    logger = logging.getLogger(__name__)
    default_config = read_file(CONFIG_TEMPLATE)
    logger.warning(
        f"Jamdict configuration file could not be found. A new configuration file will be generated at {_path}"
    )
    logger.debug(f"Default config: {default_config}")
    write_file(_path, default_config)
Ejemplo n.º 3
0
 def test_io_with_pathlib(self):
     """Round-trip data through chio helpers using pathlib.Path targets."""
     print("Make sure that io functions works with pathlib.Path")
     rows = [['name', 'foo'], ['age', '18']]
     # JSON text written and read back through the plain file helpers
     json_target = Path(TEST_DATA) / 'temp.json'
     serialized = json.dumps(rows)
     chio.write_file(json_target, serialized)
     restored = json.loads(chio.read_file(json_target))
     self.assertEqual(restored, rows)
     # tabular round trip through write_tsv / read_tsv (file is named temp.csv)
     table_target = Path(TEST_DATA) / 'temp.csv'
     chio.write_tsv(table_target, rows)
     self.assertEqual(chio.read_tsv(table_target), rows)
Ejemplo n.º 4
0
 def test_io_with_pathlib(self):
     """Round-trip data through chio helpers using pathlib.Path targets."""
     print("Make sure that io functions works with pathlib.Path")
     rows = [['name', 'foo'], ['age', '18']]
     # JSON text written and read back through the plain file helpers
     json_target = Path(TEST_DATA) / 'temp.json'
     serialized = json.dumps(rows)
     chio.write_file(json_target, serialized)
     restored = json.loads(chio.read_file(json_target))
     self.assertEqual(restored, rows)
     # tabular round trip through write_tsv / read_tsv (file is named temp.csv)
     table_target = Path(TEST_DATA) / 'temp.csv'
     chio.write_tsv(table_target, rows)
     self.assertEqual(chio.read_tsv(table_target), rows)
Ejemplo n.º 5
0
def read_config():
    """Return the application configuration, generating a default file if needed.

    When ``__app_config`` has no configuration loaded and
    ``__app_config.locate_config()`` finds none, a default configuration
    (the content of ``CONFIG_TEMPLATE``) is written to
    ``~/.jamdict/config.json``.

    Returns:
        The value of ``__app_config.config`` after the steps above.
        NOTE(review): whether that value reflects a freshly written file
        depends on how ``__app_config`` loads its config - confirm there.
    """
    if not __app_config.config and not __app_config.locate_config():
        # No usable configuration anywhere: create one from the template.
        config_dir = os.path.expanduser('~/.jamdict/')
        # exist_ok=True replaces the exists()-then-makedirs() sequence, which
        # could fail if the directory appeared between the two calls.
        os.makedirs(config_dir, exist_ok=True)
        cfg_loc = os.path.join(config_dir, 'config.json')
        default_config = read_file(CONFIG_TEMPLATE)
        getLogger().warning("Jamdict configuration file could not be found. A new configuration file will be generated at {}".format(cfg_loc))
        getLogger().debug("Default config: {}".format(default_config))
        write_file(cfg_loc, default_config)
    return __app_config.config
Ejemplo n.º 6
0
 def save(self, path, encoding='utf-8', xml_declaration=None,
          default_namespace=None, short_empty_elements=True,
          *args, **kwargs):
     """Serialise this object with ``to_xml_bin`` and write the result to ``path``.

     ``encoding``, ``xml_declaration``, ``default_namespace`` and
     ``short_empty_elements`` are forwarded to ``self.to_xml_bin`` together
     with any extra positional and keyword arguments.  ``encoding`` is also
     passed to ``chio.write_file`` when the content is written.
     """
     xml_options = {
         'encoding': encoding,
         'xml_declaration': xml_declaration,
         'default_namespace': default_namespace,
         'short_empty_elements': short_empty_elements,
     }
     payload = self.to_xml_bin(*args, **xml_options, **kwargs)
     chio.write_file(path, payload, encoding=encoding)
Ejemplo n.º 7
0
 def test_file_rw(self):
     """Write a plain and a .gz text file with chio, then read both back."""
     plain_path = os.path.join(TEST_DATA, 'test.txt')
     gz_path = os.path.join(TEST_DATA, 'test.txt.gz')
     plain_text = 'ユニコード大丈夫だよ。'
     gz_text = 'This is a zipped text file.'
     written = ((plain_path, plain_text), (gz_path, gz_text))
     # str content handed over together with a binary write mode
     chio.write_file(path=plain_path, content=plain_text, mode='wb')
     chio.write_file(gz_path, content=gz_text)
     # reading through the gzip module directly confirms the .gz file is gzip data
     with gzip.open(gz_path, mode='rt') as gz_stream:
         self.assertEqual(gz_stream.read(), gz_text)
     # both files must exist ...
     for target, _ in written:
         self.assertTrue(chio.is_file(target))
     # ... return the original text with the default mode and with mode='r' ...
     for target, expected in written:
         self.assertEqual(chio.read_file(target), expected)
     for target, expected in written:
         self.assertEqual(chio.read_file(target, mode='r'), expected)
     # ... and return bytes when read with mode='rb'
     for target, _ in written:
         self.assertIsInstance(chio.read_file(target, mode='rb'), bytes)
Ejemplo n.º 8
0
 def test_file_rw(self):
     """Write a plain and a .gz text file with chio, then read both back."""
     plain_path = os.path.join(TEST_DATA, 'test.txt')
     gz_path = os.path.join(TEST_DATA, 'test.txt.gz')
     plain_text = 'ユニコード大丈夫だよ。'
     gz_text = 'This is a zipped text file.'
     written = ((plain_path, plain_text), (gz_path, gz_text))
     # str content handed over together with a binary write mode
     chio.write_file(path=plain_path, content=plain_text, mode='wb')
     chio.write_file(gz_path, content=gz_text)
     # reading through the gzip module directly confirms the .gz file is gzip data
     with gzip.open(gz_path, mode='rt') as gz_stream:
         self.assertEqual(gz_stream.read(), gz_text)
     # both files must exist ...
     for target, _ in written:
         self.assertTrue(chio.is_file(target))
     # ... return the original text with the default mode and with mode='r' ...
     for target, expected in written:
         self.assertEqual(chio.read_file(target), expected)
     for target, expected in written:
         self.assertEqual(chio.read_file(target, mode='r'), expected)
     # ... and return bytes when read with mode='rb'
     for target, _ in written:
         self.assertIsInstance(chio.read_file(target, mode='rb'), bytes)
Ejemplo n.º 9
0
def patch_sids(cli, args):
    """Patch sentence identifiers of a gold MRS document.

    Reads the document named by ``args.gold``.  If ``args.idfile`` is given,
    each of its lines is split at the first tab into ``(id, text)``; the texts
    are checked against the document's sentences (paired positionally) and the
    ids are then assigned to ``ident`` (and to ``ID`` as well when
    ``args.both`` is set).  Without an ID file, ``patch_gold_sid`` is applied
    to the document instead.  The resulting XML goes to ``args.output`` when
    set, otherwise to standard output.

    ``cli`` is accepted but not used in this function.

    Raises:
        SystemExit: with status 1 when a non-empty text from the ID file
            differs from the corresponding sentence's text.
        ValueError: when a line of the ID file contains no tab.
    """
    if not args.gold:
        print("No document to patch")
        return

    print("Gold MRS file: {}".format(args.gold))
    sent_ids = []
    if args.idfile:
        print("ID file: {}".format(args.idfile))
        for line in chio.read_file(args.idfile).splitlines():
            idx, text = line.split('\t', maxsplit=1)
            sent_ids.append((idx, text))
        print("Found {} sentences in ID file".format(len(sent_ids)))
    sents = Document.from_file(args.gold)
    if sent_ids:
        if len(sent_ids) != len(sents):
            # Only reported; zip() below pairs up to the shorter length.
            print("Wrong sent ID files - Found ID: {} | Found MRS: {}".format(len(sent_ids), len(sents)))
        print("Verifying sentences' text")
        for (sid, stext), mrs_sent in zip(sent_ids, sents):
            if stext and stext != mrs_sent.text:
                print("Invalid sentence text: sentID: {} | {} <> {}".format(sid, stext, mrs_sent.text))
                # Abort with a failing status.  The bare exit() helper comes
                # from the site module, is not guaranteed to be available, and
                # would report success (status 0) for this error.
                raise SystemExit(1)
        print("Sentences are verified, proceed to patch sent idents")
        for (sid, _stext), mrs_sent in zip(sent_ids, sents):
            mrs_sent.ident = sid
            if args.both:
                mrs_sent.ID = sid
    else:
        patch_gold_sid(sents)

    if args.output:
        print("Sentence idents are patched, writing to output XML file to: {}...".format(args.output))
        chio.write_file(args.output, sents.to_xml_str())
    else:
        print(sents.to_xml_str())
    print("Done")
Ejemplo n.º 10
0
              "./test_data/processed/test.m4a")

# -----------------------------------------------------------------------------
# Cut one audio file into clips at fixed timestamps
# -----------------------------------------------------------------------------
# Each job pairs an output file with the timestamp keyword arguments that are
# handed to media.cut for it.
# NOTE(review): the "test_after_10.ogg" clip starts at 00:00:15, not 00:00:10 -
# confirm whether the file name or the timestamp is the intended one.
_CUT_SOURCE = "./test_data/processed/test.m4a"
_CUT_JOBS = (
    ("./test_data/processed/test_before_10.ogg", {"to_ts": "00:00:10"}),
    ("./test_data/processed/test_after_10.ogg", {"from_ts": "00:00:15"}),
    ("./test_data/processed/test_10-15.ogg",
     {"from_ts": "00:00:10", "to_ts": "00:00:15"}),
)
for _cut_target, _cut_span in _CUT_JOBS:
    media.cut(_CUT_SOURCE, _cut_target, **_cut_span)

# --------------------------------------------------------------------------------------------
# A more complex use case
# Read an ELAN transcription file and, for every annotation under "Story":
#    1. Cut the annotation into its own ogg file
#    2. Write the annotation text into its own text file
#    3. Record the annotation ID, text and audio filename as a row of a CSV file
# --------------------------------------------------------------------------------------------
eaf = elan.read_eaf("./test_data/fables_01_03_aesop_64kb.eaf")
csv_rows = [["annID", "Text", "Filename"]]  # header row
for ann in eaf["Story"]:
    ann_id = ann.ID
    text_path = f"./test_data/processed/test_{ann_id}.txt"
    audio_path = f"./test_data/processed/test_{ann_id}.ogg"
    # The CSV stores the bare audio filename, without the output directory.
    csv_rows.append([ann_id, ann.text, f"test_{ann_id}.ogg"])
    chio.write_file(text_path, ann.text)
    eaf.cut(ann, audio_path)
chio.write_csv("./test_data/processed/test_sentences.csv", csv_rows)