def read_nttat(cli, args):
    ''' Convert NTTAT patch to JSON '''
    stdout = TextReport()
    ext = 'json'
    rp = TextReport("{}_1.{}".format(args.output, ext))
    rp2 = TextReport("{}_2.{}".format(args.output, ext))
    gwn = get_gwn()
    data = []
    with open(args.input, 'r') as infile, gwn.ctx() as ctx:
        # collect synset IDs (e.g. 12345678-v) mentioned in the patch file
        ssids = re.findall(r'\d{8}-[nvarx]', infile.read())
        print(len(ssids))
        print(ssids)
        for sid in ssids:
            ss = gwn.get_synset(sid, ctx=ctx)
            sdef = fix_gwn_def(ss.definition)
            stdout.header(sid, "Lemmas: {}".format(", ".join(ss.lemmas)))
            stdout.print(sdef)
            data.append({"synset": sid,
                         "lemmas": ss.lemmas,
                         "definition": sdef})
    # split the collected records into two JSON output files
    cut = int(len(data) / 2)
    # first half
    first_half = json.dumps(data[:cut], indent=2)
    rp.write(first_half)
    # second half
    second_half = json.dumps(data[cut:], indent=2)
    rp2.write(second_half)
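# Hypothetical usage sketch (an assumption, not part of the original code):
# read_nttat() ignores its `cli` argument and only reads args.input and
# args.output, so it can be exercised with a bare argparse.Namespace.
# The file paths below are placeholders for illustration only.
if __name__ == '__main__':
    from argparse import Namespace
    read_nttat(None, Namespace(input='data/nttat_patch.txt', output='data/nttat_out'))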
def test_get_gloss_synsets(self):
    print("Test get glossed synset(s)")
    db = get_gwn()
    glosses = db.schema.gloss.select()  # select glosses
    print("Gloss count: {}".format(len(glosses)))
    print(glosses[:5])
def test_dump_synset(self):
    print("Test get synset by ID")
    gwn = get_gwn()
    ss = get_synset_by_id(gwn, '01775535-v')
    self.assertIsNotNone(ss)
    self.assertGreater(len(ss.lemmas), 0)
    self.assertGreater(len(ss.keys), 0)
    self.assertGreater(len(ss.glosses), 0)
    dump_synset(ss)
def export_gwnsql_synsets(args):
    print("Exporting synsets' info (lemmas/defs/examples) from GlossWordNet (SQLite) to text file")
    show_info(args)
    output_with_sid_file = os.path.abspath('./data/glosstag_lemmas.txt')
    output_without_sid_file = os.path.abspath('./data/glosstag_lemmas_noss.txt')
    output_defs = os.path.abspath('./data/glosstag_defs.txt')
    output_exes = os.path.abspath('./data/glosstag_exes.txt')
    gwn = get_gwn(args)
    # Extract synsets' lemmas, definitions and examples
    if args.mockup:
        synsets = get_gwnxml(args).synsets
    else:
        synsets = gwn.all_synsets()
    synsets.synsets.sort(key=lambda x: x.sid.to_canonical())
    with open(output_defs, 'w') as def_file, \
            open(output_exes, 'w') as ex_file, \
            open(output_with_sid_file, 'w') as with_sid, \
            open(output_without_sid_file, 'w') as without_sid:
        # synsets = gwn.get_synsets_by_ids(['01828736-v', '00001740-r'])
        for ss in synsets:
            for t in sorted(ss.terms, key=lambda x: x.term):
                with_sid.write('%s\t%s\n' % (ss.sid.to_canonical(), t.term))
                without_sid.write('%s\n' % (t.term,))
            for gloss in ss.glosses:
                if gloss.cat == 'def':
                    def_file.write('{sid}\t{d}\n'.format(sid=ss.sid, d=gloss.text()))
                elif gloss.cat == 'ex':
                    ex_file.write('{sid}\t{ex}\n'.format(sid=ss.sid, ex=gloss.text()))
    # summary
    print("Data has been extracted to:")
    print(" + {}".format(output_with_sid_file))
    print(" + {}".format(output_without_sid_file))
    print(" + {}".format(output_defs))
    print(" + {}".format(output_exes))
    print("Extracted synsets: {}".format(len(synsets)))
    print("Done!")
import os

from puchikarui import Schema, with_ctx
from coolisf import GrammarHub
from chirptext.leutile import grouper
from chirptext.io import CSV
from chirptext import TextReport, FileHelper, Counter, FileHub
from chirptext.cli import CLIApp, setup_logging
from yawlib.helpers import get_gwn
from yawlib.helpers import get_wn, get_omw

# -------------------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------------------

DATA_FOLDER = os.path.abspath(os.path.expanduser('./data'))
omw = get_omw()
gwn = get_gwn()
wn = get_wn()
setup_logging('logging.json', 'logs')
ghub = GrammarHub()
MY_DIR = os.path.dirname(__file__)
SETUP_FILE = os.path.join(MY_DIR, 'scripts', 'ewdb.sql')
ROOTS = {'n': 'root_wn_n', 'v': 'root_wn_v', 'a': 'root_wn_adj', 'r': ''}
DEFAULT_DB_PATH = FileHelper.abspath('data/ewmap.db')


class EWDB(Schema):

    class Flags:
        PROCESSED = 1
        NO_PARSE = 2
        MWE = 3
        MWE_PURE = 100
def test_get_by_sk(self):
    ss = get_synset_by_sk(get_gwn(), 'test%2:41:00::', report_file=self.nullrep)
    self.assertIsNotNone(ss)