def convert(wng_loc, wng_db_loc, createdb):
    ''' Convert Gloss WordNet into SQLite

    Arguments:
        wng_loc    -- folder that contains the 'merged' sub-folder holding
                      the XML files (adv/adj/verb/noun)
        wng_db_loc -- path of the SQLite database file to write to
        createdb   -- when truthy, db.setup(DB_INIT_SCRIPT) is called
                      before the import starts

    If a file already exists at wng_db_loc, the user must type CONFIRM
    at the prompt; any other answer aborts the script via exit().
    '''
    merged_folder = os.path.join(wng_loc, 'merged')
    print("Path to glosstag folder: %s" % merged_folder)
    print("Path to output database: %s" % wng_db_loc)
    print("Script to execute: %s" % DB_INIT_SCRIPT)

    # Never continue on top of an existing file without explicit consent.
    # NOTE(review): nothing here removes the old file after CONFIRM --
    # presumably the database layer deals with it; verify.
    if os.path.isfile(wng_db_loc):
        existing_size = os.path.getsize(wng_db_loc)
        print("DB file exists (%s | size: %s)" % (wng_db_loc, existing_size))
        if input("If you want to overwrite this file, please type CONFIRM: ") != "CONFIRM":
            print("Script aborted!")
            exit()

    db = SQLiteGWordNet(wng_db_loc)
    if createdb:
        header('Preparing database file ...')
        db.setup(DB_INIT_SCRIPT)

    # Files are imported in this fixed order (a 'test.xml' entry is
    # deliberately not part of the list).
    pos_files = ('adv.xml', 'adj.xml', 'verb.xml', 'noun.xml')
    xmlfiles = [os.path.join(merged_folder, name) for name in pos_files]

    header('Importing data from XML to SQLite')
    xml2db(xmlfiles, db)
def xml2db(xml_files, db):
    ''' Read Gloss WordNet XML files and insert their synsets into SQLite

    Arguments:
        xml_files -- iterable of paths to XML files; each one is passed to
                     XMLGWordNet.read() on the same reader object, in the
                     order given
        db        -- object providing insert_synsets(); it receives
                     xmlgwn.synsets once, after all files have been read

    Progress is reported through header() and a Timer.
    '''
    t = Timer()

    header("Extracting Gloss WordNet (XML)")
    xmlgwn = XMLGWordNet()
    for xml_file in xml_files:
        # One-element tuples keep the formatting safe for any path object.
        t.start('Reading file: %s' % (xml_file,))
        xmlgwn.read(xml_file)
        t.end("Extraction completed %s" % (xml_file,))

    # Banner text fixed: a stray non-English token had slipped into it.
    header("Inserting data into SQLite database")
    t.start()
    db.insert_synsets(xmlgwn.synsets)
    t.end('Insertion completed.')
def main():
    ''' Compare the synset IDs of WordNet-NTUMC against WordNet SQL 3.0

    Loads both resources, prints a count and the first five entries of
    each, then writes two reports:
        NTUMC_NEW_SYNSETS -- IDs found only in WN-NTUMC, with definitions
        WN30_NEW_SYNSETS  -- IDs found only in WNSQL30, with their gloss
    '''
    print("Script to compare WNSQL30 to WN-NTUMC")
    wnntu = WordNetNTUMC(WN_NTUMC_FILE)
    wn30 = WordNetSQL(WORDNET_30_PATH)

    # ---- WN-NTUMC side ----
    header("WordNet-NTUMC")
    ntumc_synsets = wnntu.get_all_synsets()
    print("Synset count: %s " % (len(ntumc_synsets),))
    for entry in ntumc_synsets[:5]:
        print(entry)

    # IDs ending in 'r' get that last character replaced by 'a' before
    # comparison.
    # NOTE(review): presumably aligns the part-of-speech suffix with the
    # canonical IDs used on the WNSQL30 side -- confirm.
    ntumc_ids = set()
    for entry in ntumc_synsets:
        synset_id = entry.synset
        if synset_id.endswith('r'):
            synset_id = synset_id[:-1] + 'a'
        ntumc_ids.add(synset_id)

    # ---- WNSQL30 side ----
    header("WordNet SQL 3.0")
    sense_groups = wn30.all_senses()
    wn30_senses = [sense for group in sense_groups.values() for sense in group]
    # When several senses share a canonical ID the last one seen wins.
    wn30_by_id = {sense.get_canonical_synsetid(): sense for sense in wn30_senses}
    print("Synset count: %s " % (len(wn30_senses),))
    for sense in wn30_senses[:5]:
        canonical_id = sense.get_canonical_synsetid()
        print("%s: %s" % (canonical_id, wn30.get_senseinfo_by_sid(sense.sid)))
    wn30_ids = {sense.get_canonical_synsetid() for sense in wn30_senses}

    # ---- Report: only in WN-NTUMC ----
    header("synsets in WNNTUMC but not in WNSQL30")
    only_in_ntumc = ntumc_ids - wn30_ids
    print(len(only_in_ntumc))
    with open(NTUMC_NEW_SYNSETS, 'w') as ntuout:
        for sid in only_in_ntumc:
            # Each definition row contributes its _2 field to the line.
            definitions = ' | '.join(row._2 for row in wnntu.get_synset_def(sid))
            ntuout.write("%s: %s\n" % (sid, definitions))

    # ---- Report: only in WNSQL30 ----
    header("synsets in WNSQL30 but not in WNNTUMC")
    only_in_wn30 = wn30_ids - ntumc_ids
    print(len(only_in_wn30))
    with open(WN30_NEW_SYNSETS, 'w') as wn30out:
        for sid in only_in_wn30:
            wn30out.write("%s: %s\n" % (sid, wn30_by_id[sid].gloss))
def convert(args):
    ''' Convert Gloss WordNet XML into SQLite format

    Arguments:
        args -- options object; args.glossdb is read here as the path of
                the SQLite file, and args itself is handed on to
                show_info(), get_gwn() and get_gwnxml()

    If args.glossdb is an existing, non-empty file the user must type
    CONFIRM at the prompt; any other answer aborts the script via exit().
    This function itself does not delete or truncate the existing file.
    '''
    show_info(args)

    db_path = args.glossdb
    # A missing or zero-byte file needs no confirmation.
    has_existing_data = os.path.isfile(db_path) and os.path.getsize(db_path) > 0
    if has_existing_data:
        print("DB file exists (%s | size: %s)" % (db_path, os.path.getsize(db_path)))
        if input("If you want to overwrite this file, please type CONFIRM: ") != "CONFIRM":
            print("Script aborted!")
            exit()

    db = get_gwn(args)
    header('Importing data from XML to SQLite')
    t = Timer()

    header("Extracting Gloss WordNet (XML)")
    xmlgwn = get_gwnxml(args)

    # Only the insertion step is timed.
    header("Inserting data into SQLite database")
    t.start()
    db.insert_synsets(xmlgwn.synsets)
    t.end('Insertion completed.')