예제 #1
0
파일: wntk.py 프로젝트: letuananh/yawlib
def convert(wng_loc, wng_db_loc, createdb):
    """Import the merged Gloss WordNet XML files into a SQLite database.

    Args:
        wng_loc: Folder that contains the ``merged`` sub-folder holding the
            ``adv.xml``, ``adj.xml``, ``verb.xml`` and ``noun.xml`` files.
        wng_db_loc: Path of the SQLite database file handed to
            ``SQLiteGWordNet``.
        createdb: When truthy, ``DB_INIT_SCRIPT`` is run through
            ``db.setup`` before any data is imported.

    If ``wng_db_loc`` already exists as a file, the user has to type
    ``CONFIRM`` at the prompt; any other answer terminates the script.
    """
    merged_folder = os.path.join(wng_loc, 'merged')

    print("Path to glosstag folder: %s" % merged_folder)
    print("Path to output database: %s" % wng_db_loc)
    print("Script to execute: %s" % DB_INIT_SCRIPT)

    # Never touch an existing database file without explicit consent.
    if os.path.isfile(wng_db_loc):
        existing_size = os.path.getsize(wng_db_loc)
        print("DB file exists (%s | size: %s)" % (wng_db_loc, existing_size))
        reply = input("If you want to overwrite this file, please type CONFIRM: ")
        if reply != "CONFIRM":
            print("Script aborted!")
            exit()

    db = SQLiteGWordNet(wng_db_loc)
    if createdb:
        header('Preparing database file ...')
        db.setup(DB_INIT_SCRIPT)

    # The files are imported in this fixed order: adv, adj, verb, noun.
    xmlfiles = [
        os.path.join(merged_folder, filename)
        for filename in ('adv.xml', 'adj.xml', 'verb.xml', 'noun.xml')
    ]
    header('Importing data from XML to SQLite')
    xml2db(xmlfiles, db)
예제 #2
0
파일: wntk.py 프로젝트: letuananh/yawlib
def xml2db(xml_files, db):
    """Read Gloss WordNet XML files and insert their synsets into SQLite.

    Args:
        xml_files: Iterable of paths to Gloss WordNet XML files. Every file
            is read into the same ``XMLGWordNet`` instance.
        db: Database object providing ``insert_synsets``; it receives
            ``xmlgwn.synsets`` once all files have been read.
    """
    t = Timer()

    header("Extracting Gloss WordNet (XML)")
    xmlgwn = XMLGWordNet()
    for xml_file in xml_files:
        # Each file gets its own start/end timing pair.
        t.start('Reading file: %s' % xml_file)
        xmlgwn.read(xml_file)
        t.end("Extraction completed %s" % xml_file)

    # All synsets collected above are inserted in a single call.
    header("Inserting data into SQLite database")
    t.start()
    db.insert_synsets(xmlgwn.synsets)
    t.end('Insertion completed.')
예제 #3
0
def main():
    """Compare the synset inventories of WordNet-NTUMC and WordNet SQL 3.0.

    Prints the synset count and a five-item sample for each source, then
    writes the synset IDs that appear in only one of the two sources to
    ``NTUMC_NEW_SYNSETS`` and ``WN30_NEW_SYNSETS`` respectively, one
    ``<id>: <text>`` line per synset. Both report files are written as
    UTF-8 so that non-ASCII definitions cannot fail on the locale encoding.
    """
    print("Script to compare WNSQL30 to WN-NTUMC")
    wnntu = WordNetNTUMC(WN_NTUMC_FILE)
    wn30 = WordNetSQL(WORDNET_30_PATH)

    header("WordNet-NTUMC")
    ssntu = wnntu.get_all_synsets()
    print("Synset count: %s " % (len(ssntu),))
    for ss in ssntu[:5]:
        print(ss)
    # IDs ending in 'r' are rewritten to end in 'a' before comparison.
    # NOTE(review): presumably this aligns NTUMC suffixes with the canonical
    # WN30 IDs -- confirm against the data.
    sidntu = {
        ss.synset[:-1] + 'a' if ss.synset.endswith('r') else ss.synset
        for ss in ssntu
    }

    header("WordNet SQL 3.0")
    sensemap = wn30.all_senses()
    sswn30 = []
    wn30sensemap = {}
    for sses in sensemap.values():
        for ss in sses:
            sswn30.append(ss)
            wn30sensemap[ss.get_canonical_synsetid()] = ss
    print("Synset count: %s " % (len(sswn30),))
    for ss in sswn30[:5]:
        print("%s: %s" % (ss.get_canonical_synsetid(),
                          wn30.get_senseinfo_by_sid(ss.sid),))
    # The map keys are exactly the canonical IDs of every collected synset,
    # so reuse them instead of recomputing each ID a second time.
    sidwn30 = set(wn30sensemap)

    header("synsets in WNNTUMC but not in WNSQL30")
    sids = sidntu.difference(sidwn30)
    print(len(sids))
    with open(NTUMC_NEW_SYNSETS, 'w', encoding='utf-8') as ntuout:
        for sid in sids:
            definitions = ' | '.join([x._2 for x in wnntu.get_synset_def(sid)])
            ntuout.write("%s: %s\n" % (sid, definitions))

    header("synsets in WNSQL30 but not in WNNTUMC")
    sids = sidwn30.difference(sidntu)
    print(len(sids))
    with open(WN30_NEW_SYNSETS, 'w', encoding='utf-8') as wn30out:
        for sid in sids:
            wn30out.write("%s: %s\n" % (sid, wn30sensemap[sid].gloss))
예제 #4
0
def convert(args):
    """Convert Gloss WordNet XML into SQLite format.

    Args:
        args: Parsed command-line arguments. ``args.glossdb`` is the path of
            the target database file; the whole object is also passed on to
            ``show_info``, ``get_gwn`` and ``get_gwnxml``.

    If the target file already exists and is not empty, the user has to type
    ``CONFIRM`` at the prompt; any other answer terminates the script.
    """
    show_info(args)

    target = args.glossdb
    # Only a non-empty existing database needs an overwrite confirmation.
    if os.path.isfile(target) and os.path.getsize(target) > 0:
        notice = "DB file exists (%s | size: %s)" % (
            target, os.path.getsize(target))
        print(notice)
        reply = input(
            "If you want to overwrite this file, please type CONFIRM: ")
        if reply != "CONFIRM":
            print("Script aborted!")
            exit()

    db = get_gwn(args)
    header('Importing data from XML to SQLite')
    stopwatch = Timer()

    header("Extracting Gloss WordNet (XML)")
    xmlgwn = get_gwnxml(args)

    # Only the insertion step is timed.
    header("Inserting data into SQLite database")
    stopwatch.start()
    db.insert_synsets(xmlgwn.synsets)
    stopwatch.end('Insertion completed.')