コード例 #1
0
ファイル: extractor.py プロジェクト: hoangducchinh/yawlib
def export_wnsql_synsets(args):
    """Export lemmas, definitions and examples from WordnetSQL to text files.

    Four UTF-8 encoded text files are written under ``./data`` (resolved
    against the current working directory):

    * ``wn30_lemmas.txt``: one line per (synset, lemma) pair holding the
      synset ID, a tab and the lemma; sorted by synset ID, then lemma.
    * ``wn30_lemmas_noss.txt``: the same lemmas in the same order, without
      the synset ID column.
    * ``wn30_defs.txt``: synset ID, a tab and the definition; sorted by ID.
    * ``wn30_exes.txt``: synset ID, a tab and the example; sorted by ID.

    Synset IDs are written in the form returned by
    ``SynsetID.to_canonical()``.

    Args:
        args: Parsed command-line options; passed to ``show_info`` and
            ``get_wn`` unchanged.
    """
    print(
        "Exporting synsets' info (lemmas/defs/examples) from WordnetSQL (Princeton Wordnet 3.0) to text file"
    )
    show_info(args)
    output_with_sid_file = os.path.abspath('./data/wn30_lemmas.txt')
    output_without_sid_file = os.path.abspath('./data/wn30_lemmas_noss.txt')
    output_defs = os.path.abspath('./data/wn30_defs.txt')
    output_exes = os.path.abspath('./data/wn30_exes.txt')
    wn = get_wn(args)

    def canonical_id(record):
        # Always go through str() so the three queries below convert their
        # IDs the same way, whatever type the database layer hands back.
        return SynsetID.from_string(str(record.synsetid)).to_canonical()

    # Extract lemmas
    synsets_lemmas = sorted(
        ((canonical_id(r), r.lemma) for r in wn.get_all_synsets()),
        key=itemgetter(0, 1))
    # Explicit encoding: the output must not depend on the platform locale.
    with open(output_with_sid_file, 'w', encoding='utf-8') as with_sid, \
            open(output_without_sid_file, 'w',
                 encoding='utf-8') as without_sid:
        for row in synsets_lemmas:
            with_sid.write('%s\t%s\n' % row)
            without_sid.write('%s\n' % (row[1], ))  # just the lemma

    # Extract synset definitions
    # The sort is stable, so rows sharing an ID keep the order of the query.
    synsets_defs = sorted(
        ((canonical_id(r), r.definition)
         for r in wn.schema.ss.select(orderby='synsetid')),
        key=itemgetter(0))
    with open(output_defs, 'w', encoding='utf-8') as def_file:
        for row in synsets_defs:
            def_file.write('%s\t%s\n' % row)

    # Extract examples
    synsets_examples = sorted(
        ((canonical_id(r), r.sample)
         for r in wn.schema.ex.select(orderby='synsetid')),
        key=itemgetter(0))
    with open(output_exes, 'w', encoding='utf-8') as ex_file:
        for row in synsets_examples:
            ex_file.write('%s\t%s\n' % row)

    # summary
    print("Data has been extracted to:")
    print("  + {}".format(output_with_sid_file))
    print("  + {}".format(output_without_sid_file))
    print("  + {}".format(output_defs))
    print("  + {}".format(output_exes))
    print("Done!")
コード例 #2
0
ファイル: extractor.py プロジェクト: hoangducchinh/yawlib
def export_gwnsql_synsets(args):
    """Export lemmas, definitions and examples from GlossWordNet to text files.

    Synsets are taken from ``get_gwnxml(args).synsets`` when ``args.mockup``
    is truthy, otherwise from ``get_gwn(args).all_synsets()``. The
    collection's ``synsets`` list is sorted in place by canonical synset ID
    before writing.

    Four UTF-8 encoded text files are written under ``./data`` (resolved
    against the current working directory):

    * ``glosstag_lemmas.txt``: synset ID, a tab and a term; the terms of
      each synset are sorted alphabetically.
    * ``glosstag_lemmas_noss.txt``: the same terms in the same order,
      without the synset ID column.
    * ``glosstag_defs.txt``: synset ID, a tab and the text of each gloss
      whose ``cat`` is ``'def'``.
    * ``glosstag_exes.txt``: synset ID, a tab and the text of each gloss
      whose ``cat`` is ``'ex'``.

    Every file uses the ID returned by ``sid.to_canonical()``.

    Args:
        args: Parsed command-line options; ``args.mockup`` selects the data
            source, and the object is passed to ``show_info``, ``get_gwn``
            and ``get_gwnxml``.
    """
    print(
        "Exporting synsets' info (lemmas/defs/examples) from GlossWordNet (SQLite) to text file"
    )
    show_info(args)
    output_with_sid_file = os.path.abspath('./data/glosstag_lemmas.txt')
    output_without_sid_file = os.path.abspath(
        './data/glosstag_lemmas_noss.txt')
    output_defs = os.path.abspath('./data/glosstag_defs.txt')
    output_exes = os.path.abspath('./data/glosstag_exes.txt')
    gwn = get_gwn(args)

    # Extract synsets' lemmas, definitions and examples
    if args.mockup:
        synsets = get_gwnxml(args).synsets
    else:
        synsets = gwn.all_synsets()

    synsets.synsets.sort(key=lambda x: x.sid.to_canonical())
    # Explicit encoding: the output must not depend on the platform locale.
    with open(output_defs, 'w', encoding='utf-8') as def_file, \
            open(output_exes, 'w', encoding='utf-8') as ex_file, \
            open(output_with_sid_file, 'w', encoding='utf-8') as with_sid, \
            open(output_without_sid_file, 'w',
                 encoding='utf-8') as without_sid:
        for ss in synsets:
            # One canonical ID per synset, shared by all files so the ID
            # column has the same format everywhere.
            sid = ss.sid.to_canonical()
            for t in sorted(ss.terms, key=lambda x: x.term):
                with_sid.write('%s\t%s\n' % (sid, t.term))
                without_sid.write('%s\n' % (t.term, ))
            for gloss in ss.glosses:
                if gloss.cat == 'def':
                    def_file.write('{sid}\t{d}\n'.format(sid=sid,
                                                         d=gloss.text()))
                elif gloss.cat == 'ex':
                    ex_file.write('{sid}\t{ex}\n'.format(sid=sid,
                                                         ex=gloss.text()))
    # summary
    print("Data has been extracted to:")
    print("  + {}".format(output_with_sid_file))
    print("  + {}".format(output_without_sid_file))
    print("  + {}".format(output_defs))
    print("  + {}".format(output_exes))
    print("Extracted synsets: {}".format(len(synsets)))
    print("Done!")
コード例 #3
0
ファイル: extractor.py プロジェクト: hoangducchinh/yawlib
def glosstag2ntumc(args):
    """Placeholder for the Glosstag to NTU-MC extraction (not implemented).

    Prints a banner, calls ``show_info(args)`` and then reports that the
    feature is still to be developed. Returns ``None``.

    Args:
        args: Parsed command-line options; only forwarded to ``show_info``.
    """
    print("Extracting Glosstag to NTU-MC")
    show_info(args)
    # Nothing else happens yet: the conversion itself has not been written.
    print("To be developed")