Example #1
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    bert_vectorizer = BertVectorizer(args.bert_path)

    if 'ruwordnet_path' in args:
        ruwordnet = RuWordnet(args.ruwordnet_path, None)
        synsets = defaultdict(list)
        for sense_id, synset_id, text in ruwordnet.get_all_senses():
            if synset_id.endswith(args.pos):
                synsets[synset_id].append(text.lower())
        bert_vectorizer.vectorize_groups(synsets,
                                         args.output_path,
                                         to_upper=False)

    if 'wordnet_old' in args:
        wn_old = WordNetCorpusReader(args.wordnet_old, None)
        wn_new = WordNetCorpusReader(args.wordnet_new, None)
        synsets = compute_synsets_from_wordnets(wn_old, wn_new, args.pos)
        bert_vectorizer.vectorize_groups(synsets,
                                         args.output_path,
                                         to_upper=False)

    if "data_path" in args:
        data = read_file(args.data_path, lower=args.upper)
        bert_vectorizer.vectorize_data(data,
                                       args.output_path,
                                       upper=args.upper)
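The membership tests above ('ruwordnet_path' in args and so on) work because argparse.Namespace implements __contains__ over its attributes; parse_args itself is truncated in this excerpt. A minimal standalone demonstration (not part of the original script):

import argparse

# Minimal demonstration (assumption: not from the original script). A key is
# "in" a Namespace only if the corresponding attribute was set on it.
ns = argparse.Namespace(ruwordnet_path="ruwordnet.db", pos="N")
print("ruwordnet_path" in ns)  # True
print("wordnet_old" in ns)     # False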
Example #2
from nltk.corpus import WordNetCorpusReader
from fasttext_vectorize_en import compute_synsets_from_wordnets

wn2 = WordNetCorpusReader(
    'D:\\dialogue2020\\semevals\\semeval-2016-task-14\\WN1.7', None)
wn3 = WordNetCorpusReader('D:\\dialogue2020\\semeval-2016-task-14\\WN3.0',
                          None)
input_path = "D:/dialogue2020/semeval-2016-task-14/reader/"
vector_path = "models/vectors/fasttext/en/"

# vectorize wordnet
noun_synsets = compute_synsets_from_wordnets(wn2, wn3, 'n')
verb_synsets = compute_synsets_from_wordnets(wn2, wn3, 'v')
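compute_synsets_from_wordnets is a project helper imported from fasttext_vectorize_en, and its implementation is not shown here. As orientation only, a hedged sketch of what such a function could do (collect lemma groups for synsets that exist in the newer WordNet but not in the older one) might look like this:

# Hedged sketch; the real compute_synsets_from_wordnets may work differently.
def new_synset_groups(wn_old, wn_new, pos):
    old_names = {s.name() for s in wn_old.all_synsets(pos)}
    groups = {}
    for synset in wn_new.all_synsets(pos):
        if synset.name() not in old_names:
            # group each new synset with the surface forms of its lemmas
            groups[synset.name()] = [l.name().replace('_', ' ')
                                     for l in synset.lemmas()]
    return groups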
Example #3
import os
import xml.etree.ElementTree as ET

import nltk
from nltk.corpus import WordNetCorpusReader
from sqlalchemy import *
from xml.dom import minidom
from nltk.corpus import wordnet as wn

import difflib
import pickle

# load WordNet-1.6
cwd = os.getcwd()
nltk.data.path.append(cwd)
wordnet16_dir = "resources/wordnet-1.6/"
wn16_path = "{0}/dict".format(wordnet16_dir)
WN16 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn16_path)),
                           nltk.data.find(wn16_path))


# load Wordnet-Affect synsets
# corpus: a-synset.xml
# return: {
#   'noun': {
#     '05586574': { 'categ': 'electricity', 'pos': 'noun', 'offset16': '05586574' }
#   }, ...
# }
def load_asynsets(corpus):
    tree = ET.parse(corpus)
    root = tree.getroot()

    asynsets = {}
    for pos in ["noun", "adj", "verb", "adv"]:
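The excerpt breaks off inside load_asynsets. Purely as a hedged sketch, assuming the common WordNet-Affect a-synsets.xml layout (per-POS lists holding elements such as <noun-syn id="n#05586574" categ="electricity"/>), the function could continue along these lines:

import xml.etree.ElementTree as ET

# Hedged sketch, not the original continuation; the XML layout is an assumption.
def load_asynsets_sketch(corpus):
    root = ET.parse(corpus).getroot()
    asynsets = {}
    for pos in ["noun", "adj", "verb", "adv"]:
        asynsets[pos] = {}
        for elem in root.iter("%s-syn" % pos):
            offset = elem.get("id").split("#")[-1]  # "n#05586574" -> "05586574"
            asynsets[pos][offset] = {"categ": elem.get("categ"),
                                     "pos": pos,
                                     "offset16": offset}
    return asynsets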
Example #4
import os
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import *
from sqlalchemy import *
from xml.dom import minidom
from sqlite3 import dbapi2 as sqlite
from functools import reduce

cwd = os.getcwd()
os.environ["NLTK_DATA"] = cwd
import nltk
from nltk.corpus import WordNetCorpusReader

WN16_DIR = "resources/wordnet-1.6/dict"
WN30_DIR = "resources/WordNet-3.0/dict"
WN16 = WordNetCorpusReader(cwd + "/" + WN16_DIR, nltk.data.find(WN16_DIR))
WN = WordNetCorpusReader(cwd + "/" + WN30_DIR, nltk.data.find(WN30_DIR))
DB = create_engine('sqlite:///resources/wnjpn.db')

# load Wordnet-Affect synsets
# corpus: a-synset.xml
# return: {
#   'noun': {
#     '05586574': { 'categ': 'electricity', 'pos': 'noun', 'offset16': '05586574' }
#   }, ...
# }
def load_asynsets(corpus):
    tree = ET.parse(corpus)
    root = tree.getroot()

    asynsets = {}
Example #5
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import cmudict, wordnet, WordNetCorpusReader, words
from Word import *
from Properties import *

#from sentiwordnet import SentiWordNetCorpusReader, SentiSynset

from AffectTree import *
import pickle

#preparing resources...
#pronunciation dictionary - CMU
pron_dict = cmudict.dict()
#SentiWordNet
#swn = SentiWordNetCorpusReader(swn_filename)
wordnet_1_6 = WordNetCorpusReader(wn_1_6_corpus_root)
#wordnet dict for id search
english_wordlist = list(w.lower() for w in nltk.corpus.words.words())
with open(Properties.anew_filename, 'r') as f:
    anew_list = {
        l.rstrip().split("\t")[0]:
        (float(l.rstrip().split("\t")[2]), float(l.rstrip().split("\t")[4]))
        for l in f.readlines()[1:]
    }
web_text = nltk.Text(word.lower() for word in nltk.corpus.webtext.words())
#if '_word_context_index' not in web_text.__dict__:
#    print 'Building word-context index...'
#    word_context_index = ContextIndex(web_text.tokens, filter=lambda x:x.isalpha(), key=lambda s:s.lower())


def pickle_wn_dict_id():
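The ANEW lookup above is built with a fairly dense dict comprehension that splits each line three times. A clearer equivalent, assuming the file is tab-separated with the word in column 0, the valence mean in column 2 and the arousal mean in column 4 (plus one header line), would be:

# Equivalent, more readable ANEW parsing (the column meaning is an assumption):
anew_list = {}
with open(Properties.anew_filename, 'r') as f:
    next(f)  # skip the header line
    for line in f:
        fields = line.rstrip().split("\t")
        anew_list[fields[0]] = (float(fields[2]), float(fields[4]))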
Example #6
def main():
    args = parse_args()

    description1 = "---- File {0} took {1} seconds ----\n"
    description2 = "All: {0}, Found: {1}, Left: {2}"
    # shifted indices so the combined template can be filled with a single call
    description = description1 + "All: {2}, Found: {3}, Left: {4}"

    if "ruwordnet_path1" in args:
        file_paths = tqdm([
            os.path.join(x, i) for x, _, z in os.walk(args.corpus_path)
            for i in z
        ])

        # ------------ RuWordnet initialization ------------
        ruwordnet1 = RuWordnet(db_path=args.ruwordnet_path1, ruwordnet_path="")
        ruwordnet2 = RuWordnet(db_path=args.ruwordnet_path2, ruwordnet_path="")
        senses = ruwordnet1.get_all_senses() + ruwordnet2.get_all_senses()
        synset_senses, sense2synset = create_senses_data(senses, args.pos)
        synsets = set(ruwordnet1.get_all_ids(args.pos))
        print(sense2synset)
        # ------------ Find contexts ------------
        # for filename in file_paths:
        #     start_time = time.time()
        #     retrieve_ruwordnet_positions(filename, args.output_path, synset_senses, sense2synset)
        #     file_paths.set_description(description.format(filename, (time.time() - start_time),
        #                                                   len(synsets), len(found_lemmas),
        #                                                   len(synsets.difference(set(found_lemmas)))))
        #
        # print(description2.format(len(synsets), len(found_lemmas), len(synsets.difference(set(found_lemmas)))))
        # print(found_lemmas)
        # print(synsets.difference(set(found_lemmas)))

    if "wordnet_old" in args:
        wordnet_old = WordNetCorpusReader(args.wordnet_old, None)
        wordnet_new = WordNetCorpusReader(args.wordnet_new, None)
        synsets = compute_synsets_from_wordnets(wordnet_old, wordnet_new, 'n')

        for synset in synsets:
            print(
                set([i.name() for i in wordnet_old.synset(synset).lemmas()] +
                    [i.name() for i in wordnet_new.synset(synset).lemmas()]))
        # for filename in file_paths:
        #     start_time = time.time()
        #     retrieve_ruwordnet_positions(filename, args.output_path, synset_senses, sense2synset)
        #     file_paths.set_description(description.format(filename, (time.time() - start_time),
        #                                                   len(synsets), len(found_lemmas),
        #                                                   len(synsets.difference(set(found_lemmas)))))
        #
        # print(description2.format(len(synsets), len(found_lemmas), len(synsets.difference(set(found_lemmas)))))
        # print(found_lemmas)
        # print(synsets.difference(set(found_lemmas)))

    elif "data_path" in args:
        file_paths = tqdm([
            os.path.join(x, i) for x, _, z in os.walk(args.corpus_path)
            for i in z
        ])

        data = read_test_data(args.data_path)
        # NOTE: found_lemmas is not defined in this excerpt; it is presumably
        # maintained by retrieve_word_positions (e.g. as a module-level set).
        for filename in file_paths:
            start_time = time.time()
            retrieve_word_positions(filename, args.output_path, data)
            file_paths.set_description(
                description.format(filename, (time.time() - start_time),
                                   len(data), len(found_lemmas),
                                   len(data.difference(set(found_lemmas)))))

        print(
            description2.format(len(data), len(found_lemmas),
                                len(data.difference(set(found_lemmas)))))
        print(found_lemmas)
        print(data.difference(set(found_lemmas)))
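create_senses_data is another project helper that is not shown in this excerpt. Inferring from the calling code, and assuming each sense is a (sense_id, synset_id, text) tuple as in Example #1, a minimal sketch of such a helper could be:

from collections import defaultdict

# Hedged sketch; names and behaviour are inferred from the calling code,
# not taken from the project's implementation.
def create_senses_data_sketch(senses, pos):
    synset_senses = defaultdict(list)  # synset id -> lower-cased sense texts
    sense2synset = {}                  # sense text -> synset id
    for sense_id, synset_id, text in senses:
        if synset_id.endswith(pos):    # synset ids end with a POS tag, as in Example #1
            text = text.lower()
            synset_senses[synset_id].append(text)
            sense2synset[text] = synset_id
    return synset_senses, sense2synset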
Example #7

import nltk
from nltk.corpus import wordnet
from nltk.corpus import WordNetCorpusReader
print('loading wordnet2.0')
wn2 = WordNetCorpusReader(nltk.data.find('corpora/wordnet2.0'), None)
print('done loading')

print('loading wordnet3.0')
wn3 = WordNetCorpusReader(nltk.data.find('corpora/wordnet'), None)
print('done loading')

S2 = wn2.synset
L = wn2.lemma
S3 = wn3.synset


# dict keyed by a WordNet 2.0 synset offset; value: the synset's domain
domain_list = {}

# dict keyed by a WordNet 2.0 synset offset; value: the WordNet 2.0 synset
synset2_list = {}

# dict keyed by a WordNet 3.0 synset offset; value: the WordNet 3.0 synset
synset3_list = {}

def addZero(number):
    numberStr = str(number)
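addZero is cut off here; judging from the name and from how WordNet synset offsets are usually written, it presumably left-pads a numeric offset with zeros to a fixed width. A hedged one-line equivalent:

# Hedged guess at addZero's intent: zero-pad an offset, e.g. 5586574 -> "05586574".
def add_zero(number, width=8):
    return str(number).zfill(width)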
Example #8
    def setUp(self):
        self.wncr = WordNetCorpusReader(
            resource_filename('clasificador.recursos', 'wordnet_spa'), None)
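This fragment is a unittest setUp method; the surrounding class and imports are not part of the excerpt. A hypothetical minimal context (class name, imports and the smoke test are assumptions) might be:

import unittest

from pkg_resources import resource_filename
from nltk.corpus import WordNetCorpusReader


class SpanishWordnetTest(unittest.TestCase):  # hypothetical class name
    def setUp(self):
        self.wncr = WordNetCorpusReader(
            resource_filename('clasificador.recursos', 'wordnet_spa'), None)

    def test_synsets_lookup(self):
        # hypothetical smoke test: the reader returns a (possibly empty) list
        self.assertIsInstance(self.wncr.synsets('perro'), list)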