def build_graph(data, mode, graph=None, rel=None, pos_list=None, modelpath=None):
    '''
    4 usages:
        graph,wordset = build_graph(data,'calculate_path', rel=['ss','ant','sim'], modelpath='../data/model/glove.42B.300d.txt')
        graph,wordset = build_graph(data,'calculate_path', rel=['ss','ant','sim'])
        graph,wordset = build_graph(data,'calculate_score', rel=['ss','ant','sim'], modelpath='../data/model/glove.42B.300d.txt')
        graph,wordset = build_graph(data,'calculate_score', rel=['ss','ant','sim'])
    '''

    word_set = set()
    graph = graph if graph else collections.defaultdict(dict)
    wnl = nltk.WordNetLemmatizer()

    if modelpath:
        model = txt_to_wordvecmodel(modelpath)
        model_wordset = set(model.keys())
    else:
        model = None

    for i, word in enumerate(data):
        #225534 words in data
        if i % 10000 == 0: print i
        if type(word) == str:
            word = word.decode('utf-8')
        word = word.lower()
        word_exist_in_wordnet = wn.morphy(word) ### bring lots of noises
        if word_exist_in_wordnet:
            word = wnl.lemmatize(word)
            word_set.add(word)
            ## 35304 remain

            pos_list = pos_filter(word, LJ40K, ['s','a'])
            if pos_list == None: pos_list=[None]
            for i,p in enumerate(pos_list):
                related_words = get_all_synonyms_antonyms(word, rel, p)
                rws = list(related_words)
                if rws:
                    for w in rws:
                        related_word = w[0].lower()
                        related_word_type = w[2]
                        if model:
                            if related_word not in model_wordset or word not in model_wordset: continue
                            if mode == 'calculate_path':
                                graph_for_calculate_path(word, related_word, graph, model)
                            elif mode == 'calculate_score':
                                graph_for_calculate_score(word, related_word, related_word_type, graph, model)
                        else:
                            if mode == 'calculate_path':
                                graph_for_calculate_path(word, related_word, graph)
                            elif mode == 'calculate_score':
                                graph_for_calculate_score(word, related_word, related_word_type, graph)
    return graph, word_set
# --- Example #2 (scraped-page artifact; kept as a comment so the file parses) ---
# votes: 0
def build_graph(data,
                mode,
                graph=None,
                rel=None,
                pos_list=None,
                modelpath=None):
    '''
    4 usages:
        graph,wordset = build_graph(data,'calculate_path', rel=['ss','ant','sim'], modelpath='../data/model/glove.42B.300d.txt')
        graph,wordset = build_graph(data,'calculate_path', rel=['ss','ant','sim'])
        graph,wordset = build_graph(data,'calculate_score', rel=['ss','ant','sim'], modelpath='../data/model/glove.42B.300d.txt')
        graph,wordset = build_graph(data,'calculate_score', rel=['ss','ant','sim'])
    '''

    word_set = set()
    graph = graph if graph else collections.defaultdict(dict)
    wnl = nltk.WordNetLemmatizer()

    if modelpath:
        model = txt_to_wordvecmodel(modelpath)
        model_wordset = set(model.keys())
    else:
        model = None

    for i, word in enumerate(data):
        #225534 words in data
        if i % 10000 == 0: print i
        if type(word) == str:
            word = word.decode('utf-8')
        word = word.lower()
        word_exist_in_wordnet = wn.morphy(word)  ### bring lots of noises
        if word_exist_in_wordnet:
            word = wnl.lemmatize(word)
            word_set.add(word)
            ## 35304 remain

            pos_list = pos_filter(word, LJ40K, ['s', 'a'])
            if pos_list == None: pos_list = [None]
            for i, p in enumerate(pos_list):
                related_words = get_all_synonyms_antonyms(word, rel, p)
                rws = list(related_words)
                if rws:
                    for w in rws:
                        related_word = w[0].lower()
                        related_word_type = w[2]
                        if model:
                            if related_word not in model_wordset or word not in model_wordset:
                                continue
                            if mode == 'calculate_path':
                                graph_for_calculate_path(
                                    word, related_word, graph, model)
                            elif mode == 'calculate_score':
                                graph_for_calculate_score(
                                    word, related_word, related_word_type,
                                    graph, model)
                        else:
                            if mode == 'calculate_path':
                                graph_for_calculate_path(
                                    word, related_word, graph)
                            elif mode == 'calculate_score':
                                graph_for_calculate_score(
                                    word, related_word, related_word_type,
                                    graph)
    return graph, word_set
# --- Example #3 (scraped-page artifact; kept as a comment so the file parses) ---
# votes: 0
# -*- coding: utf8 -*-
import sys, os
sys.path.append("../")
from models.modeltools import txt_to_wordvecmodel, wordvecmodel_to_txt, wordvecmodel_filter
import cPickle as pickle

'''
input:
model1, model2, filter_set, filepath1, filepath2
'''
if __name__ == '__main__':

    # Load the raw GloVe text model plus a pickled semantic-similarity model
    # and the keyword set used to filter both.
    # NOTE(review): pickle.load(open(...)) leaves file handles unclosed and
    # opens in text mode — confirm 'rb' / with-statements aren't needed here.
    model1 = txt_to_wordvecmodel(filepath='../data/model/glove.42B.300d.txt')
    model2 = pickle.load(open('../data/model/model_wordvec_semantic_similarity_lemma_35304_42b_Feeling_Wheel.pkl'))
    filter_set = pickle.load(open('../data/wordset/wordsetlemma_basickeyword_LJ40K_FeelingWheel_1122.pkl'))

    # First pass: restrict both models to the pickled keyword set.
    model1 = wordvecmodel_filter(model1, filter_set)
    model2 = wordvecmodel_filter(model2, filter_set)

    # Second pass: keep only words present in BOTH models so the two
    # filtered models end up over the same vocabulary.
    filter_set = set(model1.keys()) & set(model2.keys())

    model1 = wordvecmodel_filter(model1, filter_set)
    model2 = wordvecmodel_filter(model2, filter_set)
    

    print 'length of the filtered model1 is ',len(model1)
    print 'length of the filtered model2 is ',len(model2)

    # Output paths for the textSNE test data — presumably written later via
    # wordvecmodel_to_txt; the continuation is not visible in this chunk.
    filepath1 = '../textSNE/testdata/w2v_worddict_42B_1122.txt'
    filepath2 = '../textSNE/testdata/model_wordvec_semantic_similarity_lemma_35304_42b_Feeling_Wheel_1122.txt'