Example #1

import gc

import jieba

# module paths assumed from the other examples on this page
from SIF import data_io, SIF_embedding


def prepare(self, all_sents):
    # flatten the nested list of sentences
    all_sents = [sent for sent_list in all_sents for sent in sent_list]
    self.generate_wordvecs()
    # tokenize with jieba and keep at most sent_len_limit tokens per sentence
    sentences = [
        ' '.join(list(jieba.cut(sent))[:self.sent_len_limit])
        for sent in all_sents
    ]
    weightpara = 1e-3
    (words, We) = data_io.getWordmap(self.wordvec)
    # load word weights
    word2weight = data_io.getWordWeight(self.wordfreq, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, w, sents_no_ind = data_io.sentences2idx2(sentences, words, weight4ind)
    # weighted average of word vectors; the index and weight arrays are
    # large, so free them right away
    emb = SIF_embedding.get_weighted_average2(We, x, w)
    del x, w
    gc.collect()
    # principal component(s) to remove when finishing the SIF embedding
    pc = SIF_embedding.get_pc(emb, self.params_.rmpc)
    self.words = words
    self.We = We
    self.weight4ind = weight4ind
    self.pc = pc
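prepare is an instance method, so it only runs on an object that supplies generate_wordvecs(), wordvec, wordfreq, sent_len_limit, and params_. A minimal sketch of such a host class; the class name, attribute values, and the generate_wordvecs stub are assumptions for illustration, not from the original source:

from SIF import params


class SIFSentenceEncoder:
    def __init__(self, wordvec_path, wordfreq_path):
        self.wordvec = wordvec_path    # word-vector file read by getWordmap
        self.wordfreq = wordfreq_path  # word-frequency file read by getWordWeight
        self.sent_len_limit = 50       # placeholder: max tokens kept per sentence
        self.params_ = params.params()
        self.params_.rmpc = 1          # remove the first principal component

    def generate_wordvecs(self):
        pass  # assumed hook: produce the file behind self.wordvec

    prepare = prepare  # attach the method defined above


# encoder = SIFSentenceEncoder('vectors.txt', 'vocab.txt')
# encoder.prepare([['first sentence', 'second sentence']])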
Example #2

if params.clip == 0:
    params.clip = None
params.minval = args.minval
params.maxval = args.maxval
if args.nonlinearity:
    # map the integer CLI flag to a lasagne nonlinearity
    if args.nonlinearity == 1:
        params.nonlinearity = lasagne.nonlinearities.linear
    elif args.nonlinearity == 2:
        params.nonlinearity = lasagne.nonlinearities.tanh
    elif args.nonlinearity == 3:
        params.nonlinearity = lasagne.nonlinearities.rectify
    elif args.nonlinearity == 4:
        params.nonlinearity = lasagne.nonlinearities.sigmoid

# load data
(words, We) = data_io.getWordmap(params.wordfile)
if args.task == "sim" or args.task == "ent":
    train_data = data_io.getSimEntDataset(params.traindata, words, params.task)
elif args.task == "sentiment":
    train_data = data_io.getSentimentDataset(params.traindata, words)
else:
    raise ValueError('Task should be ent, sim, or sentiment.')

# load weight
if params.weightfile:
    word2weight = data_io.getWordWeight(params.weightfile, params.weightpara)
    params.weight4ind = data_io.getWeight(words, word2weight)
    print('word weights computed using parameter a=' + str(params.weightpara))
else:
    params.weight4ind = []
if params.npc > 0:
    # the example is truncated here in the source; this branch presumably
    # gates the principal-component removal step sketched below
    pass
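The branch above is cut off in the source, but the step it gates is the second half of the SIF scheme: removing the top principal component(s) from the matrix of averaged sentence vectors. A minimal sketch of that step, assuming numpy arrays and scikit-learn; the helper name is illustrative, not this library's API:

import numpy as np
from sklearn.decomposition import TruncatedSVD


def remove_principal_components(emb, npc=1):
    # fit the top `npc` principal directions of the embedding matrix
    # (one row per sentence) and project them out
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(emb)
    pc = svd.components_  # shape (npc, dim), orthonormal rows
    return emb - emb.dot(pc.T).dot(pc)


# e.g. cleaned = remove_principal_components(np.random.rand(100, 300), npc=1)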
Example #3

import sys
sys.path.append('../src')
from SIF import data_io, params, SIF_embedding

# input
wordfile = '../data/glove.840B.300d.txt'  # word vector file, can be downloaded from GloVe website
weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1  # number of principal components to remove in SIF weighting scheme
sentences = [
    'this is an example sentence',
    'this is another sentence that is slightly longer'
]

# load word vectors
(words, We) = data_io.getWordmap(wordfile)
# load word weights
word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
# load sentences
# x is the array of word indices, m is the binary mask indicating whether
# there is a word in that location
x, m = data_io.sentences2idx(sentences, words)
w = data_io.seq2weight(x, m, weight4ind)  # get word weights

# set parameters
params = params.params()
params.rmpc = rmpc
# get SIF embedding
embedding = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i, :] is the embedding for sentence i
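For reference, the weighting that getWordWeight loads comes from the SIF paper: a word w with relative corpus frequency p(w) gets weight a / (a + p(w)), where a is the weightpara above. A toy illustration; the function name and counts are made up:

def sif_weight(count, total_count, a=1e-3):
    # a / (a + p(w)): frequent words are down-weighted toward 0,
    # rare words keep weights close to 1
    p = count / total_count
    return a / (a + p)


# a word seen 50,000 times in a 1,000,000-token corpus:
# sif_weight(50000, 1000000) == 1e-3 / (1e-3 + 0.05) ≈ 0.0196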
Example #4

from scipy.spatial import distance

from SIF import data_io, params, SIF_embedding

glove_file = 'vectors.txt'
word_frequency_path = 'vocab.txt'
weight_params = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1  # number of principal components to remove in SIF weighting scheme

(Word2Indx, Word2vector) = data_io.getWordmap(glove_file)
Word2Weight = data_io.getWordWeight(word_frequency_path, weight_params)
Index2Weight = data_io.getWeight(Word2Indx, Word2Weight)

def embedding_sentence_cosine_similarity(s1, s2):
    # word_idx_seq_of_sentence is the array of word indices; mask is the
    # binary mask indicating whether there is a word in that location
    word_idx_seq_of_sentence, mask = data_io.sentences2idx([s1, s2], Word2Indx)
    print(s1, s2)
    print('word_idx_seq_of_sentence:', word_idx_seq_of_sentence)
    print('mask:', mask)
    word_weight_of_sentence = data_io.seq2weight(word_idx_seq_of_sentence, mask, Index2Weight)  # get word weights
    # set parameters
    param = params.params()
    param.rmpc = rmpc
    embedding = SIF_embedding.SIF_embedding(Word2vector, word_idx_seq_of_sentence, word_weight_of_sentence, param)
    s1_embed = embedding[0]
    s2_embed = embedding[1]
    # scipy's distance.cosine returns the cosine *distance* (1 - similarity),
    # so convert it to a similarity before returning
    return 1 - distance.cosine(s1_embed, s2_embed)
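
A hypothetical call, reusing the demo sentences from Example #3:

sim = embedding_sentence_cosine_similarity(
    'this is an example sentence',
    'this is another sentence that is slightly longer')
print('cosine similarity: %.4f' % sim)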