# Example 1 (score: 0)
 def prepare(self, all_sents):
     """Build the SIF model state from a corpus of sentences.

     Loads word vectors and word-frequency weights, computes weighted-average
     embeddings for the whole corpus, extracts the first principal
     component(s), and caches everything needed by `sents_embedding` on self.

     :param all_sents: nested list of sentence strings (list of sentence
         lists); it is flattened before processing.
     Side effects: sets self.words, self.We, self.weight4ind, self.pc.
     """
     # Flatten the nested list of sentences into a single flat list.
     all_sents = [sent for sent_list in all_sents for sent in sent_list]
     # Presumably (re)builds self.wordvec / self.wordfreq files — confirm in class.
     self.generate_wordvecs()
     # Segment each sentence with jieba, truncate to sent_len_limit tokens,
     # and re-join with spaces (the format data_io expects).
     sentences = [
         ' '.join(list(jieba.cut(sent))[:self.sent_len_limit])
         for sent in all_sents
     ]
     # SIF weighting parameter a; typical range is [3e-5, 3e-3].
     weightpara = 1e-3
     # words: token -> index map; We[i, :] is the vector for word i.
     (words, We) = data_io.getWordmap(self.wordvec)
     # load word weights
     word2weight = data_io.getWordWeight(
         self.wordfreq,
         weightpara)  # word2weight['str'] is the weight for the word 'str'
     weight4ind = data_io.getWeight(
         words,
         word2weight)  # weight4ind[i] is the weight for the i-th word
     # load sentences
     # x: word-index matrix, w: per-word weight matrix, sents_no_ind:
     # presumably sentences with no indexable words — TODO confirm in data_io.
     x, w, sents_no_ind = data_io.sentences2idx2(sentences, words,
                                                 weight4ind)
     # Weighted average of word vectors per sentence (pre-PC-removal).
     emb = SIF_embedding.get_weighted_average2(We, x, w)
     # Free the large index/weight matrices before PCA to reduce peak memory.
     del x, w
     gc.collect()
     # First rmpc principal component(s) of the corpus embeddings; reused
     # later by sents_embedding to project them out of new sentences.
     pc = SIF_embedding.get_pc(emb, self.params_.rmpc)
     self.words = words
     self.We = We
     self.weight4ind = weight4ind
     self.pc = pc
# Example 2 (score: 0)
 def sents_embedding(self, sentences):
     """Embed *sentences* with the SIF model prepared by `prepare`.

     :param sentences: iterable of raw sentence strings.
     :return: (embedding matrix, sentences that had no indexable words).
     """
     # Segment with jieba, cap at sent_len_limit tokens, re-join with spaces.
     segmented = []
     for raw in sentences:
         tokens = list(jieba.cut(raw))[:self.sent_len_limit]
         segmented.append(' '.join(tokens))
     # Map to word indices and per-word weights using the cached vocabulary.
     idx_mat, weight_mat, sents_no_ind = data_io.sentences2idx2(
         segmented, self.words, self.weight4ind)
     # Weighted average of word vectors, then remove the cached principal
     # component(s) computed over the training corpus.
     averaged = SIF_embedding.get_weighted_average2(self.We, idx_mat,
                                                    weight_mat)
     final = SIF_embedding.embedding_remove_pc(averaged, self.pc,
                                               self.params_.rmpc)
     return final, sents_no_ind
# Example 3 (score: 0)
def weighted_average_sim_rmpc(We, x1, x2, w1, w2, params):
    """
    Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component
    :param We: We[i,:] is the vector for word i
    :param x1: x1[i, :] are the indices of the words in the first sentence in pair i
    :param x2: x2[i, :] are the indices of the words in the second sentence in pair i
    :param w1: w1[i, :] are the weights for the words in the first sentence in pair i
    :param w2: w2[i, :] are the weights for the words in the second sentence in pair i
    :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component
    :return: scores, scores[i] is the matching score of the pair i
    """
    # SIF embedding of each side of the pair (weighted average, optionally
    # with the first rmpc principal component(s) removed).
    emb1 = SIF_embedding.SIF_embedding(We, x1, w1, params)
    emb2 = SIF_embedding.SIF_embedding(We, x2, w2, params)

    # Row-wise cosine similarity: dot product over the product of norms.
    # NOTE(review): a zero-vector embedding would yield a 0/0 -> nan score.
    inn = (emb1 * emb2).sum(axis=1)
    emb1norm = np.sqrt((emb1 * emb1).sum(axis=1))
    emb2norm = np.sqrt((emb2 * emb2).sum(axis=1))
    scores = inn / emb1norm / emb2norm
    return scores
# Example 4 (score: 0)
def embeding_sentence_cosine_similarity(s1, s2):
    """Return the cosine *distance* between the SIF embeddings of s1 and s2.

    Relies on module-level globals: Word2Indx, Index2Weight, Word2vector, rmpc.
    """
    # idx_seq: array of word indices; presence_mask: binary mask indicating
    # whether there is a word in that location.
    idx_seq, presence_mask = data_io.sentences2idx([s1, s2], Word2Indx)
    # Debug traces (kept intact for behavioral parity).
    print(s1, s2)
    print('word_idx_seq_of_sentence')
    print(idx_seq)
    print('mask')
    print(presence_mask)
    # Per-word SIF weights looked up by index.
    weights = data_io.seq2weight(idx_seq, presence_mask, Index2Weight)
    # Configure principal-component removal.
    param = params.params()
    param.rmpc = rmpc
    embeddings = SIF_embedding.SIF_embedding(Word2vector, idx_seq, weights,
                                             param)
    return distance.cosine(embeddings[0], embeddings[1])
# Example 5 (score: 0)
def cosine_distance_by_sentence_vector(s1, s2):
    """Return cosine_similarity between the SIF embeddings of two sentences.

    :param s1: first sentence as an iterable of tokens (joined with spaces).
    :param s2: second sentence as an iterable of tokens.
    Relies on module-level globals: Word2Indx, Index2Weight, Word2vector, rmpc.
    """
    joined = [' '.join(s1), ' '.join(s2)]
    # idx_seq: array of word indices; presence_mask: binary mask indicating
    # whether there is a word in that location.
    idx_seq, presence_mask = data_io.sentences2idx(joined, Word2Indx)
    # Per-word SIF weights looked up by index.
    weights = data_io.seq2weight(idx_seq, presence_mask, Index2Weight)
    # Configure principal-component removal.
    param = params.params()
    param.rmpc = rmpc
    embeddings = SIF_embedding.SIF_embedding(Word2vector, idx_seq, weights,
                                             param)
    return cosine_similarity(embeddings[0], embeddings[1])
# Example 6 (score: 0)
# input
wordfile = '../data/glove.840B.300d.txt'  # word vector file, can be downloaded from GloVe website
weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1  # number of principal components to remove in SIF weighting scheme
sentences = [
    'this is an example sentence',
    'this is another sentence that is slightly longer'
]

# load word vectors
(words, We) = data_io.getWordmap(wordfile)
# load word weights
word2weight = data_io.getWordWeight(
    weightfile,
    weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(
    words, word2weight)  # weight4ind[i] is the weight for the i-th word
# load sentences
x, m = data_io.sentences2idx(
    sentences, words
)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
w = data_io.seq2weight(x, m, weight4ind)  # get word weights

# set parameters
# NOTE: bound to `param` (not `params`) so the `params` module it is created
# from is not shadowed; this also matches the naming used elsewhere in this file.
param = params.params()
param.rmpc = rmpc
# get SIF embedding
embedding = SIF_embedding.SIF_embedding(
    We, x, w, param)  # embedding[i,:] is the embedding for sentence i