# assumes module-level: import numpy as np; data_io and SIF_embedding from the SIF reference implementation
def get_sif(self, s1, s2):
    """Cross-lingual SIF similarity between an English sentence s1 and a German sentence s2."""
    if len(s1) == 0 or len(s2) == 0:
        return np.nan  # pd.np is removed in modern pandas; use numpy directly
    # sentences2idx expects a list of sentences; list(s1.lower()) would split
    # the string into characters, so wrap the lowercased sentence instead.
    s1 = [s1.lower()]
    s2 = [s2.lower()]
    # English data
    en_x, en_m = data_io.sentences2idx(s1, self.en_words)
    en_w = data_io.seq2weight(en_x, en_m, self.en_weight4ind)
    en_embedding = SIF_embedding.SIF_embedding(self.en_We, en_x, en_w,
                                               self.parameters)[0]
    # German data
    de_x, de_m = data_io.sentences2idx(s2, self.de_words)
    de_w = data_io.seq2weight(de_x, de_m, self.de_weight4ind)
    de_embedding = SIF_embedding.SIF_embedding(self.de_We, de_x, de_w,
                                               self.parameters)[0]
    # An all-zero embedding means no in-vocabulary tokens; similarity is undefined.
    if np.count_nonzero(en_embedding) == 0 or np.count_nonzero(de_embedding) == 0:
        return -1
    return self.cosine_similarity(en_embedding, de_embedding)
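# get_sif calls self.cosine_similarity, which is not shown above; a minimal
# NumPy sketch of such a method (an assumption, not the original implementation):
def cosine_similarity(self, u, v):
    """Cosine similarity between two 1-D vectors; 0.0 if either is all zeros."""
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom != 0 else 0.0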
def get_sentences_embedding(sentences):
    """
    param: sentences: list of sentence strings (whitespace-tokenized)
    return: embedding: ndarray, shape (n_samples, vector_space_dim)

    Relies on module-level state: words2index, weight4ind, words_embedding,
    rm_pc, and the SIF modules data_io, sparams, SIF_embedding.
    """
    sequence_matrix, mask_matrix = data_io.sentences2idx(sentences, words2index)
    weight_matrix = data_io.seq2weight(sequence_matrix, mask_matrix, weight4ind)
    params = sparams.params()
    params.rmpc = rm_pc  # number of principal components to remove
    embedding = SIF_embedding.SIF_embedding(words_embedding, sequence_matrix,
                                            weight_matrix, params)
    return embedding
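# One plausible initialization of those module-level globals, mirroring the
# loading sequence in sif_embedding below; the module layout and file paths
# are placeholders, not part of the original code:
import data_io, params as sparams, SIF_embedding  # SIF reference implementation

words2index, words_embedding = data_io.getWordmap('glove.840B.300d.txt')
word2weight = data_io.getWordWeight('enwiki_vocab_min200.txt', 1e-3)
weight4ind = data_io.getWeight(words2index, word2weight)
rm_pc = 1  # remove the first principal component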
# assumes module-level: import data_io, params, SIF_embedding; wordfile and weightfile globals
def sif_embedding(sents):
    """
    func: compute SIF sentence vectors for the list sents
    param: sents - list of word-segmented (tokenized) sentences
    return: sentence embedding matrix, one row per sentence
    """
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in the SIF weighting scheme (whether to drop the top component)
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m = data_io.sentences2idx(sents, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    param = params.params()
    param.rmpc = rmpc
    embedding = SIF_embedding.SIF_embedding(We, x, w, param)  # embedding[i, :] is the embedding for sentence i
    return embedding
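# Example call (assumes the wordfile/weightfile globals point at valid word
# vectors and a word-frequency file; sentences are pre-tokenized with spaces):
sents = ['this is an example sentence', 'this is another one']
emb = sif_embedding(sents)  # emb.shape == (2, dim), e.g. dim == 300 for 300-d vectors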
import numpy as np
import senteval.utils  # provides dotdict; SIF_embedding from the SIF reference implementation


def arora(word_vectors, term_frequencies, a=.001):
    """
    Aggregates a bag of word vectors into a single vector per sentence using
    "A Simple but Tough-to-Beat Baseline for Sentence Embeddings", Arora et al., 2017.

    Since word_vectors contains the vectors of all sentences in one 2-D array,
    the sentence boundaries must be supplied via the term_frequencies array.

    :param a: smoothing parameter a (default is 0.001)
    :param word_vectors: list[n, dim]: ordered word vectors for token n (across all sentences)
    :param term_frequencies: list[i, n]: ordered term frequencies for sentence i and token n within that sentence
    :return: [i, :] sentence embedding for sentence i
    """
    if type(word_vectors) is not list:
        raise TypeError('word_vectors must be a list of shape [n, dim]')
    if type(term_frequencies) is not list:
        raise TypeError('term_frequencies must be a list of shape [i, n]')
    num_sentences = len(term_frequencies)
    longest_sentence_count = max(len(sentence) for sentence in term_frequencies)
    term_weights = np.zeros((num_sentences, longest_sentence_count))
    # SIF expects one large word-vector array, so the index matrix must address
    # that array across all sentences: tokens are numbered consecutively.
    indices = np.zeros((num_sentences, longest_sentence_count), dtype=int)  # np.int is removed in NumPy >= 1.24
    index = 0
    for sentence_index, sentence_term_frequencies in enumerate(term_frequencies):
        for token_index, token_frequency in enumerate(sentence_term_frequencies):
            # SIF weight: w = a / (a + p(w)), with p(w) the token's corpus frequency
            term_weights[sentence_index, token_index] = a / (a + token_frequency)
            indices[sentence_index, token_index] = index
            index += 1
    params = senteval.utils.dotdict({'rmpc': 1})  # remove the first principal component
    word_vectors = np.asarray(word_vectors, dtype=np.float64)
    embeddings = SIF_embedding.SIF_embedding(word_vectors, indices, term_weights, params)
    return embeddings
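# Toy example: two sentences of two and three tokens. word_vectors stacks all
# five token vectors in order; term_frequencies holds the per-token corpus
# frequencies (the numbers here are made up for illustration):
word_vectors = np.random.rand(5, 300).tolist()          # 5 tokens, 300-d each
term_frequencies = [[0.01, 0.002], [0.03, 0.001, 0.004]]
sentence_embeddings = arora(word_vectors, term_frequencies)
print(sentence_embeddings.shape)                        # (2, 300)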