Example #1
    def get_sif(self, s1, s2):
        if len(s1) == 0 or len(s2) == 0:
            return np.nan

        # sentences2idx expects a list of sentence strings; list(s.lower())
        # would split the sentence into single characters, so wrap each
        # lowercased sentence in a one-element list instead.
        s1 = [s1.lower()]
        s2 = [s2.lower()]

        ## English data
        en_x, en_m = data_io.sentences2idx(s1, self.en_words)
        en_w = data_io.seq2weight(en_x, en_m, self.en_weight4ind)
        en_embedding = SIF_embedding.SIF_embedding(self.en_We, en_x, en_w,
                                                   self.parameters)
        en_embedding = en_embedding[0]

        ## German data
        de_x, de_m = data_io.sentences2idx(s2, self.de_words)
        de_w = data_io.seq2weight(de_x, de_m, self.de_weight4ind)
        de_embedding = SIF_embedding.SIF_embedding(self.de_We, de_x, de_w,
                                                   self.parameters)
        de_embedding = de_embedding[0]

        # An all-zero embedding means no token was found in the vocabulary,
        # so a cosine score would be undefined.
        if np.count_nonzero(en_embedding) == 0 or np.count_nonzero(
                de_embedding) == 0:
            return -1

        score = self.cosine_similarity(en_embedding, de_embedding)
        return score
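The method also assumes a cosine_similarity helper on the same class, which the snippet does not show. A minimal sketch of what such a helper might look like in plain NumPy (the name and placement come from the call above; the body is an assumption):

    def cosine_similarity(self, v1, v2):
        # Cosine of the angle between two embedding vectors; assumes both
        # are non-zero, which the caller guarantees via count_nonzero.
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))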
Example #2
# Relies on module-level globals: words2index, weight4ind, rm_pc and
# words_embedding, plus data_io, SIF_embedding and params (imported as
# sparams) from the PrincetonML/SIF source tree.
def get_sentences_embedding(sentences):
    """
    :param sentences: list of sentence strings to embed
    :return: embedding: ndarray, shape (n_samples, vector_space_dim)
    """

    # Map sentences to word-index and mask matrices, then to SIF weights.
    sequence_matrix, mask_matrix = data_io.sentences2idx(
        sentences, words2index)
    weight_matrix = data_io.seq2weight(sequence_matrix, mask_matrix,
                                       weight4ind)
    params = sparams.params()
    params.rmpc = rm_pc  # number of principal components to remove

    embedding = SIF_embedding.SIF_embedding(words_embedding, sequence_matrix,
                                            weight_matrix, params)
    return embedding
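A hypothetical call, assuming the module-level globals have already been populated from a word-vector file and a word-frequency file:

sentences = ['this is an example sentence', 'another short sentence']
embedding = get_sentences_embedding(sentences)
print(embedding.shape)  # (2, vector_space_dim), one row per input sentence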
Example #3
import data_io, params, SIF_embedding  # modules from the PrincetonML/SIF source tree


def sif_embedding(sents):
    """
    func: compute a sentence vector for each entry of the list sents
    param: sents - list of word-segmented sentences
    return: list of sentence vectors
    """
    weightpara = 1e-3  # the parameter a in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove (here: strip the top principal component)
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m = data_io.sentences2idx(sents, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    param = params.params()
    param.rmpc = rmpc
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, param)  # embedding[i,:] is the embedding for sentence i
    return embedding
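wordfile and weightfile are module-level paths that this snippet leaves undefined; in the PrincetonML/SIF reference code they point to a word-vector file and a word-frequency file. A hypothetical setup and call (both paths are placeholders):

wordfile = 'data/glove.840B.300d.txt'  # hypothetical path to GloVe vectors
weightfile = 'auxiliary_data/enwiki_vocab_min200.txt'  # hypothetical path to word counts

sents = ['the quick brown fox', 'a lazy dog']  # word-segmented, space-joined
emb = sif_embedding(sents)  # ndarray with one row per sentence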
Example #4
import numpy as np
import senteval.utils  # provides the dotdict helper used below
import SIF_embedding  # from the PrincetonML/SIF source tree


def arora(word_vectors, term_frequencies, a=.001):
    """
    Aggregates a bag of word vectors into a single vector per sentence using
    “A Simple but Tough-to-Beat Baseline for Sentence Embeddings.” - Arora et al. - 2017
    Since word_vectors contains the vectors for all sentences in one 2d array,
    the sentence boundaries must be derived from the term_frequencies array.
    :param a: smoothing parameter a (default is 0.001)
    :param word_vectors: list[n, dim]: ordered word vectors for word n (across all sentences)
    :param term_frequencies: list[i, n]: ordered term frequencies for sentence i and token n within that sentence
    :return: [i, :] sentence embeddings for sentence i
    """

    if not isinstance(word_vectors, list):
        raise TypeError('word_vectors must be a list of shape [n, dim]')
    if not isinstance(term_frequencies, list):
        raise TypeError('term_frequencies must be a list of shape [i, n]')
    num_sentences = len(term_frequencies)
    longest_sentence_count = max(
        [len(sentence) for sentence in term_frequencies])
    term_weights = np.zeros((num_sentences, longest_sentence_count))
    # SIF_embedding expects one large word-vector array, so the indices must
    # address that array across all sentences.
    indices = np.zeros((num_sentences, longest_sentence_count), dtype=int)
    index = 0
    for sentence_index, sentence_term_frequencies in enumerate(
            term_frequencies):
        for token_index, token_frequency in enumerate(
                sentence_term_frequencies):
            # SIF weight for the token: a / (a + frequency), downweighting
            # frequent words.
            term_weights[sentence_index,
                         token_index] = a / (a + token_frequency)
            indices[sentence_index, token_index] = index
            index += 1
    # Remove the first principal component, as prescribed by the SIF scheme.
    params = senteval.utils.dotdict({'rmpc': 1})
    word_vectors = np.asarray(word_vectors, dtype=np.float64)
    embeddings = SIF_embedding.SIF_embedding(word_vectors, indices,
                                             term_weights, params)
    return embeddings
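A toy invocation with made-up numbers, showing how the two arguments line up: word_vectors holds every token's vector in sentence order, while term_frequencies mirrors that order sentence by sentence:

word_vectors = [[0.1, 0.2, 0.0, 0.5],   # sentence 0, token 0
                [0.3, 0.1, 0.4, 0.0],   # sentence 0, token 1
                [0.0, 0.2, 0.2, 0.1],   # sentence 0, token 2
                [0.5, 0.0, 0.1, 0.3],   # sentence 1, token 0
                [0.2, 0.2, 0.0, 0.4]]   # sentence 1, token 1
term_frequencies = [[0.010, 0.002, 0.030],  # sentence 0
                    [0.004, 0.010]]         # sentence 1
embeddings = arora(word_vectors, term_frequencies)  # ndarray of shape (2, 4)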