Example #1
    def avg_embed(self, train_data, body_dict, threshold):
        '''
        :param
        train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
        body_dict : a dictionary of values containing {bodyID:'bodyText'}
        threshold : used to distinguish between similar and not similar
        '''
        # Load embeddings
        embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                    data_path=self.embeddData,
                                    vocab_size=self.vocab_size,
                                    embedding_size=self.embedding_size)

        # Align body text in a workable format
        bodyText_list = list(body_dict.values())
        bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

        # Tokenize every body once, dropping stopwords
        bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]

        unrelated, related, y_true, y_pred = [], [], [], []

        for headline, bodyID, stance in train_data:
            headline_w = sent2stokens_wostop(headline)
            body_w = bodyText_w[bodyIds_index[bodyID]]

            # Cosine similarity between the averaged embeddings of headline and body
            sim = avg_embedding_similarity(embeddings, self.embedding_size,
                                           ' '.join(headline_w),
                                           ' '.join(body_w))

            # Bucket the score and record gold/predicted labels
            unrelated, related, y_true, y_pred = create_lists(
                sim, stance, threshold, [unrelated, related, y_true, y_pred])

        print_results([unrelated, related, y_true, y_pred], self.model_type)
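
For context, a minimal usage sketch of avg_embed (everything below is made up for illustration: the rows, body IDs, and threshold; `model` stands for an instance whose constructor supplies embeddPath, embeddData, vocab_size, and embedding_size):

# Hypothetical call; data shapes follow the docstring above.
train_data = [
    ('Police find mass graves', '712', 'unrelated'),   # (headline, bodyID, stance)
    ('Hundreds of bodies found', '158', 'discuss'),
]
body_dict = {
    '712': 'A committee says it found no evidence of mass graves ...',
    '158': 'Officials confirmed that hundreds of bodies were found ...',
}
model.avg_embed(train_data, body_dict, threshold=0.8)  # threshold is illustrative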
Example #2
    def doc2vec_similarity_max(self, train_data, body_dict, threshold):
        '''
        :param
        train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
        body_dict : a dictionary of values containing {bodyID:'bodyText'}
        threshold : used to distinguish between similar and not similar
        '''
        # Load embeddings
        logging.info('Load embeddings: Vocab-Size: ' + str(self.vocab_size) +
                     ' Embedding size: ' + str(self.embedding_size))

        embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                    data_path=self.embeddData,
                                    vocab_size=self.vocab_size,
                                    embedding_size=self.embedding_size)

        # Align body-text in workable format
        bodyText_list = list(body_dict.values())
        bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

        unrelated, related, y_true, y_pred = [], [], [], []
        sentence_list = []

        for headline, bodyID, stance in train_data:
            logging.info("Headline: " + headline)
            score = 0
            bodyText = bodyText_list[bodyIds_index[bodyID]]
            sentence_list = text2sent(bodyText)
            # logging.info("Bodytext: " + bodyText)

            for sentence in sentence_list:
                #logging.info("Sentence: " + sentence)
                # Compare headline and sentence directly as text; vectors are not
                # needed here. Note: avg_embedding_similarity tokenizes and
                # lemmatizes both inputs itself, so no prior preprocessing
                # (tokenization, stopword removal) is required.
                temp_score = avg_embedding_similarity(embeddings,
                                                      self.embedding_size,
                                                      headline, sentence)
                #logging.info("Similarity: " + str(temp_score))

                # store the highest similarity score
                score = max(score, temp_score)

            # Assess the headline-body pair as related or unrelated based on the
            # threshold, using the highest per-sentence similarity
            unrelated, related, y_true, y_pred = create_lists(
                score, stance, threshold, [unrelated, related, y_true, y_pred])

            # The following lines are only for manual cross-checks
            if score <= threshold:
                calculated_stance = "unrelated"
            else:
                calculated_stance = "related"

            logging.info(
                "Best score for this headline - sentence similarity: " +
                str(score))
            logging.info("Real/calculated stance: " + stance + " / " +
                         calculated_stance)

        print_results([unrelated, related, y_true, y_pred], self.model_type)
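
The helper create_lists is not part of this listing; below is a plausible sketch of its contract, inferred from the call sites and the manual cross-check above (the exact bookkeeping is an assumption, not the project's actual code):

def create_lists(score, stance, threshold, lists):
    # Hypothetical reconstruction: record gold vs. predicted labels and
    # bucket the score by the gold stance.
    unrelated, related, y_true, y_pred = lists
    y_true.append(stance)
    y_pred.append('related' if score > threshold else 'unrelated')
    (unrelated if stance == 'unrelated' else related).append(score)
    return unrelated, related, y_true, y_pred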
Example #3
    def word_mover_distance_similarity(self, train_data, body_dict, threshold,
                                       type):
        '''
        :param
        train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
        body_dict : a dictionary of values containing {bodyID:'bodyText'}
        threshold : used to distinguish between similar and not similar
        type : 'sentence'|'wholeText' - compute the distance per body sentence or against the whole body text
        '''
        # Load embeddings
        #logging.info('Load embeddings: Vocab-Size: ' + str(self.vocab_size) + ' Embedding size: ' + str(self.embedding_size))

        embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                    data_path=self.embeddData,
                                    vocab_size=self.vocab_size,
                                    embedding_size=self.embedding_size)

        # Align body-text in workable format
        bodyText_list = list(body_dict.values())
        bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

        unrelated, related, y_true, y_pred = [], [], [], []
        sentence_list = []

        for headline, bodyID, stance in train_data:
            #logging.info("Headline: " + headline)

            distance = float('inf')  # track the lowest distance seen
            bodyText = bodyText_list[bodyIds_index[bodyID]]
            sentence_list = text2sent(bodyText)
            #logging.info("Bodytext: " + bodyText)
            if type == "sentence":
                for sentence in sentence_list:
                    #logging.info("Sentence: " + sentence)
                    temp_distance = abs(
                        computeAverageWMD(embeddings, headline, sentence))

                    # keep the lowest distance
                    distance = min(distance, temp_distance)

                    # Note: the distance is not normalized!
            elif type == "wholeText":
                distance = abs(
                    computeAverageWMD(embeddings, headline, bodyText))

            unrelated, related, y_true, y_pred = create_lists_distance_based(
                distance, stance, threshold,
                [unrelated, related, y_true, y_pred])
            # Manual cross-check: a low distance means related
            if distance <= threshold:
                calculated_stance = "related"
            else:
                calculated_stance = "unrelated"

            #logging.info("Best word_mover_distance for this headline - body combination: " + str(distance))
            #logging.info("Real/calculated stance: " + stance + " / " + calculated_stance)

        print_results_distance_based([unrelated, related, y_true, y_pred],
                                     self.model_type)
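
A hedged usage sketch for the two type modes (the threshold value is purely illustrative; as the comment above notes, WMD is not normalized, so a usable threshold has to be tuned on the data):

# Hypothetical calls: 'sentence' keeps the minimum WMD over body sentences,
# 'wholeText' computes a single WMD against the entire body text.
model.word_mover_distance_similarity(train_data, body_dict, threshold=1.2, type='sentence')
model.word_mover_distance_similarity(train_data, body_dict, threshold=1.2, type='wholeText')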
Example #4
def load_embeddings(headlines, bodies):
    # embedding parameters:
    embedding_size = 300
    vocab_size = 3000000
    embeddPath = "%s/data/embeddings/google_news/GoogleNews-vectors-negative300.bin.gz" % (
        path.dirname(path.dirname(path.dirname(path.abspath(__file__)))))
    embeddData = path.normpath("%s/data/" %
                               (path.dirname(path.abspath(embeddPath))))
    binary_val = True
    embeddings = LoadEmbeddings(filepath=embeddPath,
                                data_path=embeddData,
                                vocab_size=vocab_size,
                                embedding_size=embedding_size,
                                binary_val=binary_val)
    # print('Loaded embeddings: Vocab-Size: ' + str(vocab_size) + ' \n Embedding size: ' + str(embedding_size))
    return embedding_size, embeddings
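
A call-site sketch (hypothetical): headlines and bodies are accepted but unused in the body above, presumably to keep the signature symmetric with other feature extractors:

# Returns the fixed embedding size (300) together with the loaded Google News vectors.
embedding_size, embeddings = load_embeddings(headlines, bodies)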
Example #5
import numpy as np
from scipy import spatial

def avg_feature_vector(sentence, model, num_features):
    # The listing is truncated here; this head is reconstructed so the fragment
    # runs. The signature follows the calls in avg_embedding_similarity below;
    # isKnown is an assumed method of the embeddings wrapper, mirroring the
    # word2embedd calls that the original fragment already uses.
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in sentence.split():
        nwords += 1
        if model.isKnown(word):
            featureVec = np.add(featureVec, model.word2embedd(word))
        else:
            featureVec = np.add(featureVec, model.word2embedd(u"unknown"))

    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

def avg_embedding_similarity(embeddings, embedding_size, sent1, sent2):
    #print("Calculating similarity for: " + sent1 + "\n and\n" + sent2)
    v1 = avg_feature_vector(sent1, model=embeddings, num_features=embedding_size)
    v2 = avg_feature_vector(sent2, model=embeddings, num_features=embedding_size)
    cosine_distance = spatial.distance.cosine(v1, v2)
    score = 1 - cosine_distance
    #print("Score = " + str(score))
    return score

if __name__ == "__main__":
    sent1 = "United States of America"
    sent2 = "USA"
    data_path = myConstants.BASE_DIR + "/data/embeddings"
        
    embeddPath = os.path.normpath("%s/google_news/GoogleNews-vectors-negative300.bin.gz" % (data_path))
    embeddData = os.path.normpath("%s/google_news/data/" % (data_path))
    vocab_size = 3000000
    embedding_size = 300
    
    embeddings = LoadEmbeddings(filepath=embeddPath, data_path=embeddData, vocab_size=vocab_size, embedding_size=embedding_size)
    score = avg_embedding_similarity(embeddings, embedding_size, sent1, sent2)
    print(score)
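
To make the score concrete, the same cosine arithmetic on toy vectors, with no embeddings involved (NumPy/SciPy only):

import numpy as np
from scipy import spatial

v1 = np.array([1.0, 0.0, 1.0])
v2 = np.array([1.0, 1.0, 0.0])
# similarity = 1 - cosine distance = (v1 . v2) / (|v1| * |v2|) = 1 / 2
print(1 - spatial.distance.cosine(v1, v2))  # 0.5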