def walid_similarity_query(self, answer: str, key: str):
        if len(answer) == 0 or len(key) == 0:
            return False

        if self.model_ready:
            documents = [answer, key]

            if self.verbose:
                print(
                    f'{len(documents)} documents loaded and ready to preprocess'
                )

            corpus = [self.preprocess(document) for document in documents]

            if self.verbose:
                print(f'{len(corpus)} documents loaded into corpus')

            dictionary = Dictionary(corpus)
            tfidf = TfidfModel(dictionary=dictionary)
            similarity_matrix = SparseTermSimilarityMatrix(
                self.similarity_index, dictionary, tfidf)

            answer_bow = dictionary.doc2bow(self.preprocess(answer))
            key_bow = dictionary.doc2bow(self.preprocess(key))

            # Measure soft cosine similarity
            scores = similarity_matrix.inner_product(answer_bow,
                                                     key_bow,
                                                     normalized=True)

            return scores

        else:
            raise NotReadyError('Word embedding model is not ready.')
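
A hypothetical call, shown only to illustrate the expected inputs and output of the method above; the enclosing class, its constructor, and its preprocess() implementation are not part of this snippet:

# grader = SimilarityGrader(...)  # hypothetical enclosing class that loads the
#                                 # embedding model and defines preprocess()
# score = grader.walid_similarity_query('the cat sat on the mat',
#                                       'a cat was sitting on a mat')
# print('soft cosine similarity = %.4f' % score)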
Example #2
File: asn3.py Project: dddd999/asn3
def W2VH():
    # Collect the lowercased tokens from the Brown 'mystery' category.
    docbrown = [w.lower() for w in brown.words(categories='mystery')]

    # Split the token list into two halves.
    half = len(docbrown) // 2
    docbrown1, docbrown2 = docbrown[:half], docbrown[half:]

    stop_words = stopwords.words('english')
    docbrown1 = [w for w in docbrown1 if w not in stop_words]
    docbrown2 = [w for w in docbrown2 if w not in stop_words]

    documents = [docbrown1, docbrown2]
    dictionary = corpora.Dictionary(documents)

    docbrown1 = dictionary.doc2bow(docbrown1)
    docbrown2 = dictionary.doc2bow(docbrown2)

    # Note: the model is trained on gensim's tiny common_texts corpus, so most
    # Brown tokens are out of vocabulary and contribute nothing to the score.
    model = Word2Vec(common_texts, size=20, min_count=1)  # 'size' became 'vector_size' in gensim >= 4.0
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

    similarity = similarity_matrix.inner_product(docbrown1,
                                                 docbrown2,
                                                 normalized=True)
    print('similarity = %.4f' % similarity)
Example #3
class Similarity:
    def __init__(self):
        self.stop_words = stopwords.words('english')
        self.w2v_model = api.load("glove-wiki-gigaword-50")
        self.similarity_index = WordEmbeddingSimilarityIndex(self.w2v_model)

    def make_document(self, headline, articles):
        temp = []
        headline = [
            w for w in headline.lower().split() if w not in self.stop_words
        ]
        for article in articles:
            article = [
                w for w in article.lower().split() if w not in self.stop_words
            ]
            temp.append(article)
        self.documents = [headline] + temp
        dictionary = corpora.Dictionary(self.documents)
        self.similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index, dictionary)
        headline = dictionary.doc2bow(headline)
        articles = [dictionary.doc2bow(i) for i in temp]
        similarities = []
        for i in articles:
            similarities.append(self.get_similarity(headline, i))
        return similarities

    def get_similarity(self, s1, s2):
        return self.similarity_matrix.inner_product(s1, s2, normalized=True)
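
A short usage sketch for the class above; the headline and article strings are invented for illustration, and the sketch assumes the imports the class relies on (nltk stopwords, the gensim downloader api, corpora, and the gensim similarity classes) are present in the module. The first call downloads the glove-wiki-gigaword-50 vectors if they are not already cached.

if __name__ == '__main__':
    sim = Similarity()
    scores = sim.make_document(
        'Stocks fall as markets react to rate hike',
        ['Equities dropped after the central bank raised interest rates.',
         'The local football team won its third straight match.'])
    # One soft cosine score per article; the first article should score higher.
    print(scores)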
Example #4
File: eval.py Project: LeyliG/ds4se
class Word2VecSeqVect(BasicSequenceVectorization):
    def __init__(self, params):
        super().__init__(params)
        self.new_model = gensim.models.Word2Vec.load(
            params['path_to_trained_model'])
        self.new_model.init_sims(
            replace=True)  # Normalizes the vectors in the word2vec class.
        #Computes cosine similarities between word embeddings and retrieves the closest
        #word embeddings by cosine similarity for a given word embedding.
        self.similarity_index = WordEmbeddingSimilarityIndex(self.new_model.wv)
        #Build a term similarity matrix and compute the Soft Cosine Measure.
        self.similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index, self.dictionary)

        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.WMD: self.wmd_gensim,
            DistanceMetric.SCM: self.scm_gensim
        }

    def wmd_gensim(self, sentence_a, sentence_b):
        wmd = self.new_model.wv.wmdistance(sentence_a, sentence_b)
        return [wmd, self.wmd_similarity(wmd)]

    def wmd_similarity(self, dist):
        return 1. / (1. + float(dist))  #Associated Similarity

    def scm_gensim(self, sentence_a, sentence_b):
        '''Compute SoftCosine Similarity of Gensim'''
        #Convert the sentences into bag-of-words vectors.
        sentence_1 = self.dictionary.doc2bow(sentence_a)
        sentence_2 = self.dictionary.doc2bow(sentence_b)

        #Return the inner product(s) between real vectors / corpora vec1 and vec2 expressed in a non-orthogonal normalized basis,
        #where the dot product between the basis vectors is given by the sparse term similarity matrix.
        scm_similarity = self.similarity_matrix.inner_product(sentence_1,
                                                              sentence_2,
                                                              normalized=True)
        return [1 - scm_similarity, scm_similarity]

    def distance(self, metric_list, link):
        '''Iterate on the metrics'''
        #Computation of sentences can be moved directly to wmd_gensim method if we cannot generalize it for
        #the remaining metrics
        sentence_a = self.df_source[self.df_source['ids'].str.contains(
            link[0])]['text'].values[0].split()
        sentence_b = self.df_target[self.df_target['ids'].str.contains(
            link[1])]['text'].values[0].split()

        dist = [
            self.dict_distance_dispatcher[metric](sentence_a, sentence_b)
            for metric in metric_list
        ]
        logging.info("Computed distances or similarities " + str(link) +
                     str(dist))
        return functools.reduce(lambda a, b: a + b,
                                dist)  #Always return a list
Example #5
    def calculate_soft_cosine_similarity(self, topic_models, sentences, *args,
                                         **kwargs):

        topic_claim_relations = {}
        for topic in topic_models:
            topic_claim_relations[topic] = []

        documents = []
        for topic in topic_models:
            documents.append(topic.lower().split())
        for sentence in sentences:
            documents.append(sentence.lower().split())
        dictionary = corpora.Dictionary(documents)

        w2v_model = api.load("glove-wiki-gigaword-100")
        similarity_index = WordEmbeddingSimilarityIndex(w2v_model)
        similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                       dictionary)

        stop_words = stopwords.words('english')
        for sentence in sentences:
            best_cosine_result = 0
            matched_topic = None
            normal_sentence = sentence

            sentence = [
                w for w in sentence.lower().split() if w not in stop_words
            ]
            sentence_bow = dictionary.doc2bow(sentence)

            for topic in topic_models:
                topic_model = [
                    w for w in topic.lower().split() if w not in stop_words
                ]
                topic_model_bow = dictionary.doc2bow(topic_model)

                similarity = similarity_matrix.inner_product(topic_model_bow,
                                                             sentence_bow,
                                                             normalized=True)
                print('similarity = %.4f' % similarity)

                if similarity > best_cosine_result:
                    best_cosine_result = similarity
                    matched_topic = topic

            if best_cosine_result > 0.3:
                topic_claim_relations[matched_topic].append(normal_sentence)

        return topic_claim_relations
Example #6
def calculate_softcosine_w2v(test_data):
    data = [i.split() for i in (test_data.text).tolist()]
    dictionary = corpora.Dictionary(data)
    corpus = [dictionary.doc2bow(d) for d in data]

    similarity_index = WordEmbeddingSimilarityIndex(w2v_model)
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                   dictionary)

    softsim_w2v_matrix = np.empty(shape=(len(data), len(data))) * np.nan
    for d1 in range(0, len(data)):
        for d2 in range(0, len(data)):
            softsim_w2v_matrix[d1, d2] = similarity_matrix.inner_product(
                corpus[d1], corpus[d2], normalized=True)

    doc_sim_max_index, doc_sim_max_values = calculate_max_similarity(
        softsim_w2v_matrix)
    softsim_w2v_df = export_result(test_data, doc_sim_max_index,
                                   doc_sim_max_values, 'softsim_w2v')
    print("Soft cosine similarity using w2v vectors has been calculated.")
    return softsim_w2v_df
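
A hypothetical call, for illustration only; it assumes w2v_model (gensim word vectors), calculate_max_similarity, and export_result are defined elsewhere in the module, and the DataFrame contents are invented:

# import pandas as pd
# test_data = pd.DataFrame({'text': ['the cat sat on the mat',
#                                    'a cat was sitting on a mat',
#                                    'stock prices fell sharply']})
# softsim_w2v_df = calculate_softcosine_w2v(test_data)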
Example #7
    corpus_neg_reviews = [neg_dictionary.doc2bow(text) for text in list(df_negative_sentences['review_sentence_cleaned'])]
    corpus_neg_topics = [neg_dictionary.doc2bow(text) for text in topics_cleaned]

    # build similarity matrix of word embeddings
    print('Building similarity matrix of word embeddings. Might take a few minutes...')
    termsim_index = WordEmbeddingSimilarityIndex(fasttext_model300)
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index,neg_dictionary)
    print('done')

    # compute soft cosine similarity between sentences and topics
    print('Computing soft cosine similarity between sentences and topics. Might take a few minutes...')
    neg_data_topics = []
    for review_item in corpus_neg_reviews:
        review_item_topics = []
        for topic in corpus_neg_topics:
            review_item_topics.append(similarity_matrix.inner_product(review_item,topic,normalized=True))
        neg_data_topics.append(review_item_topics)
    print('done')    

    # extract topic with highest soft cosine similarity
    # I set a minimum threshold (0.10) that needs to be reached in order to assign a topic.
    # If above-threshold topics are within 0.01, I assign -1 (i.e. no main topic)  
    neg_data_closest_topic = []
    cossim_threshold = 0.10

    for review_item_topic_list in neg_data_topics:
        if max(review_item_topic_list)>cossim_threshold:
            review_item_array = np.array(review_item_topic_list)
            sorted_review_item_array = sorted(review_item_array,reverse=True)
            num_topics=1
            for item in sorted_review_item_array[1:]:
Example #8
class Word2VecSeqVect(BasicSequenceVectorization):

    def __init__(self, params):
        super().__init__(params)
        self.new_model = gensim.models.Word2Vec.load( params['path_to_trained_model'] )
        self.new_model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.
        #Computes cosine similarities between word embeddings and retrieves the closest
        #word embeddings by cosine similarity for a given word embedding.
        self.similarity_index = WordEmbeddingSimilarityIndex(self.new_model.wv)
        #Build a term similarity matrix and compute the Soft Cosine Measure.
        self.similarity_matrix = SparseTermSimilarityMatrix(self.similarity_index, self.dictionary)

        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.WMD: self.wmd_gensim,
            DistanceMetric.SCM: self.scm_gensim,
            EntropyMetric.MSI_I: self.msi
        }

    def wmd_gensim(self, sentence_a, sentence_b ):
        wmd = self.new_model.wv.wmdistance(sentence_a, sentence_b)
        return [wmd, self.wmd_similarity(wmd)]

    def wmd_similarity(self, dist):
        return 1./( 1.+float( dist ) ) #Associated Similarity

    def scm_gensim(self, sentence_a, sentence_b ):
        '''Compute SoftCosine Similarity of Gensim'''
        #Convert the sentences into bag-of-words vectors.
        sentence_1 = self.dictionary.doc2bow(sentence_a)
        sentence_2 = self.dictionary.doc2bow(sentence_b)

        #Return the inner product(s) between real vectors / corpora vec1 and vec2 expressed in a non-orthogonal normalized basis,
        #where the dot product between the basis vectors is given by the sparse term similarity matrix.
        scm_similarity = self.similarity_matrix.inner_product(sentence_1, sentence_2, normalized=True)
        return [1-scm_similarity, scm_similarity]

    def msi(self, sentence_a, sentence_b):
        '''@danaderp
        Minimum Shared Information'''
        token_counts_1 = self.get_cnts(sentence_a, self.vocab)
        token_counts_2 = self.get_cnts(sentence_b, self.vocab)
        logging.info('token count processed')

        #Minimum Shared Tokens
        token_counts = { token: min(token_counts_1[token],token_counts_2[token]) for token in self.vocab }

        alphabet = list(set(token_counts.keys())) #[ list(set(cnt.keys())) for cnt in token_counts ]
        frequencies = self.get_freqs(token_counts) #[ get_freqs(cnt) for cnt in token_counts ]
        logging.info('frequencies processed')

        if not frequencies:
            #"List is empty"
            entropies = float('nan')
            extropies = float('nan')
        else:
            scalar_distribution = dit.ScalarDistribution(alphabet, frequencies) #[dit.ScalarDistribution(alphabet[id], frequencies[id]) for id in range( len(token_counts) )]
            logging.info('scalar_distribution processed')

            entropies = dit.shannon.entropy( scalar_distribution ) #[ dit.shannon.entropy( dist ) for dist in scalar_distribution ]
            logging.info('entropies processed')

            extropies = dit.other.extropy( scalar_distribution )# [ dit.other.extropy( dist ) for dist in scalar_distribution ]
            logging.info('extropies processed')
        return [entropies,extropies]


    def distance(self, metric_list,link):
        '''Iterate on the metrics'''
        #Computation of sentences can be moved directly to wmd_gensim method if we cannot generalize it for
        #the remaining metrics
        ids = parameters['system_path_config']['names'][0]
        txt = parameters['system_path_config']['names'][1]

        if self.params['system_path_config']['prep'] == Preprocessing.conv: #if conventional preprocessing
            sentence_a = self.df_source[self.df_source[ids].str.contains(link[0])][txt].values[0].split()
            sentence_b = self.df_target[self.df_target[ids].str.contains(link[1])][txt].values[0].split()
        elif self.params['system_path_config']['prep'] == Preprocessing.bpe:
            sentence_a = eval(self.df_source[self.df_source[ids].str.contains(link[0])][txt].values[0])
            sentence_b = eval(self.df_target[self.df_target[ids].str.contains(link[1])][txt].values[0])

        dist = [ self.dict_distance_dispatcher[metric](sentence_a,sentence_b) for metric in metric_list]
        logging.info("Computed distances or similarities "+ str(link) + str(dist))
        return functools.reduce(lambda a,b : a+b, dist) #Always return a list

    # TODO: substitute this block in the future by importing the information science module
    def get_cnts(self, toks, vocab):
        '''@danaderp
        Counts tokens within ONE document'''
        #logging.info("encoding_size:" len
        cnt = Counter(vocab)
        for tok in toks:
            cnt[tok] += 1
        return cnt

    def get_freqs(self, dict_token_counts):

        num_tokens = sum( dict_token_counts.values() ) #number of subwords inside the document
        if num_tokens == 0.0:
            frequencies = []
            logging.info('---------------> NO SHARED INFORMATION <-------------------------')
        else:
            frequencies = [ (dict_token_counts[token])/num_tokens for token in dict_token_counts ]
        return frequencies
Example #9
    def test_inner_product(self):
        """Test the inner product."""

        matrix = SparseTermSimilarityMatrix(
            UniformTermSimilarityIndex(self.dictionary, term_similarity=0.5), self.dictionary)

        # check zero vectors work as expected
        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])

        self.assertEqual(0.0, matrix.inner_product([], vec2))
        self.assertEqual(0.0, matrix.inner_product(vec1, []))
        self.assertEqual(0.0, matrix.inner_product([], []))

        self.assertEqual(0.0, matrix.inner_product([], vec2, normalized=True))
        self.assertEqual(0.0, matrix.inner_product(vec1, [], normalized=True))
        self.assertEqual(0.0, matrix.inner_product([], [], normalized=True))

        # check that real-world vectors work as expected
        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = 0.0
        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
        result = matrix.inner_product(vec1, vec2)
        self.assertAlmostEqual(expected_result, result, places=5)

        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = matrix.inner_product(vec1, vec2)
        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
        result = matrix.inner_product(vec1, vec2, normalized=True)
        self.assertAlmostEqual(expected_result, result, places=5)
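
        # The arithmetic above is the soft inner product <x, y>_S = x^T * S * y,
        # where S is the sparse term similarity matrix; the normalized variant
        # divides by sqrt(<x, x>_S * <y, y>_S), i.e. the soft cosine measure.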

        # check that real-world (vector, corpus) pairs work as expected
        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = 0.0
        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
        expected_result = numpy.full((1, 2), expected_result)
        result = matrix.inner_product(vec1, [vec2] * 2)
        self.assertTrue(isinstance(result, numpy.ndarray))
        self.assertTrue(numpy.allclose(expected_result, result))

        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = matrix.inner_product(vec1, vec2)
        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
        expected_result = numpy.full((1, 2), expected_result)
        result = matrix.inner_product(vec1, [vec2] * 2, normalized=True)
        self.assertTrue(isinstance(result, numpy.ndarray))
        self.assertTrue(numpy.allclose(expected_result, result))

        # check that real-world (corpus, vector) pairs work as expected
        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = 0.0
        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
        expected_result = numpy.full((3, 1), expected_result)
        result = matrix.inner_product([vec1] * 3, vec2)
        self.assertTrue(isinstance(result, numpy.ndarray))
        self.assertTrue(numpy.allclose(expected_result, result))

        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = matrix.inner_product(vec1, vec2)
        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
        expected_result = numpy.full((3, 1), expected_result)
        result = matrix.inner_product([vec1] * 3, vec2, normalized=True)
        self.assertTrue(isinstance(result, numpy.ndarray))
        self.assertTrue(numpy.allclose(expected_result, result))

        # check that real-world corpora work as expected
        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = 0.0
        expected_result += 2 * 1.0 * 1  # government * s_{ij} * government
        expected_result += 2 * 0.5 * 1  # government * s_{ij} * holiday
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * government
        expected_result += 1 * 0.5 * 1  # denied * s_{ij} * holiday
        expected_result = numpy.full((3, 2), expected_result)
        result = matrix.inner_product([vec1] * 3, [vec2] * 2)
        self.assertTrue(isinstance(result, scipy.sparse.csr_matrix))
        self.assertTrue(numpy.allclose(expected_result, result.todense()))

        vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"])
        vec2 = self.dictionary.doc2bow([u"government", u"holiday"])
        expected_result = matrix.inner_product(vec1, vec2)
        expected_result /= math.sqrt(matrix.inner_product(vec1, vec1))
        expected_result /= math.sqrt(matrix.inner_product(vec2, vec2))
        expected_result = numpy.full((3, 2), expected_result)
        result = matrix.inner_product([vec1] * 3, [vec2] * 2, normalized=True)
        self.assertTrue(isinstance(result, scipy.sparse.csr_matrix))
        self.assertTrue(numpy.allclose(expected_result, result.todense()))
Example #10
# Build a term similarity matrix using the embeddings.
#
# .. Important::
#   The embeddings we have chosen here require a lot of memory.
#
import gensim.downloader as api
model = api.load('word2vec-google-news-300')

from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
termsim_index = WordEmbeddingSimilarityIndex(model)
termsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, tfidf)

###############################################################################
# So let's compute SCM using the ``inner_product`` method.
#
similarity = termsim_matrix.inner_product(sentence_obama, sentence_president, normalized=(True, True))
print('similarity = %.4f' % similarity)

###############################################################################
# Let's try the same thing with two completely unrelated sentences.
# Notice that the similarity is smaller.
#
similarity = termsim_matrix.inner_product(sentence_obama, sentence_orange, normalized=(True, True))
print('similarity = %.4f' % similarity)
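
###############################################################################
# Note: ``normalized=(True, True)`` tells ``inner_product`` to normalize both
# bag-of-words vectors in the soft-cosine basis, so the values printed above
# are soft cosine similarities rather than raw inner products.
#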

###############################################################################
# 
# References
# ----------
#
# 1. Grigori Sidorov et al. *Soft Similarity and Soft Cosine Measure: Similarity of Features in Vector Space Model*, 2014.
Example #11
    def retrieve_paragraphs(self, file_name, question_input,
                            similarity_threshold):
        # Reading file names and read them paragraph by paragraph.
        with open(file_name, "r", encoding="utf-8") as fp:
            documents = fp.readlines()

        # Append question to the end of the document and pre-process everything all at once.
        documents.append(question_input)

        # convert everything to lower case for better processing.
        documents = [doc.lower() for doc in documents]

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        documents_array = []
        self.remove_stopwords(documents_array, stop_words, documents)

        # Prepare a dictionary and a corpus.
        # gensim's Dictionary maps each token to an integer id.
        dictionary = corpora.Dictionary(documents_array)

        # Convert the sentences into bag-of-words vectors.
        sentences = []
        self.convert_sentences(sentences, dictionary, documents_array)

        # Another way of computing the similarity matrix
        # similarity_index = WordEmbeddingSimilarityIndex(self.loaded_model)
        similarity_index = WordEmbeddingSimilarityIndex.load(
            'sparse_model.pkl')
        similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                       dictionary)
        # pickle.dump(similarity_matrix, open('matrix-file.sav', 'wb'))
        # file_matrix = open('matrix-file.sav', 'rb')
        # loaded_similarity = pickle.load(open('matrix-file.sav', 'rb'))

        result_matrix = []
        result_dict = OrderedDict()

        question_bow = sentences[-1]
        for index in range(len(sentences) - 1):
            result_matrix.append(
                similarity_matrix.inner_product(question_bow,
                                                sentences[index],
                                                normalized=True))

        for index, result in enumerate(result_matrix):
            result_dict[index] = result

        result_dict = OrderedDict(
            sorted(result_dict.items(), key=itemgetter(1), reverse=True))
        para_index = list(result_dict.keys())[1:6]

        print(" -- FILE NAME: ", file_name)
        similarity_value = list(result_dict.values())[1]
        count = 0
        doc_paragraph = ''
        for doc in documents:
            if doc != '\n':
                if count in para_index:
                    print(" -- This is the paragraph: ")
                    print(doc)
                    doc_paragraph = doc_paragraph + ' ' + doc
                count += 1

        answer = "There is no answer for this document!"
        print(" -- This is the span of words:")
        similarity_threshold = 0.3
        if similarity_value > similarity_threshold:
            predictor = Predictor.from_path(
                "https://storage.googleapis.com/allennlp-public-models/bidaf-elmo-model-2018.11.30-charpad.tar.gz"
            )
            prediction = predictor.predict(question=question_input,
                                           passage=doc_paragraph)
            answer = prediction["best_span_str"]
        else:
            print("\033[44;33m%s!\033[m" % answer)
        print("------------------------")

        doc_paragraph = doc_paragraph.rstrip("\n")
        wrapper = textwrap.TextWrapper(width=190)

        word_list = wrapper.fill(text=doc_paragraph)
        word_list = word_list.replace(answer,
                                      '\033[44;33m{}\033[m'.format(answer))

        print(word_list)
        return word_list