Example #1
	def compute(self):
		# Build the chosen vector space (TF-IDF, LSI, RP or LDA) and its
		# similarity index; assumes gensim's Dictionary, TfidfModel, LsiModel,
		# RpModel, ldamodel and MatrixSimilarity are already imported.
		vec_texts = [text.split() for text in self.texts]
		if self.debug:
			write("\n    " + "-> Computing the dictionary".ljust(50, '.'))
		dictionary = Dictionary(vec_texts)
		if self.debug:
			write("[OK]")
			write("\n    " + "-> Creating the bag-of-words space".ljust(50, '.'))
		corpus = [dictionary.doc2bow(vec) for vec in vec_texts]
		if self.debug:
			write("[OK]")
			write("\n    " + ("-> Creating the %s space" % self.method).ljust(50, '.'))
		tfidf_space = TfidfModel(corpus)
		tfidf_corpus = tfidf_space[corpus]
		if self.method == 'TFIDF':
			self.space = tfidf_space
			self.index = MatrixSimilarity(tfidf_corpus)
		elif self.method == 'LSI':
			self.space = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
			self.index = MatrixSimilarity(self.space[tfidf_corpus])
		elif self.method == 'RP':
			self.space = RpModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
			self.index = MatrixSimilarity(self.space[tfidf_corpus])
		elif self.method == 'LDA':
			self.space = ldamodel.LdaModel(tfidf_corpus, id2word=dictionary,
			                               num_topics=self.num_t)
			self.index = MatrixSimilarity(self.space[tfidf_corpus])
		self.dictionary = dictionary
		if self.debug:
			write("[OK]\n")
Example #2
def build_tfidf_or_lsi(corpus, method='tfidf'):
    '''
    Build a document-ranking model.
    Input: a corpus of texts and a method ("tfidf" or "lsi").
    Output: a tuple (dictionary of the terms in the corpus,
    the fitted model, and the similarity matrix).
    '''
    dictionary = Dictionary(corpus)
    corpus_bow = [dictionary.doc2bow(doc) for doc in corpus]
    model_tfidf = TfidfModel(corpus_bow)
    corpus_tfidf = [model_tfidf[doc] for doc in corpus_bow]
    simil_tfidf = MatrixSimilarity(corpus_tfidf)
    if method == 'tfidf':
        return dictionary, model_tfidf, simil_tfidf
    elif method == 'lsi':
        # The LSI model is trained on the tf-idf corpus, so it must also be
        # applied to tf-idf vectors (not to the raw bag-of-words vectors).
        model_lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)
        corpus_lsi = [model_lsi[doc] for doc in corpus_tfidf]
        simil_lsi = MatrixSimilarity(corpus_lsi)
        return dictionary, model_lsi, simil_lsi
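A minimal usage sketch for build_tfidf_or_lsi, assuming gensim's Dictionary, TfidfModel, LsiModel and MatrixSimilarity imports; the toy corpus here is invented for illustration:

# Usage sketch (toy corpus; imports match what the example above relies on).
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel
from gensim.similarities import MatrixSimilarity

corpus = [["cat", "sat", "mat"], ["dog", "bit", "man"], ["cat", "dog", "play"]]
dictionary, model, index = build_tfidf_or_lsi(corpus, method='tfidf')
query = model[dictionary.doc2bow(["cat", "mat"])]
print(index[query])  # cosine similarity of the query to each document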
Example #3
 def trainModel(self):
     if self.toweight:
         self.model = LsiModel(self.tfidf[self.corpus], num_topics=self.num_topics)
         self.index = MatrixSimilarity(self.model[self.tfidf[self.corpus]])
     else:
         self.model = LsiModel(self.corpus, num_topics=self.num_topics)
         self.index = MatrixSimilarity(self.model[self.corpus])
Example #4
def custom_queries(corpus, dictionary, paragraphs):

    # TF-IDF query:
    tfidf_model = TfidfModel(corpus, dictionary=dictionary)
    query = process_query("What is the function of money?", dictionary)
    tfidf_query = tfidf_model[query]

    tfidf_corpus = [tfidf_model[doc] for doc in corpus]
    tfidf_index = MatrixSimilarity(tfidf_corpus)

    print("tfidf query:")
    doc2similarity_tfidf = enumerate(tfidf_index[tfidf_query])
    # Use a distinct loop variable so it does not shadow tfidf_index.
    for doc_index, similarity in sorted(doc2similarity_tfidf,
                                        key=lambda kv: -kv[1])[:3]:
        paragraph = paragraphs[doc_index].split("\n")
        print("[paragraph: " + str(doc_index + 1) + "]")
        for line in paragraph[:5]:
            print(line)
        print("\n")

    # LSI query:
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=100)
    # The LSI model was trained on the plain bag-of-words corpus, so the
    # query must also be a bag-of-words vector, not a tf-idf one.
    lsi_query = lsi_model[query]

    lsi_corpus = [lsi_model[doc] for doc in corpus]
    lsi_index = MatrixSimilarity(lsi_corpus)
    doc2similarity_lsi = enumerate(lsi_index[lsi_query])

    print("lsi query:")
    for doc_index, similarity in sorted(doc2similarity_lsi,
                                        key=lambda kv: -kv[1])[:3]:
        paragraph = paragraphs[doc_index].split("\n")
        print("[paragraph: " + str(doc_index + 1) + "]")
        for line in paragraph[:5]:
            print(line)
        print("\n")
Example #5
 def recomended_projects(self, request):
     projects = ProjectRequest.objects.all()
     project_keywords_dict = {}
     projects_dict = {}
     tags_list = []
     for project in projects:
         description = project.description
         description_keywords = get_keywords(description.replace('"', ''))
         tags = project.tags.replace('  ', ',').lower() 
         for keyword in description_keywords:
             tags += ',' + keyword[0].lower()
         tags_list.append(tags)
     df = read_frame(projects, fieldnames=['id', 'tags'], index_col=['id'])
     df['tags'] = tags_list
     keywords = df['tags'].tolist()
     keywords = [word_tokenize(keyword.lower()) for keyword in keywords]
     keywords = [no_commas(kw) for kw in keywords]
     processed_keywords = keywords
     dictionary = Dictionary(processed_keywords)
     corpus = [dictionary.doc2bow(doc) for doc in processed_keywords]
     tfidf = TfidfModel(corpus)
     sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
     top_3 = keywords_recommendation(all_projects=df,
                                     keywords=['uvg', 'gasolina', 'potente',
                                               'mcdonald', 'mecanico', 'gg',
                                               'carros'],
                                     number_of_hits=3,
                                     data=[dictionary, tfidf, sims])
     projects = []
     for id in top_3:
         projects.append(ProjectRequestSerializer(ProjectRequest.objects.get(pk=id)).data)
     return Response(projects)
Example #6
def train_model_get_cosine_matrix(statements, num):

    statements = [statement.split() for statement in statements]
    dictionary = corpora.Dictionary(statements)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in statements]

    ###tfidf model
    # https://stackoverflow.com/questions/50521304/why-i-get-different-length-of-vectors-using-gensim-lsi-model
    tfidf = models.TfidfModel(doc_term_matrix, normalize=True)
    corpus_tfidf = tfidf[doc_term_matrix]

    lsi = models.LsiModel(corpus_tfidf, num_topics=num, id2word=dictionary)

    # represent every vocabulary word as a single-word document
    words = [
        dictionary.doc2bow([word])
        for word in sorted(list(dictionary.token2id.keys()))
    ]

    vectorized_corpus = lsi[words]

    index = MatrixSimilarity(vectorized_corpus)

    # word-by-word cosine similarity matrix in the LSI space
    out = pd.DataFrame(index[vectorized_corpus])
    out.columns = sorted(list(dictionary.token2id.keys()))
    out.index = sorted(list(dictionary.token2id.keys()))
    return out
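A quick call sketch for train_model_get_cosine_matrix; the two sentences below are invented for illustration:

# Toy invocation: a word-by-word cosine matrix over a 2-topic LSI space.
df_sim = train_model_get_cosine_matrix(["the cat sat", "the dog sat"], 2)
print(df_sim)  # rows and columns are the sorted vocabulary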
Example #7
def main(argv):

    if len(sys.argv) != 2:
        print('usage: text_exp sentence')
        sys.exit(2)

    # encode this sentence into semantic space
    # text = "Rice wheat and barley are all important feed crops."
    # text = "Brazil issues with industrial pollution"

    text = sys.argv[1]

    # first load the basic models
    MODELS_DIR = "models"

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.ERROR)
    dictionary = gensim.corpora.Dictionary.load(
        os.path.join(MODELS_DIR, "mtsamples.dict"))
    corpus = gensim.corpora.MmCorpus(os.path.join(MODELS_DIR, "mtsamples.mm"))

    # now, transform the text
    bow_text = dictionary.doc2bow(gensim.utils.tokenize(text))
    # show me the transformed text
    # print([(dictionary[id], count) for id, count in bow_text])

    # generate a tfidf model from the set of all articles
    tfidf = gensim.models.TfidfModel(corpus, normalize=True)
    corpus_tfidf = tfidf[corpus]
    # then generate a LSI model from the set of all articles
    lsi = gensim.models.LsiModel(corpus_tfidf,
                                 id2word=dictionary,
                                 num_topics=10)

    # now, create a dense index from the set of all articles
    index_dense = MatrixSimilarity(lsi[corpus])

    # finally, let's use the input query and translate it into the lsi space.
    vec_lsi = lsi[bow_text]
    # compute the similarity index
    sims = index_dense[vec_lsi]
    # print the raw vector numbers
    # print (list(enumerate(sims)))

    # now, sort by similarity number and print the highest similar articles to the query
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    # print (sims)

    # load the file list
    file_list = pickle.load(open('models/file_list.p', 'rb'))

    # use it to show the article names

    dictSimilars = {}
    for i in range(len(sims)):
        ind = sims[i][0]
        dictSimilars[str(sims[i][1])] = file_list[ind]

    js = json.dumps(dictSimilars)
    return js
Example #8
def calculate_tfidf_cos_sim(text1, text2, dictionary_tfidf, corpus):

    tfidf1 = to_tfidf(text1, dictionary_tfidf, corpus)
    tfidf2 = to_tfidf(text2, dictionary_tfidf, corpus)
    index = MatrixSimilarity([tfidf1], num_features=len(dictionary_tfidf))
    sim = index[tfidf2]
    return float(sim[0])
Example #9
def calAuthorSim():
    conn = sqlite3.connect(config.db_path)
    db = conn.cursor()

    model = AuthorTopicModel.load(config.author_model128_path)
    poets = list(model.id2author.values())
    print(len(poets))
    # vec = model.get_author_topics('苏轼')
    index = MatrixSimilarity(model[list(model.id2author.values())], num_best=30)
    index.save(config.author_simMatrix_path)
    # index = MatrixSimilarity.load(config.author_simMatrix_path)

    for name in poets:
        # print(name)
        sims = index[model[name]]
        sims = sorted(sims, key=lambda item: -item[1])
        sims = [[poets[sim[0]], sim[1]] for sim in sims]
        # print(sims)
        # A parameterized query would be safer here:
        # sql_comment = "UPDATE author SET sims=? WHERE id=?"
        # db.execute(sql_comment, (toJson(sims), name))

        sql_comment = "UPDATE author SET sims='{}' WHERE id='{}'".format(toJson(sims), name)
        db.execute(sql_comment)
        # print(sql_comment)
    # print(len(poets))
    conn.commit()
Example #10
    def find_similarity_scores(self, topics):
        # Create similarities container
        similarities = {'Resumes': {}}
        # Gensim requires a corpora data structure for transformations and analysis
        dictionary = corpora.Dictionary(self.corpus)

        # Convert text to BoW.  It already is but lets be sure.
        corpus_gensim = [dictionary.doc2bow(doc) for doc in self.corpus]

        # Term Frequency-Inverse Document Frequency (TF-IDF) downweights words
        # that appear in many documents across the corpus.
        self.tfidf = TfidfModel(corpus_gensim)
        print(self.tfidf)
        # Note: self.tfidf is rebound from the model to the transformed corpus.
        self.tfidf = self.tfidf[corpus_gensim]
        print(self.tfidf)
        # Find similarity via vector-space pair-wise cosine angle absolute value via Latent Semantic Indexing (LSI)
        # https://en.wikipedia.org/wiki/Latent_semantic_analysis#Latent_semantic_indexing
        lsi = LsiModel(self.tfidf, id2word=dictionary, num_topics=topics)
        lsi_index = MatrixSimilarity(lsi[self.tfidf])
        similarities['Resumes']["LSI_Similarity"] = np.array(
            [lsi_index[lsi[self.tfidf[i]]] for i in range(len(self.corpus))])

        for doc in self.tfidf:
            for word_id, value in doc:
                word = dictionary.get(word_id)
                self.ind_word_scores[word] = value

        # Convert to numpy arrays
        self.f_list = np.array(self.f_list)
        self.data = np.array(self.data)

        # Return results to object
        self.sim_matrix = similarities
Example #11
 def train_model(self):
     """
     Read the preprocessed data and generate corpus dictionary, tfidf model and matrix(Cosine) similarity
     :return: status of training
     """
     try:
         data = pd.read_csv(self.processed_data)
         del data['Unnamed: 0']
         # creating tokens for the doc column
         corpus = data['doc'].map(break_to_tokens)
         # creating dictionary of words in the movie dataset
         dictionary = gensim.corpora.Dictionary(corpus)
         dictionary.save(self.corpus_dictionary)
         # creating vector with bag of words for the corpus
         vector = [dictionary.doc2bow(d) for d in corpus]
         # creating tfidf values for the vector
         tfidf = models.TfidfModel(vector)
         tfidf.save(self.tfidf_model)
         corpus_tfidf = tfidf[vector]
         # Compute Similarities
         similarity = MatrixSimilarity(corpus_tfidf,
                                       num_features=len(dictionary))
         similarity.save(self.matrix_similarity)
         return "Model Trained Successfully"
     except Exception:
         return "Error While Training Model"
Example #12
    def train(self, classdict, nb_topics, *args, **kwargs):
        """ Train the topic modeler.

        :param classdict: training data
        :param nb_topics: number of latent topics
        :param args: arguments to pass to the `train` method for gensim topic models
        :param kwargs: arguments to pass to the `train` method for gensim topic models
        :return: None
        :type classdict: dict
        :type nb_topics: int
        """
        self.nb_topics = nb_topics
        self.generate_corpus(classdict)
        if self.toweigh:
            self.tfidf = TfidfModel(self.corpus)
            normcorpus = self.tfidf[self.corpus]
        else:
            self.tfidf = None
            normcorpus = self.corpus

        self.topicmodel = gensim_topic_model_dict[self.algorithm](
            normcorpus, num_topics=self.nb_topics, *args, **kwargs)
        self.matsim = MatrixSimilarity(self.topicmodel[normcorpus])

        # change the flag
        self.trained = True
Example #13
def cs_lda(sp, df, feature, session, update_model):
    print(f"Starting LDA ...")
    if update_model:
        session = session + "lda-" + feature
        if not os.path.exists(session):
            print(f"New directory: {session}")
            os.mkdir(session)
        session = session + "/"
        create_dictionary(session, df, feature)
        create_model(session, df, feature)
    else:
        session = session + "lda-" + feature + "/"
    print(f"Computing Cosine Similarity on feature {feature}")
    dct = get_dict(feature, session)
    if not feature == "title":
        corpus = common.remove_stopwords(df[feature]).tolist()
    else:
        corpus = df[feature].tolist()
    corpus = [doc.split() for doc in corpus]
    corpus = [dct.doc2bow(text) for text in corpus]
    lda = LdaMulticore.load(session + "LDA-model-" + feature)
    res = lda[corpus]
    index = MatrixSimilarity(res)

    # index.save("simIndex.index")

    def compute(text):
        vec_bow = dct.doc2bow(text.split())
        vec_lda = lda[vec_bow]
        sims = index[vec_lda]
        return sims

    results = for_pivot(df[feature], df, compute)
    common.save_as_pivot(results, sp=sp)
Example #14
def cossim(query, documents):
    # Compute cosine similarity between the query and the documents
    # (relies on module-level `tfidf` and `dictionary` objects).
    query = tfidf[dictionary.doc2bow(query)]
    index = MatrixSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in documents]],
        num_features=len(dictionary))
    similarities = index[query]
    return similarities
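One way the module-level tfidf and dictionary that this cossim relies on might be set up; a sketch with invented toy documents:

# Sketch of the module-level state cossim assumes (toy data).
documents = [["bank", "river"], ["bank", "money"], ["money", "loan"]]
dictionary = Dictionary(documents)
tfidf = TfidfModel([dictionary.doc2bow(d) for d in documents])
print(cossim(["money", "bank"], documents))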
Example #15
def cos_sim(text1, text2):
    tfidf1 = to_tfidf(text1)
    tfidf2 = to_tfidf(text2)
    index = MatrixSimilarity([tfidf1], num_features=len(dictionary))
    sim = index[tfidf2]
    # `sim` comes back as an array, but we only need a single value,
    # so cast it straight to a float.
    return float(sim[0])
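The to_tfidf helper and the module-level dictionary used here are not shown; a plausible shape for the helper, assuming a module-level tfidf model (an assumption, not the original code):

# Assumed helper (not from the source): tokenize, map to bag-of-words,
# then weight with a module-level tfidf model.
def to_tfidf(text):
    return tfidf[dictionary.doc2bow(text.split())]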
Example #16
 def __init__(self,
              num_topics=100,
              wiki_tokens_path='data/token_sents.pkl',
              wiki_sents_path='data/sents.pkl',
              student_tokens_path='data/children_data.json'):
     super().__init__(wiki_tokens_path, wiki_sents_path,
                      student_tokens_path)
     self.lsi = self.compute_lsi(num_topics)
     self.lsi_index = MatrixSimilarity(self.lsi[self.wiki_tfidf_corpus])
Example #17
def generate_embeddings(documents, dictionary):
    doc_word_index = MatrixSimilarity(
        [dictionary.doc2bow(document) for document in documents],
        num_features=len(dictionary))
    doc_doc_index = np.array(
        [doc_word_index[dictionary.doc2bow(doc)] for doc in documents])
    np.save('models/Word2Vec/doc_word_index', doc_word_index.index)
    np.save('models/Word2Vec/doc_doc_index', doc_doc_index)
    return doc_word_index.index, doc_doc_index
Example #18
    def build_models(self):
        # Create tfidf model
        self.tfidf_model = TfidfModel(self.corpus)

        # Map bag of words to (word-index, word-weight)
        self.tfidf_corpus = list(
            map(lambda c: self.tfidf_model[c], self.corpus))

        self.tfidf_similarity = MatrixSimilarity(self.tfidf_corpus)

        self.lsi_model = LsiModel(self.tfidf_corpus,
                                  id2word=self.dictionary,
                                  num_topics=100)

        self.lsi_corpus = list(
            map(lambda c: self.lsi_model[c], self.tfidf_corpus))

        self.lsi_similarity = MatrixSimilarity(self.lsi_corpus)
Example #19
def lsi(corpus, dictionary):
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=100)
    lsi_corpus = [lsi_model[doc] for doc in corpus]

    lsi_similarity_matrix = MatrixSimilarity(lsi_corpus)
    print(lsi_model.show_topics())
    return lsi_similarity_matrix
Example #20
def create_model_tfidf_model(documents, model_name, matrix_name, dic_name):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    tfidfmodel = TfidfModel(corpus)
    index = MatrixSimilarity(tfidfmodel[corpus], num_features=len(dictionary))
    index.save(matrix_name)
    tfidfmodel.save(model_name)
    dictionary.save(dic_name)
    return tfidfmodel, index, dictionary
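A possible round trip for the artifacts saved by create_model_tfidf_model; the file names and data below are placeholders, not from the source:

# Hypothetical round trip: build, save, reload, and query (placeholder paths).
docs = [["alpha", "beta"], ["beta", "gamma"], ["alpha", "gamma", "delta"]]
model, index, dct = create_model_tfidf_model(docs, "m.tfidf", "m.index", "m.dict")
reloaded = MatrixSimilarity.load("m.index")
query = model[dct.doc2bow(["alpha", "delta"])]
print(reloaded[query])  # similarity of the query to each saved document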
Example #21
def tf_idf(corpus):
    tfidf_model = TfidfModel(corpus)
    tfidf_corpus = [tfidf_model[doc] for doc in corpus]

    tfidf_similarity_matrix = MatrixSimilarity(tfidf_corpus)

    return tfidf_similarity_matrix
Example #22
    def load(self):
        """
        load the corpora created by `make_corpus.py`
        """
        self.corpus = MmCorpus(self.corpus_file)
        self.dictionary = Dictionary.load_from_text(self.dict_file)
        self.titles = load_titles(self.title_file)

        self.tfidf_model = TfidfModel.load(self.tfidf_model_file)
        self.index = MatrixSimilarity(self.tfidf_model[self.corpus])
Example #23
def find_similarity(vec_lsi, df, k=30):

    with open('models/LSI_model/corpus_lsi.pickle', 'rb') as handle:
        corpus_lsi = pickle.load(handle)
    index = MatrixSimilarity(corpus_lsi, num_features=72)
    sims = index[vec_lsi]
    # Indices of the k most similar documents (avoid reusing the name `index`).
    top_k = sims[0].argsort()[-k:][::-1]
    for i in top_k:
        print(i, "------->", df[i])
    return top_k
Example #24
def soft_cosine_similarity(text_1, text_2, corpus):
    dictionary = Dictionary(corpus)
    text_1 = dictionary.doc2bow(text_1)
    text_2 = dictionary.doc2bow(text_2)
    w2v_model = Word2Vec(corpus,
                         workers=cpu_count(),
                         min_count=1,
                         size=300,
                         seed=12345)
    similarity_matrix = sparse.csr_matrix(
        MatrixSimilarity(Dense2Corpus(w2v_model.wv.syn0.T)))
    return softcossim(text_1, text_2, similarity_matrix)
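A toy call sketch, assuming the gensim 3.x-era APIs this snippet uses (Word2Vec's size= parameter, wv.syn0 and softcossim) are available; the corpus is invented:

# Toy usage under gensim 3.x APIs (size=, wv.syn0, softcossim).
corpus = [["cat", "sat"], ["dog", "sat"], ["cat", "dog"]]
print(soft_cosine_similarity(["cat", "sat"], ["dog", "sat"], corpus))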
Example #25
def cos_sim(text1, text2):
    '''
    Calculate cosine similarity between two texts
    :param text1: input string
    :param text2: input string
    :return: cosine similarity
    '''
    tfidf1 = to_tfidf(text1)
    tfidf2 = to_tfidf(text2)
    index = MatrixSimilarity([tfidf1], num_features=len(dictionary))
    sim = index[tfidf2]
    return float(sim[0])
Example #26
def news_recommend_keywords(keywords, num=10):
    keywords = keywords.split()
    path_df = "Pickles/News_central_rec2.pickle"

    with open(path_df, 'rb') as data:
        df = pickle.load(data)

    df['bag_of_words'] = ''
    columns = df.columns
    for index, row in df.iterrows():
        words = []
        for col in columns:
            if col == 'Content':
                words += row[col].split()
        words = list(set(words))
        # Assign via .at so the change persists in the DataFrame;
        # writing to the row copy returned by iterrows() does not.
        df.at[index, 'bag_of_words'] = words

    processed_keywords = df.bag_of_words.to_list()
    # Create a dictionary of words from our keywords.
    dictionary = Dictionary(processed_keywords)
    # Create the corpus: a bag of words for each document.
    corpus = [dictionary.doc2bow(doc) for doc in processed_keywords]

    tfidf = TfidfModel(corpus)  # create a tf-idf model of the corpus

    # Create the similarity data structure. This is the most important part,
    # where we get the similarities between the news items.
    sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
    query_doc_bow = dictionary.doc2bow(keywords)  # bag of words for the query
    # Convert the query bag of words to tf-idf: tuples of term ID and tf-idf weight.
    query_doc_tfidf = tfidf[query_doc_bow]

    # Array of similarity values between our query and every other news item,
    # so its length is the number of news items we have. To get it, we pass
    # our list of tf-idf tuples to sims.
    similarity_array = sims[query_doc_tfidf]

    similarity_series = pd.Series(similarity_array.tolist(),
                                  index=df.Title.values)  # convert to a Series
    # Get the top matching results, i.e. the most similar news items.
    top_hits = similarity_series.sort_values(ascending=False)[:num]

    titles = []
    scores = []
    for idx, (title, score) in enumerate(zip(top_hits.index, top_hits)):
        #print("%d '%s' with a similarity score of %.3f" %(idx+1, title, score))
        titles.append(title)
        scores.append(score)

    return titles, scores
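A hypothetical call; it requires the pickle file hard-coded inside the function to exist, and the query string is made up:

# Invocation sketch (needs Pickles/News_central_rec2.pickle on disk).
titles, scores = news_recommend_keywords("economy inflation market", num=5)
for t, s in zip(titles, scores):
    print("%s (%.3f)" % (t, s))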
Example #27
def cossim(query, documents, dictionary, num_best=30):
    # Compute cosine similarity between the query and the documents.
    input_sentence = word_tokenizer(
        remove_null_sentence(text_preprocessing(sentences=[query])))
    query = dictionary.doc2bow(input_sentence[0])
    query = dictionary.doc2bow(input_sentence[0])
    index = MatrixSimilarity(
        [dictionary.doc2bow(document) for document in documents],
        num_features=len(dictionary))
    similarities = index[query]
    most_similar_index = similarities.argsort()[-num_best:][::-1]
    return most_similar_index
Example #28
    def index(self, corpus, mode="MatrixSimilarity"):
        if mode == "MatrixSimilarity":
            self._index = MatrixSimilarity(self.corpus,
                                           num_features=self.num_features)
        elif mode == "SparseMatrixSimilarity":
            self._index = SparseMatrixSimilarity(
                self.corpus, num_features=self.num_features)
        else:
            raise TypeError(
                "mode has to be either MatrixSimilarity or SparseMatrixSimilarity"
            )

        return self._index[corpus]
Example #29
def semantic_vector_similarity(dataset, test_data):
    """
    Mean cosine similarity to other essays’ semantic vector
    :param dataset: list
    :return:
    """
    print("semantic_vector_similarity")
    cacheStopWords = pw.words("english")
    punc = ['.', ',', '?', '!', '@', '"', 'n\'t']
    cacheStopWords.extend(punc)
    token_sets = []
    # print(cacheStopWords)
    for data in dataset:
        essay_token = [word for word in data['essay_token'] if word.lower() not in cacheStopWords]
        token_sets.append(essay_token)

    dictionary = corpora.Dictionary(token_sets)

    corpus = [dictionary.doc2bow(tokens) for tokens in token_sets]
    lsi_model = models.LsiModel(corpus, id2word=dictionary, num_topics=20)
    documents = lsi_model[corpus]
    topics = lsi_model.show_topics(num_words=5, log=0)

    scores = numpy.array([sample['domain1_score'] for sample in dataset])

    index = MatrixSimilarity(documents)
    predict_score_list = []
    score_list = []
    for sample, essay in zip(test_data, corpus):
        query = essay
        query_vec = lsi_model[query]
        # print(query)
        sim = index[query_vec]

        idxs = sim.argsort()[-20:-1][::-1]
        # print(idxs)
        _sim = [sim[idx] for idx in idxs]
        _scores = [scores[idx] for idx in idxs]

        # print(sim)
        predict_score = numpy.sum(numpy.multiply(scores, sim)) / len(dataset)
        sample['semantic_vector_similarity'] = predict_score
        # print(predict_score, sample['domain1_score'])
        predict_score_list.append(predict_score)
        # score_list.append(sample['domain1_score'])
    # plt.plot(predict_score_list, score_list, 'o')
    # plt.show()
    return numpy.array(predict_score_list).reshape(-1, 1)
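A toy call with input shaped like what the function expects (invented data); it assumes the NLTK stopwords corpus is available as pw and the gensim/numpy imports used above:

# Toy data: dicts with 'essay_token' and 'domain1_score', as the code reads them.
train = [{'essay_token': ["good", "essay", "text"], 'domain1_score': 4},
         {'essay_token': ["bad", "essay"], 'domain1_score': 2}]
preds = semantic_vector_similarity(train, train)
print(preds)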
Example #30
    def main(self):

        print("Recommendation using TF_IDF")

        # Loading preprocessed data
        vagas_ti = pd.read_csv(self.dataPrepFile)
        vagas_ids = pickle.load(
            open(self.out + "preprocessing/vagas_ids.array", "rb"))
        vagas_words = pickle.load(
            open(self.out + "preprocessing/vagas_words.list", "rb"))
        cvs_words = pickle.load(
            open(self.out + "preprocessing/cvs_words.series", "rb"))
        cvs = pd.read_csv(self.dataCvsFile)
        cvs = cvs.fillna("")
        cvs.isnull().any()
        #print("Loading cvs done!")

        # Creating a dictionary
        dictionary = gcorp.Dictionary(vagas_words)
        # store the dictionary, for future reference
        dictionary.save(self.out + 'preprocessing/tf_idf/vagas.dict')

        # compile the corpus (vectors of how many times each element appears)
        raw_corpus = [dictionary.doc2bow(v) for v in vagas_words]
        gcorp.MmCorpus.serialize(self.out + 'preprocessing/tf_idf/vagas.mm',
                                 raw_corpus)  # store to disk
        print("Dictionary size: " + str(len(dictionary)))

        # STEP 2 : similarity between corpora
        dictionary = gcorp.Dictionary.load(self.out +
                                           'preprocessing/tf_idf/vagas.dict')
        corpus = gcorp.MmCorpus(self.out + 'preprocessing/tf_idf/vagas.mm')

        # Transform Text with TF-IDF
        tfidf = gsm.TfidfModel(corpus)  # step 1 -- initialize a model

        # corpus tf-idf
        corpus_tfidf = tfidf[corpus]

        # STEP 3 : Create similarity matrix of all files
        index = MatrixSimilarity(corpus_tfidf,
                                 num_features=len(dictionary),
                                 num_best=10)
        index.save(self.out + 'preprocessing/tf_idf/vagas.index')
        index = MatrixSimilarity.load(self.out +
                                      'preprocessing/tf_idf/vagas.index')

        self.recommendationTf_idf(cvs, vagas_ti, vagas_ids, cvs_words,
                                  dictionary, tfidf, index)

        print("Recommendation using TF_IDF done!")