Example #1
def vars():
    mydct = load('mydct.joblib')
    noval_corp = load('noval_corp.joblib')
    noval_ind = get_tmpfile('index')
    noval_index = Similarity(noval_ind, noval_corp, len(mydct))
    val_corp = load('val_corp.joblib')
    val_ind = get_tmpfile('index')
    val_index = Similarity(val_ind, val_corp, len(mydct))
    pca8 = load('pca8.joblib')
    nlp = yelp_tool.spacy.load('en_core_web_md', disable=['tagger', 'ner'])
    read = yelp_tool.Readability()
    nlp.add_pipe(read, last=True)
    return mydct, noval_index, val_index, pca8, nlp
Example #2
 def create_similarity_index(self):
     if not os.path.isfile(self.similarity_file):
         self.similarity_index = Similarity('./LSM/', self.corpora,
                                            self.num_topics)
         self.similarity_index.save(self.similarity_file)
     else:
         self.similarity_index = Similarity.load(self.similarity_file)
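For orientation, here is a minimal self-contained sketch of the same build/save/load cycle with illustrative documents; the token lists, file names, and the temporary prefix (used instead of './LSM/') are assumptions, not part of the snippet above.

from gensim.corpora import Dictionary
from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile

docs = [["machine", "learning"], ["deep", "learning", "models"], ["cooking", "recipes"]]
dictionary = Dictionary(docs)                        # map tokens to integer ids
corpus = [dictionary.doc2bow(doc) for doc in docs]   # bag-of-words corpus

index = Similarity(get_tmpfile("lsm_shards"), corpus, num_features=len(dictionary))
index.save(get_tmpfile("lsm_index.saved"))           # persist, as in create_similarity_index

index = Similarity.load(get_tmpfile("lsm_index.saved"))
print(index[dictionary.doc2bow(["learning", "models"])])  # cosine similarity to each indexed document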
Example #3
 def cosine_similarity_only_syn(self):
     print("Cosine Similarity with only synsets")
     cos_sim = []
     for data in self.data:
         sent1 = [word[0] for word in data[1]]
         sent2 = [word[0] for word in data[2]]
         sent3, sent4 = [], []
         for word in sent1:
             if self.preprocessdata_o.synsets.get(word):
                 sent3.append(
                     list(self.preprocessdata_o.synsets.get(word))[0])
         sent1 += sent3
         for word in sent2:
             if self.preprocessdata_o.synsets.get(word):
                 sent4.append(
                     list(self.preprocessdata_o.synsets.get(word))[0])
         sent2 += sent4
         text = [sent3] + [sent4]
         sent_dict = corpora.Dictionary(text)
         corpus = [sent_dict.doc2bow(t) for t in text]
         sim = Similarity('-Similarity-index',
                          corpus,
                          num_features=len(sent_dict))
         test_corpus_1 = sent_dict.doc2bow(sent1)
         cos_sim_each = sim[test_corpus_1][1]
         cos_sim.append(cos_sim_each)
     self.feature['cos_sim_only_syn'] = cos_sim
Example #4
def create_document_similarity_model(alternate_path=False) -> dict:
    DATA_MODEL_NAME = "data_model.pickle"
    DICT_MODEL_NAME = "dictSim.pickle"
    INDEX_NAME = "gensim_index.pickle"

    #initial word tokenization
    if not os.path.exists(DATA_MODEL_NAME):
        print("loading data files from scratch")
        train_X, train_Y = load_robots_txt_files(alternate_path)
        save_model((train_X, train_Y), DATA_MODEL_NAME)
    else:
        print("loading data files by pickle")
        train_X, train_Y = load_model(DATA_MODEL_NAME)

    #create gensim dictionary
    if not os.path.exists(DICT_MODEL_NAME):
        print("loading gensim dict from scratch")
        gensim_dict = Dictionary(train_X)
        save_model(gensim_dict, DICT_MODEL_NAME)
    else:
        print("loading gensim dict with pickle")
        gensim_dict = load_model(DICT_MODEL_NAME)

    #create a queryable index
    if not os.path.exists(INDEX_NAME):
        print("building index from scratch")
        iterator = tqdm(map(lambda x: gensim_dict.doc2bow(x), train_X))
        index = Similarity("gensim_index.models",
                           corpus=iterator,
                           num_features=len(gensim_dict) + 1,
                           num_best=100)
        save_model(index, INDEX_NAME)
    else:
        print("loading index with pickle")
        index = load_model(INDEX_NAME)
Example #5
def get_bow(graph, with_children=False):
    docs = []

    for vertex in graph.vertices():
        articles_text = ""
        for article in graph.vp.articles[vertex]:
            articles_text = articles_text + article

        docs.append(articles_text.split())

    # create & save a dictionary

    # remove common words and tokenize
    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document if word not in stoplist]
             for document in docs]

    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    dictionary = corpora.Dictionary(texts)
    for vertex in graph.vertices():
        category_articles = " ".join(graph.vp.articles[vertex]).split()
        graph.vp.bow[vertex] = dictionary.doc2bow(category_articles)
    dictionary.save('/tmp/bag_of_words.dict')
    corpus = [graph.vp.bow[vertex] for vertex in graph.vertices()]
    index = Similarity('/tmp/tst',
                       corpus=corpus,
                       num_features=len(dictionary))
Example #6
def check(docs, target):
    """
    Calculate the similarity between target and docs.

    Parameters
    ----------
    docs: list
        A list of strings to be compared against
    target: string
        The target string to be compared

    Returns
    -------
    float
        The percentage similarity

    """
    stemmer = PorterStemmer()

    tok_docs = [tokenize(text) for text in docs]
    stem_docs = [[stemmer.stem(word) for word in doc] for doc in tok_docs]

    dictionary = Dictionary(stem_docs)
    corpus = [dictionary.doc2bow(doc) for doc in stem_docs]
    tfidf = TfidfModel(corpus)
    sims = Similarity('/tmp/sims.index',
                      tfidf[corpus],
                      num_features=len(dictionary))

    query = [stemmer.stem(word) for word in tokenize(target)]
    query_bow = dictionary.doc2bow(query)
    query_tfidf = tfidf[query_bow]

    return sum(sims[query_tfidf]) / len(sims[query_tfidf])
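A hedged usage sketch with toy inputs, assuming `tokenize` behaves like a plain word tokenizer (e.g. nltk's word_tokenize); the strings below are illustrative only.

docs = ["the cat sat on the mat",
        "dogs chase cats around the yard",
        "stock markets fell sharply today"]
target = "a cat sitting on a mat"
print(check(docs, target))  # mean TF-IDF cosine similarity of the target against the three documents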
Example #7
    def build_index(
        self, premises: Iterable[Sentence]
    ) -> Tuple[Similarity, Callable[[TokenList], Vector], Iterable[Sentence]]:
        """Builds an index from given premises that can be used to answer similarity queries."""

        if Irsel.index_cache:
            # if an index has already been built for these TF-IDF parameters, reuse it
            cached_smart, cached_dimensions, cached_index, cached_query_transformer, cached_premises = Irsel.index_cache
            if cached_smart == self.smart and cached_dimensions == self.dimensions and cached_premises is premises:
                printq("Hitting index cache.")
                return cached_index, cached_query_transformer, cached_premises
            else:
                printq("Skipping index cache.")

        dictionary, corpus = self.build_corpus(
            premises)  # create a term-document matrix
        corpus, query_transformer = self.transform_corpus(
            dictionary, corpus)  # apply TF-IDF and LSI models

        with Message("Storing index"):
            # Builds an index which we can compare queries against.
            index = Similarity(get_tmpfile(f"irsel_index"),
                               corpus,
                               num_features=len(dictionary))
        printq(index)

        # allows us to reuse this index for later proof attempts with the same parameters
        Irsel.index_cache = self.smart, self.dimensions, index, query_transformer, premises
        return index, query_transformer, premises
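A hedged sketch of how the returned triple might be consumed; transform_corpus is not shown above, so the exact behavior of query_transformer (and the name `query_tokens`) is an assumption.

# Illustrative only: `self` is an Irsel instance, `premises` an iterable of Sentence objects,
# and `query_tokens` a hypothetical TokenList for the conjecture.
index, query_transformer, premises = self.build_index(premises)
query_vector = query_transformer(query_tokens)   # TokenList -> Vector
scores = index[query_vector]                     # similarity of the query against every premise
best = scores.argsort()[::-1][:10]               # indices of the ten most similar premises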
Example #8
def find_answer(question):
    # Tokenize the input question
    question = question.replace('\t', '').replace(' ', '')  # .replace('\n', '')
    question_gen = jieba.cut(question)
    questionList = list(question_gen)
    question_seg = " ".join(questionList)
    print(question_seg)
    print(question, question_gen, questionList, question_seg)
    answerList = []

    # Check whether the question can be answered from the knowledge base
    if is_KB_QA(question_seg):
        print("Is KB QA:")
        info_list = KB_answer(questionList)
        for answer in info_list:
            answerDic = {}
            answerDic["answer"] = answer[2] + "为" + answer[3]
            answerDic["percentage"] = (int)(answer[0] * 100)
            answerList.append(answerDic)
    # If the answer list is empty, look for similar answers among previously answered questions
    if not answerList:
        print("Is not KB QA:")
        # Build a question-to-answer dictionary
        dic = {}
        question, answer = getSellerQA(item_id)
        #with open(SENTENCE_PATH, "r", encoding="utf-8") as question:
        #    with open(ANSWER_PATH, "r", encoding="utf-8") as answer:
        for q, a in zip(question, answer):
            dic[q] = a
        # Read the corpus that has already been word-segmented
        sentences = []
        for line in question:
            line = line.replace('\t', '').replace(' ', '')  # .replace('\n', '')
            seg_list = jieba.cut(line)
            sentences.append(list(seg_list))
        print('input done')
        # Build the dictionary and vector corpus
        #pprint(sentences)
        dictionary = corpora.Dictionary(sentences)
        corpus = [dictionary.doc2bow(text) for text in sentences]
        index = Similarity('-Similarity-index', corpus, num_features=400)
        print("training done:", list(question_gen))
        # Find the existing questions most similar to the asked one
        resultList = find_simillar(questionList, dictionary, index)
        # Collect the resulting answers into a list to return
        for answer in resultList:
            answerDic = {}
            # answerList.append(''.join(sentences[answer[0]]))
            answerDic["answer"] = dic[''.join(sentences[answer[0]])]
            answerDic["percentage"] = (int)(answer[1] * 100)
            answerList.append(answerDic)
            #answerList.append(dic[''.join(sentences[answer[0]])])
            #print(dic[''.join(sentences[answer[0]])])
        print(resultList)
    reDic = {}
    reDic["answer"] = answerList
    reDic["cnt"] = len(answerList)
    print(reDic)
    return reDic
Example #9
def train_and_save_indexer(corpus, dct, file_name='model_100_indexer.model'):
    index_temp = get_tmpfile("index")
    indexer = Similarity(output_prefix=index_temp,
                         corpus=corpus,
                         num_features=len(dct),
                         num_best=6)
    indexer.save(file_name)
    return indexer
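A possible follow-up that reloads the saved indexer and queries it; reusing `dct` (the same Dictionary passed to train_and_save_indexer) and the query text are assumptions.

from gensim.similarities import Similarity

indexer = Similarity.load('model_100_indexer.model')
query_bow = dct.doc2bow("example query tokens".split())  # same Dictionary used to build the corpus
print(indexer[query_bow])  # with num_best=6, a list of (document_id, similarity) pairs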
Example #10
 def similarity(self, sent1, sent2):
     text1 = self.wordTokenize(sent1)
     text2 = self.wordTokenize(sent2)
     texts = [text1, text2]
     dictionary = corpora.Dictionary(texts)
     corpus = [dictionary.doc2bow(text) for text in texts]
     similarity = Similarity('-Similarity-index', corpus, num_features=len(dictionary))
     return similarity[dictionary.doc2bow(text1)][1]
Example #11
 def similarity_matrix(self, corpus, dictionary):
     """Compute cosine similarity against a corpus of documents by storing the index matrix in memory."""
     # index = MatrixSimilarity(corpus, num_features=len(dictionary))
     index_temp = get_tmpfile("index")
     index = Similarity(index_temp, corpus,
                        num_features=len(dictionary))  # create index
     for sims in index[corpus]:
         pprint(sims)
Example #12
def get_sim(f1, f2):
    c1 = open(f1, encoding='utf8').read()
    c1 = removePunctuation(c1)
    print(c1)
    # segment with jieba
    data1 = jieba.cut(c1)
    data11 = ""
    # collect the segmented tokens
    for i in data1:
        data11 += i + " "
    doc1 = [data11]
    # inspect the segmentation; can be removed once the program works
    print("Segmented content:\n")
    print(doc1)

    t1 = [[word for word in doc.split()] for doc in doc1]
    # print(t1)

    # token frequencies
    freq = defaultdict(int)
    for i in t1:
        for j in i:
            freq[j] += 1
    # print(freq)

    # limit by term frequency (currently disabled)
    '''t2 = [[token for token in k if freq[j] >= 3]
        for k in t1]
    '''

    # build a dictionary from the corpus with corpora.Dictionary

    dic1 = corpora.Dictionary(t1)

    # the file to compare against
    c2 = open(f2, encoding='utf8').read()
    c2 = removePunctuation(c2)

    # segment with jieba
    data2 = jieba.cut(c2)
    data21 = ""
    for i in data2:
        data21 += i + " "
    new_doc = data21
    # print(new_doc)
    # doc2bow turns the document into a sparse vector
    new_vec = dic1.doc2bow(new_doc.split())
    # apply doc2bow to each tokenized document to get the corpus
    new_corpor = [dic1.doc2bow(t3) for t3 in t1]
    # number of features
    featurenum = len(dic1.token2id)
    # build the sparse-matrix similarity index
    idx = Similarity('-Similarity-index', new_corpor, featurenum)
    sims = idx[new_vec]
    f = open(r'/output.txt', 'w')
    print('%.2f' % sims[0], file=f)
    f.close()
    print('%.2f' % sims[0])
Example #13
def main():
    stopword = open('D:\code/test\哈工大停用词表.txt', encoding='utf8')  # load the stop-word list
    stopwordlist = list(jieba.cut(stopword.read()))
    print(stopwordlist)
    try:
        orig_path, add_path, save_path = sys.argv[1:4]
    except Exception as e:
        print(sys.argv)
        print(e)
    # save_path = 'D:\code/test/out.txt'
    # preprocess the source text
    # orig_path = 'D:\code/test/orig.txt'
    orig_file = open(orig_path, 'r', encoding="utf-8")
    text = orig_file.read()
    text = remove_punctuation(text)
    text = list(text)
    afterswlis = []
    for each in text:
        if each not in stopwordlist:
            afterswlis.append(each)
        else:
            continue
    text = afterswlis
    text ="".join(text)
    orig_file.close()
    # preprocess the text to check for duplication
    # add_path = 'D:\code/test/orig_0.8_dis_15.txt'
    add_file = open(add_path, 'r', encoding="utf-8")
    add_text = add_file.read()
    add_file.close()
    add_text = remove_punctuation(add_text)
    add_text = list(add_text)
    afterswlis = []
    for each in add_text:
        if each not in stopwordlist:
            afterswlis.append(each)
        else:
            continue
    add_text = afterswlis
    add_text = "".join(add_text)
    # convert the texts to vectors
    texts = [jieba.lcut(text)]
    dictionary = corpora.Dictionary(texts)
    num_features = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    add_vec = dictionary.doc2bow(jieba.lcut(add_text))
    # compute similarity from the vectors
    similarity = Similarity('-Similarity-index', corpus, num_features)
    # keep two decimal places in the result
    a = similarity[add_vec]
    b = '%.2f' % a[0]
    print("Similarity result: %s" % b)
    # write the result to the specified output file
    f = open(save_path, 'w', encoding="utf-8")
    f.write("Similarity result: %s" % b)
    f.close()
Example #14
 def GetLsm(self, dictionary, corpus):
     lsi = models.lsimodel.LsiModel(
         corpus, id2word=dictionary)  #num_topics=len(corpus)/2
     vec_lsi = lsi[corpus[0]]
     index = Similarity('l_index', corpus, len(dictionary))
     cnt = 0
     for similarities in index:
         if cnt == 1:
             return list(enumerate(similarities))
         cnt += 1
Example #15
 def tf_text2vector(self):
     try:
         dct = self.tf_parameters["tf_dictionary"]
         rules, corpus =  zip(*self.tf_parameters["tf_rules_corpus"])
         txt_corp = dct.doc2bow(self.lemm_txt.split())
         index = Similarity(None, corpus, num_features=len(dct)) 
         rules_similarity = list(zip(rules, index[txt_corp]))
         return rules_similarity
     except:
         return None
Example #16
 def GetTfidf(self, dictionary, corpus):
     tfidf = models.TfidfModel(corpus)
     vec_lsi = tfidf[corpus[0]]
     index = Similarity('t_index', corpus, len(dictionary))
     #tsims = index[vec_lsi]
     cnt = 0
     for similarities in index:
         if cnt == 1:
             return list(enumerate(similarities))
         cnt += 1
Example #17
def get_sim(model, corps):
    """get Similarity for corpus and model

    Args:
        model (TfIdfModel): TfIdf model to develop Similarity
        corps (Dictionary): Dictionary of words 

    Returns:
        [type]: [description]
    """
    return Similarity(None, model[corps], num_features=400)
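An illustrative call site, under the assumption that `corps` is a bag-of-words corpus built from a Dictionary with at most 400 entries (the hard-coded num_features above); `tfidf` and `corpus` are hypothetical names.

# Illustrative only: `tfidf` is a TfidfModel trained on `corpus`, a list of doc2bow vectors.
index = get_sim(tfidf, corpus)
print(index[tfidf[corpus[0]]])  # similarities of the first document against the whole corpus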
Example #18
 def __init__(self, loader_obj):
     self.model_types = [("lsi", None)]
     self.model = loader_obj
     self.tknz = TokenizerApply(self.model)
     self.tkz_model = self.tknz.model_tokenize()
     self.et_vectors = self.tkz_model.application_field["texts"]
     self.coeffs = self.tkz_model.application_field["coeff"]
     self.tags = self.tkz_model.application_field["tags"]
     self.index = Similarity(
         None,
         self.et_vectors,
         num_features=self.model.texts_algorithms["num_topics"])
Example #19
def main(path="train.json"):
    #get a random question
    quest = select_question(path)
    print("Random question : ")
    print(quest)
    #Tokenize and create the gensim dictionary
    dictionary, corpus_quest = process_question(quest)
    tfidf = gensim.models.TfidfModel(corpus_quest)
    #process the corpus of contexts
    ctx = import_context(path)
    corpus = process_contexts(ctx)
    #Global corpus dictionary
    corpus = final_process_context(corpus, dictionary)
    dir_for_index = get_tmpfile("index_sim")
    #Similarity function to compare each context to the question
    sim = Similarity(dir_for_index, corpus, num_features=len(dictionary))
    #result list of similarity scores
    res = (sim[corpus_quest].tolist()[0])

    #Get the 3 most similar contexts from the result list
    max_index = sorted(range(len(res)), key=lambda sub: res[sub])[-3:]
    #create dict of index (to be able to find the context in the context list) and similarity value
    dict_best = {}
    for e in max_index:
        dict_best[e] = res[e]
    #get index of best value
    best_index = max(dict_best, key=dict_best.get)
    print("Best context")
    print(ctx[best_index])
    #use function sim_metric to find out if it is the appropriate context (it will return 1)
    sim_accuracy = sim_metric(quest, ctx[best_index])
    print("similarity metric", sim_accuracy)

    #Look for other good solutions if the first option is not satisfactory
    #top3metric tells if there is an adequate solution in the 3 most similar contexts returned
    #if the first example was a good fit, it will automatically return 1
    top3metric = sim_accuracy
    if sim_accuracy == 0:
        other_solutions_index = []
        for j, value in dict_best.items():
            if (j != best_index):
                other_solutions_index.append(j)
        if len(other_solutions_index) != 0:
            print("Autres solutions possibles")
            for k in other_solutions_index:
                #print(ctx[k])
                metric = sim_metric(quest, ctx[k])
                if (metric == 1):
                    top3metric = 1
                print("similarity metric", metric)

    return [sim_accuracy, top3metric]
Example #20
def getSimilarity(df_content_o):
    logging.debug('preparing docSim')
    raw_documents = list(df_content_o['content'])
    corpora_documents = []
    for item_text in raw_documents:
        item_str = item_text.split(' ')
        corpora_documents.append(item_str)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    nf = len(set(itertools.chain.from_iterable(corpora_documents))) + 1
    similarity = Similarity('-Similarity-index', corpus, num_features=nf)
    similarity.num_best = max_similar_num
    return similarity, dictionary
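A hedged sketch of querying the returned index; the DataFrame name, the query text, and the value of max_similar_num are assumptions.

similarity, dictionary = getSimilarity(df_content)   # df_content holds a whitespace-segmented 'content' column
query_bow = dictionary.doc2bow("some segmented query text".split())
print(similarity[query_bow])  # the max_similar_num best (document_id, score) pairs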
Example #21
 def tfidf_text2vector(self):
     try:
         dct = self.tfidf_parameters["tf_idf_dictionary"]
         tfidf_model = self.tfidf_parameters["tfidf_model"]
         rules, corpus =  zip(*self.tfidf_parameters["tf_idf_rules_corpus"])
         txt_corp = dct.doc2bow(self.lemm_txt.split())
         txt_tf_idf_vect = tfidf_model[txt_corp]
         corpus_tf_idf_vects = [tfidf_model[x] for x in corpus]
         index = Similarity(None, corpus_tf_idf_vects, num_features=len(dct)) 
         rules_similarity = list(zip(rules, index[txt_tf_idf_vect]))
         return rules_similarity
     except:
         return None
Example #22
 def __init__(self, initializer):
     preprocessed_documents = initializer.getPreprocessedDocuments()
     dictionary = initializer.getDictionary()
     corpus = [dictionary.doc2bow(text) for text in preprocessed_documents]
     tf_idf = initializer.getTfIdf()
     query_doc_tf_idf = tf_idf[dictionary.doc2bow(
         preprocessed_documents[0])]
     similarity_object = Similarity('tfidf',
                                    tf_idf[corpus],
                                    num_features=len(dictionary))
     similarities = similarity_object[query_doc_tf_idf]
     similarity_object.destroy()
     self.scores = similarities[1:len(similarities)]
Example #23
 def lsi_indexes_fill(self):
     try:
         dct = self.kwargs["lsi_parameters"]["dictionary"]
         lsi_model = self.kwargs["lsi_parameters"]["model"]
         rules, corpus = zip(*self.kwargs["lsi_parameters"]["rules_corpus"])
         txt_corp = dct.doc2bow(self.lemm_txt.split())
         txt_vect = lsi_model[txt_corp]
         corpus_vects = [lsi_model[x] for x in corpus]
         index = Similarity(None, corpus_vects, num_features=self.kwargs["lsi_parameters"]["num_topics"])
         rules_similarity = list(zip(rules, index[txt_vect]))
         return rules_similarity
     except:
         return None
Example #24
def tdif_metrics(corpus_path: str) -> None:
    prep = DSSMPrepare()
    raw_ques, raw_docs, rels = prep.from_one_corpus(corpus_path)

    docs = [[w.lower() for w in word_tokenize(text)]
            for text in raw_docs.values()]
    ques = [[w.lower() for w in word_tokenize(text)]
            for text in raw_ques.values()]
    docs = docs + ques
    dictionary = Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    tf_idf = TfidfModel(corpus)

    right = {}
    for did, doc_text in raw_docs.items():
        dense_input = [w.lower() for w in word_tokenize(doc_text)]
        dense_input = dictionary.doc2bow(dense_input)
        dense_input = tf_idf[dense_input]
        right[did] = dense_input

    left = {}
    for qid, ques_text in raw_ques.items():
        dense_input = [w.lower() for w in word_tokenize(ques_text)]
        dense_input = dictionary.doc2bow(dense_input)
        dense_input = tf_idf[dense_input]
        left[qid] = dense_input

    relations = pd.DataFrame(rels, columns=['label', 'id_left', 'id_right'])
    res = {}
    res['MAP'] = 0.0
    res['NDCG@3'] = 0.0
    res['NDCG@5'] = 0.0
    num_valid = 0
    for group in relations.groupby('id_left'):
        qid, data = group
        dids = data['id_right'].values.tolist()
        labels = data['label'].values.tolist()
        c = [right[did] for did in dids]
        sims = Similarity('tf_idf', tf_idf[c], num_features=len(dictionary))
        scores = sims[left[qid]]
        rank = list(zip(labels, scores))
        random.shuffle(rank)
        rank = sorted(rank, key=lambda x: x[1], reverse=True)
        rank = [float(r[0]) for r in rank]
        res['MAP'] += average_precision(rank)
        res['NDCG@3'] += ndcg_at_k(rank, 3)
        res['NDCG@5'] += ndcg_at_k(rank, 5)
        num_valid += 1

    click.echo('\t'.join([f"{k}={v / num_valid:.3f}" for k, v in res.items()]))
Example #25
def calculateAverageSimilarity(singleDocument, arrayOfDocuments):
    gen_docs = [
        stemAndTokenizeArray(document) for document in arrayOfDocuments
    ]
    dictionary = Dictionary(gen_docs)
    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    tf_idf = TfidfModel(corpus)
    sims = Similarity('/tmp/', tf_idf[corpus], num_features=len(dictionary))
    query_doc = stemAndTokenizeArray(singleDocument)
    query_doc_bow = dictionary.doc2bow(query_doc)
    query_doc_tf_idf = tf_idf[query_doc_bow]

    similarities = sims[query_doc_tf_idf]
    return sum(similarities) / len(similarities)
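For context, a minimal hedged call, assuming stemAndTokenizeArray splits and stems plain strings as its name suggests; the example strings are illustrative.

score = calculateAverageSimilarity("a fast red sports car",
                                   ["a quick crimson automobile", "a slow green truck"])
print(score)  # mean TF-IDF cosine similarity of the query against the two documents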
Example #26
def main():
  orig_qns = [qn.strip() for qn in open('data/questions.txt')]
  aug = [qn.strip() for qn in open('data/augmented.txt')]
  all_qns = []
  for idx, qn in tqdm(enumerate(orig_qns)):
    all_qns.append(qn)
    if aug[idx] != qn:
      all_qns.append(aug[idx])
  print("Combined original questions and augmented questions")
  pickle.dump(all_qns, open("precompute/questions.pkl", 'wb'))

  qns = pickle.load(open("precompute/questions.pkl", 'rb'))
  documents = []
  for qn in tqdm(qns):
    document = get_similar.preprocess_text(qn)
    if len(document) < 1:
      document = ['UNK']
    documents.append(document)

  print(f"Finished preprocessing {len(documents)} questions")
  pickle.dump(documents, open("precompute/documents.pkl", "wb"))
  print("Saved tokens to documents.pkl")
  documents = pickle.load(open("precompute/documents.pkl", "rb"))
  
  dct = corpora.Dictionary(documents)
  pickle.dump(dct, open("precompute/dct.pkl", 'wb'))
  dct.save('precompute/dct.dict')
  dct = corpora.Dictionary.load('precompute/dct.dict')
  
  corpus = [dct.doc2bow(doc) for doc in tqdm(documents)]
  pickle.dump(corpus, open("precompute/corpus.pkl", 'wb'))
  print("Corpus generated")

  tfidf = models.TfidfModel(corpus, smartirs='bfn')
  pickle.dump(tfidf, open("precompute/tfidf_model.pkl", 'wb'))
  corpus_tfidf = tfidf[corpus]
  pickle.dump(corpus_tfidf, open("precompute/corpus_tfidf.pkl", 'wb'))
  print("tfidf generated")

  index_temp = get_tmpfile("index")
  index = Similarity(index_temp, corpus_tfidf, num_features=len(dct), num_best=100)
  index.save("precompute/similarities.pkl")
  print("Similarity index saved")

  PIPE = subprocess.PIPE
  #NLU = subprocess.Popen(['rasa', 'train', '--data', ' nlu-train-data', '--fixed-model-name', 'model', '-vv', 'nlu'], stdout=PIPE, stderr=PIPE)
  NLU = subprocess.Popen(['rasa', 'train', 'nlu', '-u', 'nlu-train-data', '--config', 'config.yml', '--fixed-model-name', 'model'])
  NLU.wait()
  print("Rasa NLU trained")
Example #27
 def cosine_similarity_no_syn(self):
     print("Cosine Similarity without synsets")
     cos_sim = []
     for data in self.data:
         sent1 = [word[0] for word in data[1]]
         sent2 = [word[0] for word in data[2]]
         text = [sent1] + [sent2]
         sent_dict = corpora.Dictionary(text)
         corpus = [sent_dict.doc2bow(t) for t in text]
         sim = Similarity('-Similarity-index',
                          corpus,
                          num_features=len(sent_dict))
         test_corpus_1 = sent_dict.doc2bow(sent1)
         cos_sim_each = sim[test_corpus_1][1]
         cos_sim.append(cos_sim_each)
     self.feature['cos_sim_no_syn'] = cos_sim
Example #28
def main(dataset_path):
    if not os.path.exists('../data/retriever/paragraph-ids.txt'):
        print('Writing paragraph ID to file...')
        with open('../data/retriever/paragraph-ids.txt', 'w') as f:
            for paragraph_id in load_ids(dataset_path):
                f.write(paragraph_id + '\n')

    dictionary_path = '../data/retriever/dct.pkl'
    if not os.path.exists(dictionary_path):
        print('Creating dictionary...')
        st = time.time()
        dct = Dictionary(load_paragraphs(dataset_path), prune_at=5000000)
        dct.save(dictionary_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating dictionary in {et - st}s.')
    else:
        print('Loading dictionary...')
        dct = Dictionary.load(dictionary_path)
        print('Dictionary loaded.')

    tfidf_path = '../data/retriever/tfidf.pkl'
    if not os.path.exists(tfidf_path):
        print('Creating model...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        model = TfidfModel(corpus)
        model.save(tfidf_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating model in {et - st}s.')
    else:
        print('Loading model...')
        model = TfidfModel.load(tfidf_path)
        print('Model loaded.')

    index_path = '../data/retriever/indexes/master-index'
    if not os.path.exists(index_path):
        print('Creating index...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        index = Similarity('../data/retriever/indexes/index', model[corpus],
                           len(dct))
        index.save(index_path)
        et = time.time()
        print(f'\rFinished creating index in {et - st}s.')
        print('Done')
    else:
        print('Nothing to do. Exiting...')
Example #29
def get_docsim_feature(contents, remarks=""):

    dictionary_path = Config.cache_dir + "/docsim/dic_%s.pkl" % remarks
    corpus_path = Config.cache_dir + "/docsim/corpus_%s.pkl" % remarks
    corpora_documents = []
    tokenizer = Tokenizer()
    for item_text in contents:
        item_str = tokenizer(item_text)
        corpora_documents.append(item_str)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    similarity = Similarity('-Similarity-index', corpus, num_features=300)
    similarity.num_best = 3
    pickle.dump(dictionary, open(dictionary_path, "wb"), protocol=4)
    pickle.dump(corpus, open(corpus_path, "wb"), protocol=4)

    return similarity, corpus
Example #30
def create_sim_matrix(tfidf, corpus, dictionary, outputDir):
    """"
    Creates a Gensim simiariry matrix for document similarity comparison and saves it
    
    tfidf (Gensim tfidf model): Gensim tfidf model
    corpus (Gensim corpus object): Gensim corpus
    dictionary (Gensim dictionary object): Gensim dictionary
    outputDir (string): Location to save matrix
    """
    indicesFile = outputDir + 'indices'
    simFile = outputDir + 'Index'
    sims = Similarity(indicesFile,
                      tfidf[corpus],
                      num_features=(len(dictionary)))
    sims.close_shard()
    sims.save(simFile)
    print('Similarity matrix created and stored at: ' + simFile)
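A hedged sketch of reloading the saved index later and querying it; it assumes the same `tfidf` model, `dictionary`, and `outputDir` are still available, and the query tokens are illustrative.

from gensim.similarities import Similarity

sims = Similarity.load(outputDir + 'Index')
query_tfidf = tfidf[dictionary.doc2bow("example query tokens".split())]
print(sims[query_tfidf])  # cosine similarity of the query against every indexed document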