Example No. 1
 def create_similarity_index(self):
     if not os.path.isfile(self.similarity_file):
         self.similarity_index = Similarity('./LSM/', self.corpora,
                                            self.num_topics)
         self.similarity_index.save(self.similarity_file)
     else:
         self.similarity_index = Similarity.load(self.similarity_file)
Example No. 2
def train_and_save_indexer(corpus, dct, file_name='model_100_indexer.model'):
    index_temp = get_tmpfile("index")
    indexer = Similarity(output_prefix=index_temp,
                         corpus=corpus,
                         num_features=len(dct),
                         num_best=6)
    indexer.save(file_name)
    return indexer
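A brief usage sketch for the saved indexer above — hedged, since the query text and the assumption that `dct` (the Dictionary passed to `train_and_save_indexer`) is still in scope are not part of the original example:

from gensim.similarities import Similarity

# Hypothetical query: num_best=6 makes the result a list of up to six
# (document_id, cosine_similarity) pairs rather than a full score vector.
indexer = Similarity.load('model_100_indexer.model')
query_bow = dct.doc2bow("some new query text".split())
top_matches = indexer[query_bow]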
Example No. 3
class LSM:
    def __init__(self, model_name, index):
        self.model_name = model_name

        if self.model_name == 'LSI':
            self.model_file = lsi_model_file
            self.corpora_file = lsi_corpora_file
            self.similarity_file = lsi_sim_file
            self.num_topics = LSI_TOPICS
        elif self.model_name == 'LDA':
            self.model_file = lda_model_file
            self.corpora_file = lda_corpora_file
            self.similarity_file = lda_sim_file
            self.num_topics = LDA_TOPICS

        if not os.path.isfile(mm_corpus_file) or not os.path.isfile(dict_file):
            self.corpus = CorpusConnector(index)
            corpora.MmCorpus.serialize(mm_corpus_file, self.corpus)
            self.corpus.save_dict()
            self.dictionary = self.corpus.dictionary
        else:
            self.dictionary = corpora.Dictionary.load(dict_file)
            self.corpus = corpora.MmCorpus(mm_corpus_file)

        self.model = None
        self.corpora = None
        self.similarity_index = None

    def create_model(self):
        if not os.path.isfile(self.model_file):
            if self.model_name == 'LSI':
                self.model = lsimodel.LsiModel(corpus=self.corpus,
                                               id2word=self.dictionary,
                                               num_topics=self.num_topics)
            else:
                self.model = ldamodel.LdaModel(corpus=self.corpus,
                                               num_topics=self.num_topics,
                                               id2word=self.dictionary)
            self.model.save(self.model_file)

            self.corpora = self.model[self.corpus]
            corpora.MmCorpus.serialize(self.corpora_file, self.corpora)
        else:
            self.corpora = gensim.corpora.MmCorpus(self.corpora_file)
            if self.model_name == 'LSI':
                self.model = gensim.models.LsiModel.load(self.model_file)
            else:
                self.model = gensim.models.LdaModel.load(self.model_file)

    def create_similarity_index(self):
        if not os.path.isfile(self.similarity_file):
            self.similarity_index = Similarity('./LSM/', self.corpora,
                                               self.num_topics)
            self.similarity_index.save(self.similarity_file)
        else:
            self.similarity_index = Similarity.load(self.similarity_file)
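A minimal query sketch for the class above, assuming the model and index have been created; the query string and the `index` constructor argument are placeholders, not part of the original code:

lsm = LSM('LSI', index)
lsm.create_model()
lsm.create_similarity_index()
# Project the query through the trained model, then ask the index for
# cosine similarities against every document in the corpus.
query_bow = lsm.dictionary.doc2bow("some query text".split())
sims = lsm.similarity_index[lsm.model[query_bow]]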
Example No. 4
 def __init__(self):
     self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
     self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
     self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
     self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
     self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
     self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
     self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
     self.job_labels = {
         int(k): v
         for k, v in (line.split("=") for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
     }
Example No. 5
def vars():
    mydct = load('mydct.joblib')
    noval_corp = load('noval_corp.joblib')
    noval_ind = get_tmpfile('noval_index')  # distinct prefix so the two indexes don't overwrite each other's shard files
    noval_index = Similarity(noval_ind, noval_corp, len(mydct))
    val_corp = load('val_corp.joblib')
    val_ind = get_tmpfile('val_index')
    val_index = Similarity(val_ind, val_corp, len(mydct))
    pca8 = load('pca8.joblib')
    nlp = yelp_tool.spacy.load('en_core_web_md', disable=['tagger', 'ner'])
    read = yelp_tool.Readability()
    nlp.add_pipe(read, last=True)
    return mydct, noval_index, val_index, pca8, nlp
Example No. 6
 def __init__(self, initializer):
     preprocessed_documents = initializer.getPreprocessedDocuments()
     dictionary = initializer.getDictionary()
     corpus = [dictionary.doc2bow(text) for text in preprocessed_documents]
     tf_idf = initializer.getTfIdf()
     query_doc_tf_idf = tf_idf[dictionary.doc2bow(
         preprocessed_documents[0])]
     similarity_object = Similarity('tfidf',
                                    tf_idf[corpus],
                                    num_features=len(dictionary))
     similarities = similarity_object[query_doc_tf_idf]
     similarity_object.destroy()
     self.scores = similarities[1:len(similarities)]
Example No. 7
def getSimilarity(df_content_o):
    logging.debug('preparing docSim')
    raw_documents = list(df_content_o['content'])
    corpora_documents = []
    for item_text in raw_documents:
        item_str = item_text.split(' ')
        corpora_documents.append(item_str)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    nf = len(set(itertools.chain.from_iterable(corpora_documents))) + 1
    similarity = Similarity('-Similarity-index', corpus, num_features=nf)
    similarity.num_best = max_similar_num
    return similarity, dictionary
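As a hedged illustration of how the returned pair might be used (the query text is an assumption; `num_best` was set to `max_similar_num` above, so the result is a list of the top matches):

similarity, dictionary = getSimilarity(df_content_o)
query_bow = dictionary.doc2bow("some new document text".split(' '))
best_matches = similarity[query_bow]  # [(doc_id, score), ...] for the closest documents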
Example No. 8
def main():
  orig_qns = [qn.strip() for qn in open('data/questions.txt')]
  aug = [qn.strip() for qn in open('data/augmented.txt')]
  all_qns = []
  for idx, qn in tqdm(enumerate(orig_qns)):
    all_qns.append(qn)
    if aug[idx] != qn:
      all_qns.append(aug[idx])
  print("Combined original questions and augmented questions")
  pickle.dump(all_qns, open("precompute/questions.pkl", 'wb'))

  qns = pickle.load(open("precompute/questions.pkl", 'rb'))
  documents = []
  for qn in tqdm(qns):
    document = get_similar.preprocess_text(qn)
    if len(document) < 1:
      document = ['UNK']
    documents.append(document)

  print(f"Finished preprocessing {len(documents)} questions")
  pickle.dump(documents, open("precompute/documents.pkl", "wb"))
  print("Saved tokens to documents.pkl")
  documents = pickle.load(open("precompute/documents.pkl", "rb"))
  
  dct = corpora.Dictionary(documents)
  pickle.dump(dct, open("precompute/dct.pkl", 'wb'))
  dct.save('precompute/dct.dict')
  dct = corpora.Dictionary.load('precompute/dct.dict')
  
  corpus = [dct.doc2bow(doc) for doc in tqdm(documents)]
  pickle.dump(corpus, open("precompute/corpus.pkl", 'wb'))
  print("Corpus generated")

  tfidf = models.TfidfModel(corpus, smartirs='bfn')
  pickle.dump(tfidf, open("precompute/tfidf_model.pkl", 'wb'))
  corpus_tfidf = tfidf[corpus]
  pickle.dump(corpus_tfidf, open("precompute/corpus_tfidf.pkl", 'wb'))
  print("tfidf generated")

  index_temp = get_tmpfile("index")
  index = Similarity(index_temp, corpus_tfidf, num_features=len(dct), num_best=100)
  index.save("precompute/similarities.pkl")
  print("Similarity index saved")

  PIPE = subprocess.PIPE
  #NLU = subprocess.Popen(['rasa', 'train', '--data', ' nlu-train-data', '--fixed-model-name', 'model', '-vv', 'nlu'], stdout=PIPE, stderr=PIPE)
  NLU = subprocess.Popen(['rasa', 'train', 'nlu', '-u', 'nlu-train-data', '--config', 'config.yml', '--fixed-model-name', 'model'])
  NLU.wait()
  print("Rasa NLU trained")
Example No. 9
def initiate_recommender():
    # Retrieve all the necessary files for the recommender system
    baseDir = settings.BASE_DIR

    # Load dictionary and corpus
    dictFile = baseDir + "/static/data/DBLP_Dictionary.dict"
    corpusFile = baseDir + "/static/data/DBLP_Corpus.mm"

    dictionary = corpora.Dictionary.load(dictFile)
    corpus = corpora.MmCorpus(corpusFile)

    # Load the TF-IDF model
    tfidfFile = baseDir + "/static/data/TF-IDF"

    tfidf = models.TfidfModel.load(tfidfFile)

    # Load the Gensim similarity index
    indexFile = baseDir + "/static/data/Index"
    sims = Similarity.load(indexFile)

    # If matrix fits in memory, use this instead and comment out previous two lines
    #sims = MatrixSimilarity(tfidf[corpus], num_features=(len(dictionary)))

    # Point to the text csv file
    textFile = baseDir + "/static/data/Text.csv"

    # Load ID dataframe from recommender
    paperIDs = baseDir + "/static/data/AbsID.csv"
    cols = ["paperID"]
    dfIDs = pd.read_csv(paperIDs, names=cols, header=None)

    return dictionary, corpus, tfidf, sims, textFile, dfIDs
Example No. 10
    def build_index(
        self, premises: Iterable[Sentence]
    ) -> Tuple[Similarity, Callable[[TokenList], Vector], Iterable[Sentence]]:
        """Builds an index from given premises that can be used to answer similarity queries."""

        if Irsel.index_cache:
            # if an index has already been built for these TF-IDF parameters, reuse it
            cached_smart, cached_dimensions, cached_index, cached_query_transformer, cached_premises = Irsel.index_cache
            if cached_smart == self.smart and cached_dimensions == self.dimensions and cached_premises is premises:
                printq("Hitting index cache.")
                return cached_index, cached_query_transformer, cached_premises
            else:
                printq("Skipping index cache.")

        dictionary, corpus = self.build_corpus(
            premises)  # create a term-document matrix
        corpus, query_transformer = self.transform_corpus(
            dictionary, corpus)  # apply TF-IDF and LSI models

        with Message("Storing index"):
            # Builds an index which we can compare queries against.
            index = Similarity(get_tmpfile("irsel_index"),
                               corpus,
                               num_features=len(dictionary))
        printq(index)

        # allows us to reuse this index for later proof attempts with the same parameters
        Irsel.index_cache = self.smart, self.dimensions, index, query_transformer, premises
        return index, query_transformer, premises
Example No. 11
    def __init__(self, model_prefix=None, num_best=None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix +
                                                    '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix +
                                           '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        self.similarity_index = Similarity.load(model_prefix +
                                                '_similarity.index',
                                                mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
Example No. 12
 def cosine_similarity_only_syn(self):
     print("Cosine Similarity with only synsets")
     cos_sim = []
     for data in self.data:
         sent1 = [word[0] for word in data[1]]
         sent2 = [word[0] for word in data[2]]
         sent3, sent4 = [], []
         for word in sent1:
             if self.preprocessdata_o.synsets.get(word):
                 sent3.append(
                     list(self.preprocessdata_o.synsets.get(word))[0])
         sent1 += sent3
         for word in sent2:
             if self.preprocessdata_o.synsets.get(word):
                 sent4.append(
                     list(self.preprocessdata_o.synsets.get(word))[0])
         sent2 += sent4
         text = [sent3] + [sent4]
         sent_dict = corpora.Dictionary(text)
         corpus = [sent_dict.doc2bow(t) for t in text]
         sim = Similarity('-Similarity-index',
                          corpus,
                          num_features=len(sent_dict))
         test_corpus_1 = sent_dict.doc2bow(sent1)
         cos_sim_each = sim[test_corpus_1][1]
         cos_sim.append(cos_sim_each)
     self.feature['cos_sim_only_syn'] = cos_sim
Example No. 13
def create_document_similarity_model(alternate_path=False) -> dict:
    DATA_MODEL_NAME = "data_model.pickle"
    DICT_MODEL_NAME = "dictSim.pickle"
    INDEX_NAME = "gensim_index.pickle"

    #initial word tokenization
    if not os.path.exists(DATA_MODEL_NAME):
        print("loading data files from scratch")
        train_X, train_Y = load_robots_txt_files(alternate_path)
        save_model((train_X, train_Y), DATA_MODEL_NAME)
    else:
        print("loading data files by pickle")
        train_X, train_Y = load_model(DATA_MODEL_NAME)

    #create gensim dictionary
    if not os.path.exists(DICT_MODEL_NAME):
        print("loading gensim dict from scratch")
        gensim_dict = Dictionary(train_X)
        save_model(gensim_dict, DICT_MODEL_NAME)
    else:
        print("loading gensim dict with pickle")
        gensim_dict = load_model(DICT_MODEL_NAME)

    #create lookable index
    if not os.path.exists(INDEX_NAME):
        print("building index from scratch")
        iterator = tqdm(map(lambda x: gensim_dict.doc2bow(x), train_X))
        index = Similarity("gensim_index.models",
                           corpus=iterator,
                           num_features=len(gensim_dict) + 1,
                           num_best=100)
        save_model(index, INDEX_NAME)
    else:
        print("loading index with pickle")
        index = load_model(INDEX_NAME)
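A possible follow-up query against the index built above (illustrative only: the token list is made up, and `gensim_dict` and `index` would have to be returned by the function for this to run):

query_tokens = ["user-agent", "disallow", "sitemap"]  # hypothetical robots.txt tokens
query_bow = gensim_dict.doc2bow(query_tokens)
top_100 = index[query_bow]  # with num_best=100: a list of (doc_id, score) pairs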
Example No. 14
def check(docs, target):
    """
    Calculate the similarity between target and docs.

    Parameters
    ----------
    docs: list
        A list of strings to be compared against
    target: string
        The target string to be compared

    Returns
    -------
    float
        The average cosine similarity between target and docs

    """
    stemmer = PorterStemmer()

    tok_docs = [tokenize(text) for text in docs]
    stem_docs = [[stemmer.stem(word) for word in doc] for doc in tok_docs]

    dictionary = Dictionary(stem_docs)
    corpus = [dictionary.doc2bow(doc) for doc in stem_docs]
    tfidf = TfidfModel(corpus)
    sims = Similarity('/tmp/sims.index',
                      tfidf[corpus],
                      num_features=len(dictionary))

    query = [stemmer.stem(word) for word in tokenize(target)]
    query_bow = dictionary.doc2bow(query)
    query_tfidf = tfidf[query_bow]

    return sum(sims[query_tfidf]) / len(sims[query_tfidf])
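An illustrative call, with made-up strings:

docs = ["the cat sat on the mat", "dogs chase cats around the yard"]
score = check(docs, "a cat sitting on a mat")
print(score)  # average cosine similarity of the target against all docs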
Example No. 15
def get_bow(graph, with_children=False):
    docs = []

    for vertex in graph.vertices():
        articles_text = ""
        for article in graph.vp.articles[vertex]:
            articles_text = articles_text + article

        docs.append(articles_text.split())

    # create & save a dictionary

    # # remove common words and tokenize
    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document if word not in stoplist]
             for document in docs]
    #
    # # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    dictionary = corpora.Dictionary(texts)
    for vertex in graph.vertices():
        category_articles = string.join(graph.vp.articles[vertex]).split()
        graph.vp.bow[vertex] = dictionary.doc2bow(category_articles)
    dictionary.save('/tmp/bag_of_words.dict')
    corpus = [graph.vp.bow[vertex] for vertex in graph.vertices()]
    index = Similarity('/tmp/tst',
                       corpus=corpus,
                       num_features=len(dictionary))
Example No. 16
def main(dataset_path):
    if not os.path.exists('../data/retriever/paragraph-ids.txt'):
        print('Writing paragraph ID to file...')
        with open('../data/retriever/paragraph-ids.txt', 'w') as f:
            for paragraph_id in load_ids(dataset_path):
                f.write(paragraph_id + '\n')

    dictionary_path = '../data/retriever/dct.pkl'
    if not os.path.exists(dictionary_path):
        print('Creating dictionary...')
        st = time.time()
        dct = Dictionary(load_paragraphs(dataset_path), prune_at=5000000)
        dct.save(dictionary_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating dictionary in {et - st}s.')
    else:
        print('Loading dictionary...')
        dct = Dictionary.load(dictionary_path)
        print('Dictionary loaded.')

    tfidf_path = '../data/retriever/tfidf.pkl'
    if not os.path.exists(tfidf_path):
        print('Creating model...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        model = TfidfModel(corpus)
        model.save(tfidf_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating model in {et - st}s.')
    else:
        print('Loading model...')
        model = TfidfModel.load(tfidf_path)
        print('Model loaded.')

    index_path = '../data/retriever/indexes/master-index'
    if not os.path.exists(index_path):
        print('Creating index...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        index = Similarity('../data/retriever/indexes/index', model[corpus],
                           len(dct))
        index.save(index_path)
        et = time.time()
        print(f'\rFinished creating index in {et - st}s.')
        print('Done')
    else:
        print('Nothing to do. Exiting...')
Example No. 17
def find_answer(question):
    # Tokenize the input question
    question = question.replace('\t', '').replace(' ', '')  # .replace('\n', '')
    question_gen = jieba.cut(question)
    questionList = list(question_gen)
    question_seg = " ".join(questionList)
    print(question_seg)
    print(question, question_gen, questionList, question_seg)
    answerList = []

    # Check whether the question can be answered from the knowledge base
    if is_KB_QA(question_seg):
        print("Is KB QA:")
        info_list = KB_answer(questionList)
        for answer in info_list:
            answerDic = {}
            answerDic["answer"] = answer[2] + "为" + answer[3]
            answerDic["percentage"] = (int)(answer[0] * 100)
            answerList.append(answerDic)
    # If the answer list is empty, look for a similar answer among previously answered questions
    if not answerList:
        print("Is not KB QA:")
        # Build a question-to-answer dictionary
        dic = {}
        question, answer = getSellerQA(item_id)
        #with open(SENTENCE_PATH, "r", encoding="utf-8") as question:
        #    with open(ANSWER_PATH, "r", encoding="utf-8") as answer:
        for q, a in zip(question, answer):
            dic[q] = a
        # Read the already-tokenized corpus
        sentences = []
        for line in question:
            line = line.replace('\t', '').replace(' ', '')  # .replace('\n', '')
            seg_list = jieba.cut(line)
            sentences.append(list(seg_list))
        print('input done')
        # Build the dictionary and vector corpus
        #pprint(sentences)
        dictionary = corpora.Dictionary(sentences)
        corpus = [dictionary.doc2bow(text) for text in sentences]
        index = Similarity('-Similarity-index', corpus, num_features=400)
        print("training done:", list(question_gen))
        # Find the existing questions most similar to the one asked
        resultList = find_simillar(questionList, dictionary, index)
        # Collect the resulting answers into a list and return it
        for answer in resultList:
            answerDic = {}
            # answerList.append(''.join(sentences[answer[0]]))
            answerDic["answer"] = dic[''.join(sentences[answer[0]])]
            answerDic["percentage"] = (int)(answer[1] * 100)
            answerList.append(answerDic)
            #answerList.append(dic[''.join(sentences[answer[0]])])
            #print(dic[''.join(sentences[answer[0]])])
        print(resultList)
    reDic = {}
    reDic["answer"] = answerList
    reDic["cnt"] = len(answerList)
    print(reDic)
    return reDic
Example No. 18
def get_docsim_feature(contents, remarks=""):

    dictionary_path = Config.cache_dir + "/docsim/dic_%s.pkl" % remarks
    corpus_path = Config.cache_dir + "/docsim/corpus_%s.pkl" % remarks
    corpora_documents = []
    tokenizer = Tokenizer()
    for item_text in contents:
        item_str = tokenizer(item_text)
        corpora_documents.append(item_str)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    similarity = Similarity('-Similarity-index', corpus, num_features=300)
    similarity.num_best = 3
    pickle.dump(dictionary, open(dictionary_path, "wb"), protocol=4)
    pickle.dump(corpus, open(corpus_path, "wb"), protocol=4)

    return similarity, corpus
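A hedged usage sketch (the input text, the `remarks` value, and the `Tokenizer` output format are assumptions; the dictionary is only pickled inside the function, so it is reloaded here to vectorize new text):

similarity, corpus = get_docsim_feature(contents, remarks="demo")
dictionary = pickle.load(open(Config.cache_dir + "/docsim/dic_demo.pkl", "rb"))
query_bow = dictionary.doc2bow(Tokenizer()("a new document to match"))
top3 = similarity[query_bow]  # num_best = 3, so the three closest (doc_id, score) pairs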
Example No. 19
 def similarity(self,sent1,sent2):
     text1 = self.wordTokenize(sent1)
     text2 = self.wordTokenize(sent2)
     texts = [text1, text2]
     dictionary = corpora.Dictionary(texts)
     corpus = [dictionary.doc2bow(text) for text in texts]
     similarity = Similarity('-Similarity-index', corpus, num_features=len(dictionary))
     return similarity[dictionary.doc2bow(text1)][1]
Example No. 20
 def similarity_matrix(self, corpus, dictionary):
     """Compute cosine similarity against a corpus of documents by storing the index matrix in memory."""
     # index = MatrixSimilarity(corpus, num_features=len(dictionary))
     index_temp = get_tmpfile("index")
     index = Similarity(index_temp, corpus,
                        num_features=len(dictionary))  # create index
     for sims in index[corpus]:
         pprint(sims)
Example No. 21
def get_sim(f1, f2):
    c1 = open(f1, encoding='utf8').read()
    c1 = removePunctuation(c1)
    print(c1)
    # Tokenize with jieba
    data1 = jieba.cut(c1)
    data11 = ""
    # Collect the tokenized content
    for i in data1:
        data11 += i + " "
    doc1 = [data11]
    # Inspect the tokenization; can be removed once the program works
    print("分词内容:\n")
    print(doc1)

    t1 = [[word for word in doc.split()] for doc in doc1]
    # print(t1)

    # term frequencies
    freq = defaultdict(int)
    for i in t1:
        for j in i:
            freq[j] += 1
    # print(freq)

    # Limit word frequency
    '''t2 = [[token for token in k if freq[j] >= 3]
        for k in t1]
    '''

    # Build a dictionary with corpora

    dic1 = corpora.Dictionary(t1)

    # The file to compare against
    c2 = open(f2, encoding='utf8').read()
    c2 = removePunctuation(c2)

    # Tokenize with jieba
    data2 = jieba.cut(c2)
    data21 = ""
    for i in data2:
        data21 += i + " "
    new_doc = data21
    # print(new_doc)
    # doc2bow turns the document into a sparse vector
    new_vec = dic1.doc2bow(new_doc.split())
    # Apply doc2bow to the tokenized texts to get the new corpus
    new_corpor = [dic1.doc2bow(t3) for t3 in t1]
    # Number of features
    featurenum = len(dic1.token2id)
    # Sparse-matrix similarity index
    idx = Similarity('-Similarity-index', new_corpor, featurenum)
    sims = idx[new_vec]
    f = open(r'/output.txt', 'w')
    print('%.2f' % sims, file=f)
    f.close()
    print('%.2f' % sims)
Example No. 22
def main():
    stopword = open('D:\code/test\哈工大停用词表.txt', encoding='utf8')  # Load the stop-word list
    stopwordlist = list(jieba.cut(stopword.read()))
    print(stopwordlist)
    try:
        orig_path, add_path, save_path = sys.argv[1:4]
    except Exception as e:
        print(sys.argv)
        print(e)
   # save_path = 'D:\code/test/out.txt'
    # Preprocess the source text
  #  orig_path = 'D:\code/test/orig.txt'
    orig_file = open(orig_path, 'r', encoding="utf-8")
    text = orig_file.read()
    text = remove_punctuation(text)
    text = list(text)
    afterswlis = []
    for each in text:
        if each not in stopwordlist:
            afterswlis.append(each)
        else:
            continue
    text = afterswlis
    text ="".join(text)
    orig_file.close()
    # Preprocess the text to be checked for duplication
   # add_path = 'D:\code/test/orig_0.8_dis_15.txt'
    add_file = open(add_path, 'r', encoding="utf-8")
    add_text = add_file.read()
    add_file.close()
    add_text = remove_punctuation(add_text)
    add_text = list(add_text)
    afterswlis = []
    for each in add_text:
        if each not in stopwordlist:
            afterswlis.append(each)
        else:
            continue
    add_text = afterswlis
    add_text = "".join(add_text)
    # Convert the texts to vectors
    texts = [jieba.lcut(text)]
    dictionary = corpora.Dictionary(texts)
    num_features = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    add_vec = dictionary.doc2bow(jieba.lcut(add_text))
    # Compute similarity from the vectors
    similarity = Similarity('-Similarity-index', corpus, num_features)
    # Convert the type; slice to keep two decimal places
    a = similarity[add_vec]
    b = a[0]
    b = str(b).split('.')[0] + '.' + str(b).split('.')[1][:2]
    print("相似的计算结果:%s" % b)
    # 输出结果写入指定文档
    f = open(save_path, 'w', encoding="utf-8")
    f.write("相似的计算结果:%s" % b)
    f.close()
Example No. 23
	def run(self):
		if self.clean_level in ('raw','clean','stopwords'):
			kind = self.clean_level
		else:
			kind = 'stopwords'

		# Save the similarities to a file with a simple format
		# NOTE: THE INDEX ITSELF ALREADY STORES THE SIMILARITIES; THERE IS NO NEED TO COMPUTE THEM AGAIN
		for idioma, salida in self.output()['langs'].iteritems():
			file_list = os.listdir(os.path.join(self.txt_dir,kind,idioma))
			for n_topics, o in salida.iteritems():
				index = Similarity.load(self.input()['langs'][idioma][n_topics]['lsi-index'].path)

				# JSON
				sims = index2dict(index, file_list, num_sims=self.num_similar_docs)
				with o['json'].open('w') as f:
					json.dump(sims, f)

				# HTML + CSV
				s = u''
				net = pd.DataFrame(columns=['from_name', 'to_name', 'sim'])
				for book, v in sims.iteritems():
					s += u'-------------------------------------------\n'
					s += u'### %s\n\n' % (book)
					s += u'| Ranking | Libro | Similitud |\n|:--------:|:-------|-------------:|\n'''
					for rank, attrs in v.iteritems():
						s += u'| %d | %s | %f |\n' % (rank, attrs['name'], round(attrs['similarity'],3))
						net = net.append(pd.DataFrame({'from_name':[book], 'to_name':[attrs['name']], 'sim':[attrs['similarity']]}))
					s += u'\n\n'
				md = markdown.markdown(s, extensions=['markdown.extensions.tables'])
				books = sorted(list(set(net['from_name']).union(net['to_name'])))
				ids = {v:i for i,v in enumerate(books)}
				net['from'] = [ids[k] for k in net['from_name']]
				net['to'] = [ids[k] for k in net['to_name']]

				with o['html'].open('w') as f:
					f.write(md)
				with o['csv'].open('w') as f:
					net.to_csv(f, index=False)

				# Network (in R)
				tempname = 'net_temp0.html'
				i = 1
				while os.path.exists(tempname):
					tempname = 'net_temp%d.html' % i
					i += 1
					if i >= 100:
						print 'ERROR: No se puede crear la red temporal... Checa que no exista un archivo llamado %s en esta carpeta y que tienes permisos de escritura...' % tempname
						break
				subprocess.call(['itam-d3-network.R', '--input', o['csv'].path, '--output', tempname, '--max_links', str(self.num_similar_docs), '--min_sim', str(self.min_similarity)])
				print 'USER INFO: Creando archivo temporal: ' + tempname
				shutil.move(tempname, o['net'].path)
				print 'USER INFO: Movimiento listo, %s --> %s' % (tempname, o['net'].path)
				
				if os.path.exists(tempname):
					os.remove(tempname)
Example No. 24
 def tf_text2vector(self):
     try:
         dct = self.tf_parameters["tf_dictionary"]
         rules, corpus =  zip(*self.tf_parameters["tf_rules_corpus"])
         txt_corp = dct.doc2bow(self.lemm_txt.split())
         index = Similarity(None, corpus, num_features=len(dct)) 
         rules_similarity = list(zip(rules, index[txt_corp]))
         return rules_similarity
     except:
         return None
Example No. 25
 def GetLsm(self, dictionary, corpus):
     lsi = models.lsimodel.LsiModel(
         corpus, id2word=dictionary)  #num_topics=len(corpus)/2
     vec_lsi = lsi[corpus[0]]
     index = Similarity('l_index', corpus, len(dictionary))
     cnt = 0
     for similarities in index:
         if cnt == 1:
             return list(enumerate(similarities))
         cnt += 1
Example No. 26
 def GetTfidf(self, dictionary, corpus):
     tfidf = models.TfidfModel(corpus)
     vec_lsi = tfidf[corpus[0]]
     index = Similarity('t_index', corpus, len(dictionary))
     #tsims = index[vec_lsi]
     cnt = 0
     for similarities in index:
         if cnt == 1:
             return list(enumerate(similarities))
         cnt += 1
Example No. 27
def get_sim(model, corps):
    """Build a Similarity index for the given corpus under the given model.

    Args:
        model (TfidfModel): TF-IDF model used to transform the corpus
        corps (iterable of list of (int, int)): bag-of-words corpus

    Returns:
        Similarity: similarity index over the TF-IDF-transformed corpus
    """
    return Similarity(None, model[corps], num_features=400)
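For example (a sketch; `dct`, `tfidf`, `corpus`, and the query tokens are assumed to come from the surrounding pipeline and are not defined in the original snippet):

index = get_sim(tfidf, corpus)
query_vec = tfidf[dct.doc2bow(["example", "query", "tokens"])]
scores = index[query_vec]  # cosine similarities against the indexed corpus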
Example No. 28
 def __init__(self, loader_obj):
     self.model_types = [("lsi", None)]
     self.model = loader_obj
     self.tknz = TokenizerApply(self.model)
     self.tkz_model = self.tknz.model_tokenize()
     self.et_vectors = self.tkz_model.application_field["texts"]
     self.coeffs = self.tkz_model.application_field["coeff"]
     self.tags = self.tkz_model.application_field["tags"]
     self.index = Similarity(
         None,
         self.et_vectors,
         num_features=self.model.texts_algorithms["num_topics"])
Example No. 29
def main(path="train.json"):
    #get a random question
    quest = select_question(path)
    print("Random question : ")
    print(quest)
    #Tokenize and create gensim dictionary
    dictionary, corpus_quest = process_question(quest)
    tfidf = gensim.models.TfidfModel(corpus_quest)
    #corpus of contexts processing
    ctx = import_context(path)
    corpus = process_contexts(ctx)
    #Global corpus dictionary
    corpus = final_process_context(corpus, dictionary)
    dir_for_index = get_tmpfile("index_sim")
    #Similarity function to compare each context to the question
    sim = Similarity(dir_for_index, corpus, num_features=len(dictionary))
    #result list of similarity scores
    res = (sim[corpus_quest].tolist()[0])

    #Get the 3 most similar contexts from the result list
    max_index = sorted(range(len(res)), key=lambda sub: res[sub])[-3:]
    #create dict of index (to be able to find the context in the context list) and similarity value
    dict_best = {}
    for e in max_index:
        dict_best[e] = res[e]
    #get index of best value
    best_index = max(dict_best, key=dict_best.get)
    print("Best context")
    print(ctx[best_index])
    #use function sim_metric to find out if it is the appropriate context (it will return 1)
    sim_accuracy = sim_metric(quest, ctx[best_index])
    print("similarity metric", sim_accuracy)

    #Look for other good solutions if the first option is not satisfactory
    #top3metric tells if there is an adequate solution among the 3 most similar contexts returned
    #if the first example was a good fit, it will automatically return 1
    top3metric = sim_accuracy
    if sim_accuracy == 0:
        other_solutions_index = []
        for j, value in dict_best.items():
            if (j != best_index):
                other_solutions_index.append(j)
        if len(other_solutions_index) != 0:
            print("Autres solutions possibles")
            for k in other_solutions_index:
                #print(ctx[k])
                metric = sim_metric(quest, ctx[k])
                if (metric == 1):
                    top3metric = 1
                print("similarity metric", metric)

    return [sim_accuracy, top3metric]
Example No. 30
 def lsi_indexes_fill(self):
     try:
         dct = self.kwargs["lsi_parameters"]["dictionary"]
         lsi_model = self.kwargs["lsi_parameters"]["model"]
         rules, corpus = zip(*self.kwargs["lsi_parameters"]["rules_corpus"])
         txt_corp = dct.doc2bow(self.lemm_txt.split())
         txt_vect = lsi_model[txt_corp]
         corpus_vects = [lsi_model[x] for x in corpus]
         index = Similarity(None, corpus_vects, num_features=self.kwargs["lsi_parameters"]["num_topics"])
         rules_similarity = list(zip(rules, index[txt_vect]))
         return rules_similarity
     except:
         return None
Example No. 31
 def tfidf_text2vector(self):
     try:
         dct = self.tfidf_parameters["tf_idf_dictionary"]
         tfidf_model = self.tfidf_parameters["tfidf_model"]
         rules, corpus =  zip(*self.tfidf_parameters["tf_idf_rules_corpus"])
         txt_corp = dct.doc2bow(self.lemm_txt.split())
         txt_tf_idf_vect = tfidf_model[txt_corp]
         corpus_tf_idf_vects = [tfidf_model[x] for x in corpus]
         index = Similarity(None, corpus_tf_idf_vects, num_features=len(dct)) 
         rules_similarity = list(zip(rules, index[txt_tf_idf_vect]))
         return rules_similarity
     except:
         return None
Example No. 32
    def load(self, path):
        if type(path) == str:
            path = Path(path)

        with open(path / 'paragraph-ids.txt') as f:
            self.paragraph_ids = [paragraph_id.strip() for paragraph_id in f]

        dictionary_path = str(path / 'dct.pkl')
        self.dictionary = Dictionary.load(dictionary_path)

        index_path = str(path / 'indexes' / 'master-index')
        self.index = Similarity.load(index_path)
        self.index.num_best = self.num_best
Example No. 33
 def __init__(self, corpus, num_features, num_clusters, max_iterations):
     
     self.similarity_index = Similarity(output_prefix = 'similarities', 
                                        corpus = corpus,
                                        num_features = num_features)
     
     self.num_docs = len(self.similarity_index)
     self.num_clusters = num_clusters
     self.max_iterations = max_iterations
     self.num_features = num_features
     self.corpus = corpus
     
     self.MIN_CLUSTER_SIZE = 2
Example No. 34
	def run(self):
		if self.clean_level in ('raw','clean','stopwords'):
			kind = self.clean_level
		else:
			kind = 'stopwords'

		# Save the similarities to a file with a simple format
		# NOTE: THE INDEX ITSELF ALREADY STORES THE SIMILARITIES; THERE IS NO NEED TO COMPUTE THEM AGAIN
		for idioma, salida in self.output()['langs'].iteritems():
			file_list = os.listdir(os.path.join(self.txt_dir,kind,idioma))
			for n_topics, o in salida.iteritems():
				index = Similarity.load(self.input()['langs'][idioma][n_topics]['lsi-index'].path)
				sims = arrange_similarities(index, file_list, num_sims=self.num_similar_docs)
				sims = '\n'.join(['\t'.join([str(i) for i in t]) for t in sims])
				with o.open('w') as f:
					f.write(sims)
Example No. 35
    def __init__(self, model_prefix = None, num_best = None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        self.similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
Example No. 36
log_entropy[BOW_corpus])

print('Saved LogEntropy TF-IDF matrix')

#tfidf = TfidfModel(BOW_corpus)
#tfidf.save('../models/tfidf.model') #already provided
tfidf = TfidfModel.load('../models/tfidf.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix',
tfidf[BOW_corpus])

print('Saved LogEntropy TF-IDF matrix')

print('Creating Similarity Index')
logent_corpus = MmCorpus('../data/log_entropy_matrix')
num_feat = len(wiki.dictionary.keys())
index = Similarity('../data/logEntropyShards/logEntropySimilarity',
logent_corpus, num_features=num_feat)

index.save('../data/logEntropyShards/logEntropySimilarityIndex')
print('Saved Shards and similarity index')

print('Getting list of titles...')
bz2_wiki = bz2.BZ2File(wiki_file, "r")
extract = corpora.wikicorpus.extract_pages(bz2_wiki)
i = 0
matches = open('../data/title_matches.txt','a')
for title,doc,z in extract:
	wiki_filt = corpora.wikicorpus.filter_wiki(doc)
	doc_token = corpora.wikicorpus.tokenize(wiki_filt)
	bowbow = diction.doc2bow(doc_token)
	if bowbow == BOW_corpus[i]:
		i+=1
Example No. 37
class KMedoids(object):
    '''
    Implementation of k-medoids clustering.
    There are two ways to find a medoid:
    - Use the element which is closest to the centroid.
      This is the "kmeans based" kmedoids.
    - Use the element which has the smallest summed distance to the other cluster
      members. This is the "kmedian based" kmedoids.

    So far this implementation uses the kmeans based approach.
    '''
    
    def __init__(self, corpus, num_features, num_clusters, max_iterations):
        
        self.similarity_index = Similarity(output_prefix = 'similarities', 
                                           corpus = corpus,
                                           num_features = num_features)
        
        self.num_docs = len(self.similarity_index)
        self.num_clusters = num_clusters
        self.max_iterations = max_iterations
        self.num_features = num_features
        self.corpus = corpus
        
        self.MIN_CLUSTER_SIZE = 2
        
    def get_medoids(self):
        '''
        Returns a matrix containing the medoids
        '''
        return self.medoid_similarity_index.index
        
    def __medoid_generator(self):
        '''
        Yields all medoid documents
        '''
        for medoid_id in self.medoids.iterkeys():
            yield self.similarity_index.vector_by_id(medoid_id)
            
    def __create_medoid_similarity_index(self):
        self.medoid_similarity_index = MatrixSimilarity(
                                           corpus = list(self.__medoid_generator()),
                                           num_features = self.num_features)        
        
    def __random_init_medoids(self):
        #the keys are the indices of the medoids
        #the values are indices list of the elements belonging to medoid
        self.medoids = defaultdict(list)
        
        #init random medoids
        for x in xrange(self.num_clusters):
            medoid_index = random.randrange(self.num_docs)
            self.medoids[medoid_index] = []  
            
        #create similarity index of medoids  
        self.__create_medoid_similarity_index()
            
    def __assign(self):
        #We use cosine-similarity as metric
        #NOTE: the closer the cosine is to 1 the closer the documents are
        
        #the cosine distance is in <-1, 1> where 1 is the closest and -1 the farthest
        #we might convert it to <0, 2> where 0 is the closest and 2 the farthest in the future
        #dis = (dis * -1) +1 

        
        #clear all clusters
        for id, _ in self.medoids.iteritems():
            self.medoids[id] = []
        
        #assign each doc to closest medoid
        args = itertools.izip(enumerate(self.corpus), 
                              itertools.repeat(self.medoid_similarity_index))
        pool = multiprocessing.Pool(POOL_SIZE)
        #for id, pos in pool.imap_unordered(assign_doc_to_cluster, args, 
        #                         chunksize= CHUNK_SIZE):
        for id, pos in pool.imap_unordered(assign_doc_to_cluster, args):
            self.medoids[self.medoids.keys()[pos]].append(id)
            
    def __get_centroid(self, cluster):
        #averages all docs in cluster
        count = 0
        centroid = numpy.zeros(self.num_features, dtype=numpy.float32)
        for doc_id in cluster:
            doc = self.similarity_index.vector_by_id(doc_id).toarray().flatten()
            #full_doc = matutils.sparse2full(doc, self.num_features)
            
            centroid = centroid + doc
            count += 1
            
        if count != 0:
            centroid = centroid / count
            
        return matutils.full2sparse(centroid)
            
    def __recalculate_medoids(self):
        changed = False
        count = 0
        for medoid_id, cluster in self.medoids.items():
            if count % 1000 == 0:
                logger.info("PROGRESS: Recalculate medoid for cluster #%d id%d" 
                            % (count, medoid_id))
            count +=1 
            
            if len(cluster) < self.MIN_CLUSTER_SIZE:
                #cluster is too small, init a new random medoid
                #remove medoid
                del self.medoids[medoid_id]
                
                #add new random medoid. the id could already be used as medoid.
                # for now we just risk it ;)
                medoid_index = random.randrange(self.num_docs)
                self.medoids[medoid_index] = [] 
                
                changed = True
            else:
                
                logger.debug("Find new centroid for cluster %d." % medoid_id)
                
                #calculate centroid and assign closest doc as new medoid
                centroid = self.__get_centroid(cluster)   
                
                
                old_num_best = self.similarity_index.num_best
                
                #similarity index should only return the best fit
                self.similarity_index.num_best = 1
                try:
                    new_medoid_id, _ = self.similarity_index[centroid][0]
                except IndexError as e:
                    logger.error("Could not find best fit for centroid: %s." % (e))
                    #use random medoid index
                    new_medoid_id = random.randrange(self.num_docs)
                
                self.similarity_index.num_best = old_num_best
                
                if new_medoid_id != medoid_id:
                    changed = True
                    #remove old medoid
                    del self.medoids[medoid_id]
                
                #empty medoid in any case
                self.medoids[new_medoid_id] = []

        if changed:
            self.__create_medoid_similarity_index()
        
        return changed
        
    def cluster(self):
        logger.info("Init random medoids.")
        self.__random_init_medoids()
        
        logger.info("Assign elements to random clusters.")
        self.__assign()
        
        changed = True
        count = 0
        while changed and count < self.max_iterations:
            changed = False
            count += 1
            
            logger.info("Entering iteration #%d." % count)
            
            #recalculate medoids
            logger.info("Recalculate medoids.")
            changed = self.__recalculate_medoids()
            
            #assign all doc to medoids
            logger.info("Assign elements to new clusters.")
            assignment = self.__assign()
            
        if count < self.max_iterations:
            logger.info("Converged in %d iterations." % count)
        else:
            logger.info("May not have converged after %d iterations." % 
                        self.max_iterations)
        return self.medoids
Example No. 38
    input_file, output_prefix = sys.argv[1:3]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(output_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(output_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(output_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True
    similarity_index.preload_reverse_index()

    logger.info("Finished loading model files.")

    logger.info("Processing input documents...")

    try:
        infile = open(input_file, 'r')
    except IOError:
        print('cannot open %s' % (input_file,))
        sys.exit(1)

    for docnum, line in enumerate(infile):
        line = line.rstrip()
Example No. 39
# load models

print "\n    Loading models, etc..\n"
id2word_pgfin = gensim.corpora.Dictionary.load('./data/pgfin.dictionary')
tfidf_model = gensim.models.TfidfModel.load('./data/tfidf_pgfin.model')
lsi_model = gensim.models.LsiModel.load('./data/lsi_pgfin.model')
indexfile = ('./data/ta_index.txt')
queryfile = './queryfiles/queryfile.txt'  # text in corpus
# queryfile = './queryfiles/45vuotta.txt'  # Film review
# queryfile = './queryfiles/tktjohdessee2.txt'  # Ancient essay

# check similarity

print "\n    Load similarity indices.\n"
index = Similarity.load('./data/pgfin_index.index')
index_dense = MatrixSimilarity.load('./data/pgfin_matrixindex.index')

with open(queryfile, 'r') as datafile:
    query = datafile.read()

# vectorize the query text into bag-of-words and tfidf
query_bow = id2word_pgfin.doc2bow(tokenize(query))
query_tfidf = tfidf_model[query_bow]
query_lsi = lsi_model[query_tfidf]

index_dense.num_best = 5


class BookHitValue(object):
Example No. 40
import gensim
from gensim.similarities import Similarity, MatrixSimilarity

# from pgfin_timing import Timer

from pgfin_helpers import tokenize


logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore


# load the corpora

print "\n    Loading corpora.\n"
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_tfidf.mm')
# lsi_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_lsa.mm')
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfin_tfidf.mm')
lsi_corpus = gensim.corpora.MmCorpus('./data/pgfin_lsa.mm')
# print(tfidf_corpus)
# print(lsi_corpus)

print "\n    Start similarity index.\n"
index = Similarity('./data/pgfin_index', lsi_corpus, num_features=lsi_corpus.num_terms)
index.save('./data/pgfin_index.index')  # save to disk
# print index
index_dense = MatrixSimilarity(lsi_corpus, num_features=lsi_corpus.num_terms)
index_dense.save('./data/pgfin_matrixindex.index')  # save to disk
# print index_dense
Example No. 41
passwd = "8269202"
DBName = "bullhorn"

db = MySQLdb.connect(mySQLUrl, userName, passwd, DBName, charset='utf8', use_unicode=True)

app = Flask(__name__)
CORS(app)

resultTuple = generateCorpus()
dictionary = resultTuple['dictionary']
corpus = resultTuple['corpus']
socTitleDict = resultTuple['socTitleDict']

num_topics = 200
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
gensimIndex = Similarity('/tmp/tst', lsi[corpus], num_features=num_topics)
gensimIndex.num_best = 3


@app.before_request
def before_request():
    db = MySQLdb.connect(mySQLUrl, userName, passwd, DBName, charset='utf8', use_unicode=True)
    resultTuple = generateCorpus()
    # dictionary = resultTuple['dictionary']
    # corpus = resultTuple['corpus']
    # socTitleDict = resultTuple['socTitleDict']
    #
    # num_topics = 200
    # lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
    # gensimIndex = Similarity('/tmp/tst', lsi[corpus], num_features=num_topics)
    # gensimIndex.num_best = 3
Example No. 42
    model_prefix = sys.argv[1]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True

    logger.info("Finished loading model files.")

    mismatches = 0
    for doc_idx in range(0, len(similarity_index)):
        logger.info("Checking doc: %d %s" % (doc_idx, article_dict[doc_idx]))
        rev_doc = scipy.sparse.dok_matrix((1, len(dictionary)), dtype=np.float64)
        fwd_doc = similarity_index.vector_by_id(doc_idx)
        for feature_id, val in enumerate(fwd_doc.toarray().flatten()):
            if val == 0: continue
            feat_rev_docs = similarity_index.docs_by_feature_id(feature_id).toarray().flatten()
            rev_doc[0, feature_id] = feat_rev_docs[doc_idx]
        rev_doc = rev_doc.tocsr()