def main():
    """Train the demo LDA model, vectorise the corpus and run the queries.

    All inputs are hard-coded: the segmented corpus and stop-word list under
    ``data/``, 100 topics, and the ``tmp`` directory / ``demo`` prefix that
    are handed to the corpus builder, the LDA trainer and the query step.

    After training, every line of ``data/query.txt`` is collected into a
    list and passed to ``docsim.query`` twice: once with ``True`` and once
    with ``False`` as the second argument.  The elapsed time of the training
    phase and of the query phase is printed to stdout.
    """
    logging.basicConfig(format=DefaultSetting.FORMAT_LOG, level=logging.INFO)
    start_time = datetime.now()

    input_file = 'data/content.with.categories.seg.vni'
    stopwords_file = 'data/stopwords.txt'
    num_topics = 100
    prefix_name = 'demo'
    directory = 'tmp'
    query = 'data/query.txt'

    corpus = Corpus()
    corpus.build_corpus(input_file, stopwords_file, directory, prefix_name)

    LDA = LDAModel()
    LDA.train(corpus.corpus, corpus.dictionary, num_topics, directory,
              prefix_name)
    LDA.show()

    docsim = DocSim()
    docsim.set_model(LDA.model)
    docsim.set_doc(corpus)
    docsim.vectorized(num_topics)
    # docsim.save(directory, prefix_name)

    # A single pre-formatted argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('%s %s' % ('Training time: ', datetime.now() - start_time))

    start_time = datetime.now()
    # The context manager closes the query file even if decoding fails;
    # the original left the handle open.
    with codecs.open(query, 'r', 'utf8') as reader:
        documents = [line.replace('\n', '') for line in reader.readlines()]

    docsim.query(documents, True, directory, prefix_name)
    docsim.query(documents, False, directory, prefix_name)
    print('%s %s' % ('Query time: ', datetime.now() - start_time))
def compare_algs_tfidf_simhashtfidf(self):
    """Measure how often SimHashTfIdf orders a triple the same way as DocSim.

    For each triple ``(t1, t2, t3)`` from ``self.generate_random_triples()``
    both scorers rate the pairs ``(t1, t2)`` and ``(t1, t3)``.  A triple
    counts as an agreement when both scorers give the same answer to
    "is the first pair less similar than the second?".  Triples for which
    both DocSim scores are (numerically) zero are excluded from the tally.

    Returns:
        float: agreements divided by the number of triples that were
        compared, or ``0.0`` when there was nothing to compare (no triples,
        or every triple was excluded) -- the original raised
        ``ZeroDivisionError`` in that situation.
    """
    zero_tolerance = 0.000001

    test_set = self.generate_random_triples()
    ds = DocSim(self.document_set)
    sh = SimHashTfIdf(self.document_set)

    compared = 0
    agreed = 0
    for t1, t2, t3 in test_set:
        dsim1 = ds.similarity(t1, t2)
        dsim2 = ds.similarity(t1, t3)
        ssim1 = sh.similarity(t1, t2)
        ssim2 = sh.similarity(t1, t3)

        # The reference scorer sees no similarity at all in this triple, so
        # its ordering carries no information; leave it out of the total.
        if abs(dsim1) < zero_tolerance and abs(dsim2) < zero_tolerance:
            continue

        compared += 1
        if (dsim1 < dsim2) == (ssim1 < ssim2):
            agreed += 1

    if not compared:
        return 0.0
    # float() keeps true division under Python 2 as well.
    return agreed / float(compared)
def compare_algs_tfidf_simhash(self):
    """Measure how often plain SimHash orders a triple the same way as DocSim.

    A 128-bit ``SimHash`` (token length 1) is built for every title in
    ``self.doc_list`` from ``self.documents[title]``.  For each triple
    ``(t1, t2, t3)`` from ``self.generate_random_triples()`` the pairs
    ``(t1, t2)`` and ``(t1, t3)`` are scored by DocSim and by the hashes; a
    triple counts as an agreement when both give the same answer to
    "is the first pair less similar than the second?".  Triples for which
    both DocSim scores are (numerically) zero are excluded from the tally.

    Returns:
        float: agreements divided by the number of triples that were
        compared, or ``0.0`` when there was nothing to compare (no triples,
        or every triple was excluded) -- the original raised
        ``ZeroDivisionError`` in that situation.
    """
    token_length = 1
    hash_bits = 128
    zero_tolerance = 0.000001

    test_set = self.generate_random_triples()
    simhash_dict = {
        title: SimHash(self.documents[title], hash_bits, token_length)
        for title in self.doc_list
    }
    ds = DocSim(self.document_set)

    compared = 0
    agreed = 0
    for t1, t2, t3 in test_set:
        # NOTE(review): the other benchmarks in this file call
        # ds.similarity(); confirm that DocSim.sim is the intended method.
        dsim1 = ds.sim(t1, t2)
        dsim2 = ds.sim(t1, t3)
        ssim1 = simhash_dict[t1].similarity(simhash_dict[t2])
        ssim2 = simhash_dict[t1].similarity(simhash_dict[t3])

        # The reference scorer sees no similarity at all in this triple, so
        # its ordering carries no information; leave it out of the total.
        if abs(dsim1) < zero_tolerance and abs(dsim2) < zero_tolerance:
            continue

        compared += 1
        if (dsim1 < dsim2) == (ssim1 < ssim2):
            agreed += 1

    if not compared:
        return 0.0
    # float() keeps true division under Python 2 as well.
    return agreed / float(compared)
def benchmark_memory_tfidf(self, iterations):
    """Report the process's peak memory after a run of DocSim comparisons.

    Builds a ``DocSim`` over ``self.document_set`` and scores ``iterations``
    pairs of documents chosen with ``self.select_random_document()``.

    Returns:
        ``ru_maxrss`` of the current process divided by 1000.  The unit of
        ``ru_maxrss`` is platform-dependent, so the unit of the result is
        too.
    """
    scorer = DocSim(self.document_set)

    for _ in range(iterations):
        first_title = self.select_random_document()
        second_title = self.select_random_document()
        # The score itself is irrelevant; only the cost of producing it is.
        last_score = scorer.similarity(first_title, second_title)

    usage = resource.getrusage(resource.RUSAGE_SELF)
    return usage.ru_maxrss / 1000
def benchmark_tfidf(self, iterations):
    """Time the construction of a DocSim plus ``iterations`` comparisons.

    The measured span covers building ``DocSim(self.document_set)`` and
    scoring ``iterations`` pairs of documents chosen with
    ``self.select_random_document()``.

    Returns:
        float: elapsed time in seconds as reported by the timer.
    """
    # time.clock() was removed in Python 3.8.  Keep using it where it still
    # exists so existing measurements stay comparable, and fall back to
    # time.perf_counter() where it does not.
    timer = getattr(time, 'clock', None) or time.perf_counter

    t0 = timer()
    ds = DocSim(self.document_set)
    for _ in range(iterations):
        title1 = self.select_random_document()
        title2 = self.select_random_document()
        ds.similarity(title1, title2)
    return timer() - t0
def compare_tfidf_replace(self, iterations):
    """Check that DocSim scores a lightly altered copy above a heavier one.

    Each round picks a random document from ``self.document_set``, draws two
    random ratios below ``REPLACEMENT_LIMIT`` and produces two variants of
    the document body with ``self.replace`` -- one per ratio.  The round is
    a success when the variant made with the smaller ratio is strictly more
    similar to the original document than the one made with the larger
    ratio.

    Returns:
        float: successes divided by ``iterations``.
    """
    REPLACEMENT_LIMIT = 1.0
    scorer = DocSim(self.document_set)
    successes = 0.

    for _ in range(iterations):
        document = self.document_set.get_random_document()

        # Draw order matters for reproducibility under a seeded RNG.
        first_ratio = random.random() * REPLACEMENT_LIMIT
        second_ratio = random.random() * REPLACEMENT_LIMIT
        smaller, larger = sorted((first_ratio, second_ratio))

        lightly_changed = self.replace(document[1], smaller)
        heavily_changed = self.replace(document[1], larger)

        light_score = scorer.similarity_new_doc(document[0], lightly_changed)
        heavy_score = scorer.similarity_new_doc(document[0], heavily_changed)
        if light_score > heavy_score:
            successes += 1.

    return successes / iterations
def fixture_DocSim(mocker):
    """Test fixture: return a ``DocSim`` built with no arguments.

    ``mocker`` is accepted but not used by this function.
    """
    return DocSim()
from flask import Flask, render_template, request
from gensim.models.keyedvectors import KeyedVectors  # word2vec loader

from docsim import DocSim

app = Flask(__name__)

# Path to the word2vec model file.
model_path = 'ruwikiruscorpora-nobigrams_upos_skipgram_300_5_2018.vec'
# Load the model into memory once, at import time.
w2v_model = KeyedVectors.load_word2vec_format(model_path)

# Set up the matcher with the loaded model and the path to the dataset file.
ds = DocSim(
    w2v_model=w2v_model,
    dataset_path="прога курсач.csv",
)


# Both GET and POST are allowed on this route.
@app.route('/', methods=['GET', 'POST'])
def hello_world():
    """Render ``index.html``, answering the submitted question on POST.

    On POST the ``question`` form field is run through ``ds.text_POS_tag``
    and the result is handed to ``ds.get_answer``; element 0 of what comes
    back is exposed to the template as ``answer`` and element 1 as
    ``score``, alongside the original question.  Any other request renders
    the template with ``answer`` set to ``None``.
    """
    if request.method != 'POST':
        # Nothing was asked, so there is no answer to show.
        return render_template('index.html', context={'answer': None})

    question = request.form['question']
    source_doc = ds.text_POS_tag(question)
    answer = ds.get_answer(source_doc)

    # Everything the template needs to show the result.
    context = {
        'answer': answer[0],
        'score': answer[1],
        'question': question,
    }
    return render_template('index.html', context=context)