Esempio n. 1
0
def main():
    """Build the demo corpus, train an LDA model and time similarity queries.

    The corpus is built from the hard-coded input and stop-word files, an
    ``LDAModel`` is trained on it and shown, and the documents are vectorized
    through ``DocSim``.  Every line of the query file is then passed to
    ``docsim.query`` twice (second argument ``True``, then ``False``).

    The elapsed time of the training phase and of the query phase is printed
    to standard output.
    """
    logging.basicConfig(format=DefaultSetting.FORMAT_LOG, level=logging.INFO)

    start_time = datetime.now()

    input_file = 'data/content.with.categories.seg.vni'
    stopwords_file = 'data/stopwords.txt'
    num_topics = 100
    prefix_name = 'demo'
    directory = 'tmp'
    query = 'data/query.txt'

    corpus = Corpus()
    corpus.build_corpus(input_file, stopwords_file, directory, prefix_name)
    lda = LDAModel()
    lda.train(corpus.corpus, corpus.dictionary, num_topics, directory,
              prefix_name)
    lda.show()

    docsim = DocSim()
    docsim.set_model(lda.model)
    docsim.set_doc(corpus)
    docsim.vectorized(num_topics)
    # docsim.save(directory, prefix_name)

    # A single pre-formatted argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function (the two
    # spaces reproduce the separator the old comma-form inserted).
    print('Training time:  %s' % (datetime.now() - start_time,))

    start_time = datetime.now()
    # The context manager closes the query file even if reading fails.
    with codecs.open(query, 'r', 'utf8') as reader:
        documents = [line.replace('\n', '') for line in reader.readlines()]
    docsim.query(documents, True, directory, prefix_name)
    docsim.query(documents, False, directory, prefix_name)
    print('Query time:  %s' % (datetime.now() - start_time,))
Esempio n. 2
0
    def compare_algs_tfidf_simhashtfidf(self):
        """Measure how often SimHashTfIdf orders document pairs like DocSim.

        For each triple ``(t1, t2, t3)`` returned by
        ``generate_random_triples`` both algorithms compute the similarity of
        ``t1`` to ``t2`` and of ``t1`` to ``t3``.  A triple counts as an
        agreement when both algorithms give the same answer to "is the first
        similarity smaller than the second?".  Triples whose two DocSim
        similarities are both within 1e-6 of zero are left out of the tally.

        Returns:
            Agreements divided by the number of triples that were tallied.

        Raises:
            ZeroDivisionError: if no triple is tallied (no triples at all, or
                every triple was left out).
        """
        tolerance = 0.000001
        triples = self.generate_random_triples()

        tfidf = DocSim(self.document_set)
        hashed = SimHashTfIdf(self.document_set)

        tallied = float(len(triples))
        agreements = 0.
        for anchor, first, second in triples:
            tfidf_first = tfidf.similarity(anchor, first)
            tfidf_second = tfidf.similarity(anchor, second)
            hashed_first = hashed.similarity(anchor, first)
            hashed_second = hashed.similarity(anchor, second)

            # Two (near-)zero reference scores carry no ordering information.
            if abs(tfidf_first) < tolerance and abs(tfidf_second) < tolerance:
                tallied -= 1.
                continue

            tfidf_order = tfidf_first < tfidf_second
            hashed_order = hashed_first < hashed_second
            if tfidf_order == hashed_order:
                agreements += 1.
        return agreements / tallied
Esempio n. 3
0
    def compare_algs_tfidf_simhash(self):
        """Measure how often SimHash orders document pairs like DocSim.

        A ``SimHash`` (128 as second argument, token length 1) is built for
        every title in ``self.doc_list`` from ``self.documents``.  For each
        triple ``(t1, t2, t3)`` returned by ``generate_random_triples`` the
        similarity of ``t1`` to ``t2`` and of ``t1`` to ``t3`` is computed
        with DocSim and with the hashes.  A triple counts as an agreement
        when both give the same answer to "is the first similarity smaller
        than the second?".  Triples whose two DocSim similarities are both
        within 1e-6 of zero are left out of the tally.

        Returns:
            Agreements divided by the number of triples that were tallied.

        Raises:
            ZeroDivisionError: if no triple is tallied (no triples at all, or
                every triple was left out).
        """
        token_length = 1
        tolerance = 0.000001
        triples = self.generate_random_triples()
        hashes = {
            title: SimHash(self.documents[title], 128, token_length)
            for title in self.doc_list
        }

        tfidf = DocSim(self.document_set)

        tallied = float(len(triples))
        agreements = 0.
        for anchor, first, second in triples:
            tfidf_first = tfidf.sim(anchor, first)
            tfidf_second = tfidf.sim(anchor, second)
            hash_first = hashes[anchor].similarity(hashes[first])
            hash_second = hashes[anchor].similarity(hashes[second])

            # Two (near-)zero reference scores carry no ordering information.
            if abs(tfidf_first) < tolerance and abs(tfidf_second) < tolerance:
                tallied -= 1.
                continue

            tfidf_order = tfidf_first < tfidf_second
            hash_order = hash_first < hash_second
            if tfidf_order == hash_order:
                agreements += 1.
        return agreements / tallied
Esempio n. 4
0
    def benchmark_memory_tfidf(self, iterations):
        """Report peak memory after running DocSim similarity queries.

        Builds a ``DocSim`` over ``self.document_set`` and computes the
        similarity of ``iterations`` randomly selected document pairs; the
        similarity values themselves are discarded.

        Returns:
            ``ru_maxrss`` of the current process divided by 1000.
            NOTE(review): the unit of ``ru_maxrss`` is platform dependent --
            confirm what unit callers expect.
        """
        tfidf = DocSim(self.document_set)
        for _ in range(iterations):
            first = self.select_random_document()
            second = self.select_random_document()
            tfidf.similarity(first, second)

        usage = resource.getrusage(resource.RUSAGE_SELF)
        return usage.ru_maxrss / 1000
Esempio n. 5
0
    def benchmark_tfidf(self, iterations):
        """Time DocSim construction plus ``iterations`` similarity queries.

        The measured span covers building a ``DocSim`` over
        ``self.document_set`` and computing the similarity of ``iterations``
        randomly selected document pairs; the similarity values themselves
        are discarded.

        Args:
            iterations: number of random document pairs to compare.

        Returns:
            Elapsed time in seconds as reported by the selected clock.
        """
        # time.clock() was removed in Python 3.8.  process_time() reports
        # process CPU time, which is what clock() measured on POSIX; fall
        # back to clock() only on interpreters without process_time().
        timer = getattr(time, 'process_time', None) or time.clock

        started = timer()
        tfidf = DocSim(self.document_set)
        for _ in range(iterations):
            first = self.select_random_document()
            second = self.select_random_document()
            tfidf.similarity(first, second)

        return timer() - started
Esempio n. 6
0
    def compare_tfidf_replace(self, iterations):
        """Check that DocSim scores a lightly altered copy above a heavier one.

        Each iteration takes a random document, draws two random ratios in
        ``[0, REPLACEMENT_LIMIT)`` and passes element 1 of the document to
        ``self.replace`` once with the smaller ratio and once with the larger
        one.  Both results are scored against element 0 of the document with
        ``similarity_new_doc``.  The iteration counts as correct when the
        smaller-ratio variant scores strictly higher.
        NOTE(review): the document is presumably a (title, body) pair --
        confirm against ``get_random_document``.

        Returns:
            Correct iterations divided by ``iterations``.

        Raises:
            ZeroDivisionError: if ``iterations`` is 0.
        """
        REPLACEMENT_LIMIT = 1.0
        tfidf = DocSim(self.document_set)
        hits = 0.
        for _ in range(iterations):
            document = self.document_set.get_random_document()
            first_ratio = random.random() * REPLACEMENT_LIMIT
            second_ratio = random.random() * REPLACEMENT_LIMIT
            smaller, larger = sorted((first_ratio, second_ratio))

            light_body = self.replace(document[1], smaller)
            heavy_body = self.replace(document[1], larger)

            light_score = tfidf.similarity_new_doc(document[0], light_body)
            heavy_score = tfidf.similarity_new_doc(document[0], heavy_body)

            if light_score > heavy_score:
                hits += 1.
        return hits / iterations
def fixture_DocSim(mocker):
    """Return a fresh ``DocSim`` built with no arguments, for use in tests.

    ``mocker`` is accepted but not used by this function.
    """
    return DocSim()
Esempio n. 8
0
File: app.py Progetto: mr8bit/docbot
from flask import Flask
from flask import render_template
from flask import request
from docsim import DocSim
# Module-level setup: the Flask application, the word2vec vectors and the
# DocSim instance are all created once, when this module is imported.
app = Flask(__name__)

from gensim.models.keyedvectors import KeyedVectors  # import the w2v loader
model_path = 'ruwikiruscorpora-nobigrams_upos_skipgram_300_5_2018.vec'  # path to the w2v model
w2v_model = KeyedVectors.load_word2vec_format(model_path)  # load it into memory
ds = DocSim(
    w2v_model=w2v_model, dataset_path="прога курсач.csv"
)  # initialize the class, passing it the model and the path to the database file


@app.route('/', methods=['GET', 'POST'])  # the view accepts both GET and POST
def hello_world():
    """Render ``index.html``, answering the submitted question on POST.

    On POST the ``question`` form field is passed through
    ``ds.text_POS_tag`` and the result to ``ds.get_answer``; elements 0 and 1
    of the returned value are handed to the template as ``answer`` and
    ``score`` together with the original question.  For any other method the
    template receives a context whose ``answer`` is ``None``.
    """
    if request.method != 'POST':
        # No question was submitted, so there is no answer to show.
        return render_template('index.html', context={'answer': None})

    question = request.form['question']
    # Prepare the question text, then look up the answer for it.
    source_doc = ds.text_POS_tag(question)
    answer = ds.get_answer(source_doc)
    page_context = {
        'answer': answer[0],
        'score': answer[1],
        'question': question,
    }
    return render_template('index.html', context=page_context)