コード例 #1
0
ファイル: database.py プロジェクト: pvt2345/KMS_IDRec
    def search(self, key_search):
        """Rank the stored documents against a free-text query.

        Every row returned by ``self.select('content')`` is first filtered by
        a cheap keyword match and then scored with
        ``SearchEngine.LCS4Sentence`` on its titles (the ``tieu_de`` field,
        ``|``-separated) and on the sentences of its content.

        Args:
            key_search: The query string; whitespace separates the keywords.

        Returns:
            At most 20 dicts with the keys ``reference``, ``title``,
            ``content``, ``score`` and ``index``, sorted by descending
            ``score``. ``index`` is a ``;``-joined list of ``start-end``
            pairs taken from ``LCS4Sentence`` on the whole content.
        """
        data = self.select('content')
        searcher = SearchEngine(key_search)
        result = []

        # The keywords are literal text, not regex syntax: escape them so that
        # a query such as "c++" or "(a" neither raises re.error nor matches
        # the wrong text. The pattern does not depend on the row, so it is
        # compiled once. An empty query yields an empty pattern, which matches
        # everything (no pre-filtering), exactly as before.
        keyword_pattern = re.compile(
            '|'.join(re.escape(word) for word in key_search.lower().split()))

        for d in data:
            content = d.get('content', ' ')
            if len(content.strip()) < 4:
                continue
            if keyword_pattern.search(content.lower()) is None:
                continue

            titles = d.get('tieu_de', ' ').split('|')
            score = 0
            try:
                # Each title is weighted by (position + 1) / number of titles.
                for i, title in enumerate(titles):
                    score_tieu_de, _ = searcher.LCS4Sentence(u'' + title)
                    score += (i + 1) / len(titles) * score_tieu_de * 2

                # The content contributes twice the score of its best sentence.
                score_content = 0
                for sentence in content.split('.'):
                    s_content, _ = searcher.LCS4Sentence(u'' + sentence)
                    if s_content > score_content:
                        score_content = s_content
                _, index = searcher.LCS4Sentence(u'' + content)
                score += 2 * score_content
                if score < 0.3:
                    continue
                index = ';'.join('{0}-{1}'.format(s, e) for s, e in index)

                result.append({
                    'reference': d.get('reference'),
                    # split('|') never returns an empty list, so this is the
                    # last title of the row.
                    'title': titles[-1],
                    'content': content,
                    'score': score,
                    'index': index
                })
            except Exception as e:
                # Best effort: report the failure, skip this row and continue
                # with a newly created searcher for the following rows.
                print(e)
                searcher = SearchEngine(key_search)
                print('error')

        result = [r for r in result if r['score'] > 0.05]
        result.sort(key=lambda r: r['score'], reverse=True)
        return result[:20]
コード例 #2
0
    def __init__(self):
        """Create the QA search engine and the predictor.

        When ``database.dat`` exists the engine is built from it; otherwise
        the engine is filled from ``question_answer.txt`` and then saved to
        ``database.dat``.
        """
        if not exists('database.dat'):
            # Loading the database from the txt format is the slower path.
            print('load from QA database from txt format...')
            self.search_engine = SearchEngine('cppjieba/dict')
            self.search_engine.loadFromTxt('question_answer.txt')
            self.search_engine.save('database.dat')
        else:
            # Deserializing the database is much faster.
            print('deserialize the QA database...')
            self.search_engine = SearchEngine('cppjieba/dict', 'database.dat')
        self.predictor = Predictor()
コード例 #3
0
def search_in_clueweb_with_expanded_query():
    """Expand the submitted query with related action words and search ClueWeb.

    Reads ``action_word`` and ``hint_word`` from the request form, lets the
    SearchEngine collect and rank related action words, and issues one Solr
    request per ranked word. Renders the result pages of every expanded
    query with the ``search_in_clueweb_with_expanded_query.tmpl`` template.
    """
    engine = SearchEngine()
    engine.action_word = request.form['action_word']
    engine.hint_word = request.form['hint_word']
    engine.find_related_action_words_with_google()
    engine.count_action_words()
    engine.sort_action_words_count()
    engine.pick_sorted_action_words_more_than_1_count()

    results = []
    for entry in engine.sorted_action_words_more_than_1_count:
        expanded_query = ' '.join(
            [engine.action_word, engine.hint_word, entry['word']])
        entry['expanded_query'] = expanded_query
        # NOTE(review): the query is inserted without URL-encoding although it
        # contains spaces - confirm that WebPage encodes it before fetching.
        url = 'http://karen.dl.local:8983/solr/ClueWeb09ja/select?q=' + expanded_query + '&wt=xml'
        web_page = WebPage(url)
        web_page.fetch_xml()
        web_page.pick_texts_to_result_pages()
        # Every query has its own result XML page. The contents of that XML
        # page become one WebPage object per page, kept in result_pages of
        # the page that belongs to the query.
        for result_page in web_page.result_pages:
            result_page.set_lines_from_texts()
            result_page.set_line_nums_with_word(engine.action_word)
            result_page.set_line_nums_around_action_word()
            result_page.set_line_clusters_around_action_word()
        results.append({
            'pages': web_page.result_pages,
            'expanded_query': expanded_query,
        })
    return render_template('search_in_clueweb_with_expanded_query.tmpl',
                           results=results)
コード例 #4
0
ファイル: hw4_test.py プロジェクト: AdamK42/cse163-homework
def test_searchengine_fields():
    '''
    Checks the document count and the term index of a newly built engine.
    '''
    engine = SearchEngine(DIRECTORY)
    first = Document(FILE1)
    second = Document(FILE2)
    third = Document(FILE3)

    assert_equals(3, engine._total_documents)

    # Expected mapping from each term to the documents listed for it.
    expected_terms = dict([
        ('i', [first, second, third]),
        ('like', [first, second]),
        ('apple', [first]),
        ('pie', [first, second]),
        ('is', [first, third]),
        ('super', [first]),
        ('duper', [first]),
        ('cool', [first, third]),
        ('also', [second]),
        ('chocolate', [second, third]),
        ('cake', [third]),
        ('guess', [third]),
    ])

    assert_equals(expected_terms, engine._all_terms)
コード例 #5
0
    def setUp(self):
        '''
        Create an indexer and five text files, index the first two files,
        delete the indexer, then create the SearchEngine object under test.
        '''
        indexer = ToIndex('database')
        self.maxDiff = None
        # `with` closes every file even when a write fails.
        # The backslash continuations below are inside the string literals, so
        # the leading spaces of the continued lines are part of the text that
        # is written; their layout must stay exactly as it is.
        with open('test_text.txt', 'w') as text:
            text.write('Ах, не говорите мне про Австрию! \
                    Я ничего не понимаю, может быть')
        with open('another_test_text.txt', 'w') as another_text:
            another_text.write('но Ах Австрия никогда не хотела и не хочет войны.\
                            Она предает нас')

        with open('test_text1.txt', 'w') as text1:
            text1.write('ooh la la мама мыла раму123  frf34')
        with open('test_text2.txt', 'w') as text2:
            text2.write('мама мыла окно')
        with open('test_text3.txt', 'w') as text3:
            text3.write('мама мыла еще что-нибудь')

        indexer.index_by_line('test_text.txt')
        indexer.index_by_line('another_test_text.txt')
        del indexer

        self.search_eng = SearchEngine('database')
コード例 #6
0
 def test_find_related_action_words_from_clueweb(self):
     """The ClueWeb lookup for the hint/action pair leaves one result page."""
     engine = SearchEngine()
     engine.hint_word = '大学'
     engine.action_word = '入学'
     engine.set_solr_query()
     engine.find_related_action_words_from_clueweb()
     found_pages = engine.result_pages
     self.assertEqual(len(found_pages), 1)
コード例 #7
0
    def test_find_pages(self):
        """Checks page finding, action-word counting and count sorting."""
        # (url, title, snippet) of the three material pages, in order.
        fixtures = (
            ('http://tradein.nissan.co.jp/',
             '自動車の下取りと売却',
             '自動車には下取りをする方法がけっこうある。'),
            ('http://www.link-nexus.com/',
             '自動車の下取りと販売',
             'あばばばばば'),
            ('http://toyota.jp/service/tradein/dc/top',
             '下取り参考価格情報',
             '下取りと販売ですよプロデューサーさん'),
        )
        pages = []
        for url, title, snippet in fixtures:
            page = WebPage(url)
            page.title = title
            page.snippet = snippet
            pages.append(page)

        engine = SearchEngine()
        # Hand over a separate list so the expectations below stay untouched.
        engine.material_pages = list(pages)
        engine.hint_word = '自動車'
        engine.action_word = '下取り'
        engine.find_pages_including_related_words()
        for position, page in enumerate(pages):
            self.assertEqual(engine.result_pages[position], page)

        engine.count_action_words()
        self.assertEqual(engine.action_words_count, {'販売': 2, '売却': 1})

        engine.sort_action_words_count()
        expected_order = [
            {'word': '販売', 'count': 2},
            {'word': '売却', 'count': 1},
        ]
        self.assertEqual(engine.sorted_action_words, expected_order)
コード例 #8
0
 def print_result(self):
     """Open a result window for the text in the entry box.

     Runs the query through a new SearchEngine and shows, in a new Toplevel
     window, a heading followed by one label (link and snippet) per result.
     Each result label is bound to ``self.show_content`` on left click.
     """
     self.query = self.entry.get()
     engine = SearchEngine(self.db)
     self.top_level = Toplevel(self.tk)
     self.top_level.geometry("500x500")
     self.top_level.title("The Result")
     result = engine.print_results(self.query)
     return_doc = engine.get_return_doc()

     # Pick the heading text first, then build a single Label for it.
     if len(self.query) == 0:
         heading = "Please enter your text again"
     elif len(result) == 0:
         heading = "No result found"
     else:
         heading = ("These are the top 20 results for query " +
                    self.query + ": ")
     Label(self.top_level, text=heading).pack()

     # return_doc is indexed in step with the results to find each snippet.
     for position, single_result in enumerate(result):
         entry_label = Label(
             self.top_level,
             text=single_result + "  Its snippet: " +
             self.snippet_dict[return_doc[position]] + "...",
             fg="blue",
             cursor="hand2")
         entry_label.pack()
         entry_label.bind("<Button-1>", self.show_content)
コード例 #9
0
ファイル: hw4_test.py プロジェクト: danielqiang/cse163
def test_search_engine():
    """Checks SearchEngine.search for three terms of the test_search corpus."""
    engine = SearchEngine('test_search')
    # (query, expected result) pairs, checked in this order.
    cases = [
        ('asdf', None),
        ('dog', ['test_search/file.txt']),
        ('test', ['test_search/file.txt', 'test_search/file1.txt']),
    ]
    for term, expected in cases:
        assert_equals(expected, engine.search(term))
コード例 #10
0
ファイル: main.py プロジェクト: 1160300901/search-engine
def searchidlist(key, selected=0):
    """Search for *key* and keep only the documents below the current rank.

    Updates the module globals ``doc_id`` (the filtered document ids) and
    ``page`` (the list of page numbers); reads the globals ``db_path`` and
    ``rank``.

    Args:
        key: Query passed to ``SearchEngine.search``.
        selected: Second argument passed to ``SearchEngine.search``.

    Returns:
        ``(flag, page)``: the first value returned by the search and the
        list of page numbers.
    """
    global page
    global doc_id
    global dir_path, db_path, rank
    se = SearchEngine('../config.ini', 'utf-8')
    flag, id_scores = se.search(key, selected)
    # The search yields (doc id, score) pairs; only the ids are needed.
    candidate_ids = [i for i, s in id_scores]

    # TODO: filter out part of the data according to the user's level.
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        visible_ids = []
        for candidate in candidate_ids:
            c.execute("SELECT * FROM news WHERE id=?", (candidate, ))
            row = c.fetchone()
            # Ids without a row are skipped instead of failing on None[5].
            # NOTE(review): column 5 is presumably the document's access
            # level - confirm against the `news` schema.
            if row is not None and row[5] < rank:
                visible_ids.append(candidate)
    finally:
        # Release the connection even when a query fails.
        conn.close()

    # Publish the ids only after filtering, so the global never holds the
    # unfiltered list when the database lookup fails.
    doc_id = visible_ids
    print("**rank**: ", rank)
    print("**doc_id**: ", doc_id)
    page = list(range(1, len(doc_id) // 10 + 2))
    return flag, page
コード例 #11
0
def main():
    """Run the interactive search prompt.

    Optionally builds the scores first, then reads queries until the user
    quits. A query starting with ``-run`` runs the schedule file named after
    it; every other query is passed to the search engine and its formatted
    results are printed.
    """
    # Instances
    generator = SearchEngineGenerator()
    search = SearchEngine()

    # Generate the inverted index on request.
    ans = input('Do you want to build scores?[type \'y\' to build or pass]: ')
    if ans == 'y':
        generator()

    while True:
        query = input('2018-26161> ')
        command = query.strip()
        # Compare for equality: the former `query in '/quit'` was a substring
        # test, so blank input or any fragment such as "u" ended the program.
        # "quit" stays accepted because it used to work.
        if command in ('/quit', 'quit'):
            print("Okay bye!")
            break
        if not command:
            # Nothing to search for; prompt again.
            continue
        if query.startswith('-run'):
            # "-run" without a path is reported like a wrong path instead of
            # raising IndexError.
            arguments = query.split()[1:]
            if not arguments or not os.path.exists(arguments[0]):
                print("경로가 올바르지 않습니다.")
                continue
            simulator = ScheduleSimulator(arguments[0])
            simulator()
            # The command is handled; do not also search for its text.
            continue

        results = search(query)

        for doc in results:
            print(search.result_formatting(doc))
コード例 #12
0
ファイル: data_enhancer.py プロジェクト: xkuang/Digestant
    def enhance(self, num=10):
        """Add Google search results, grouped by domain type, to ``self.df``.

        The texts in ``self.df['cleaned_text']`` are used as queries. Two
        columns are written: ``google-search`` holds a list of ``(url, text)``
        tuples per row and ``types`` holds a dict that maps every entry of
        ``self.domains`` to the results ``self.in_domain`` assigned to it.

        Args:
            num: Not used by this method; kept so existing callers still work.

        Returns:
            ``self.df`` with the two added columns.
        """
        from search_engine import SearchEngine
        se = SearchEngine()

        print("*[Data Organizer] Downloading Google Search Results")
        results = se.get_data(queries=self.df['cleaned_text'])

        # Map the string form of each source text to its (url, text) pairs.
        # A later entry with the same string form replaces an earlier one,
        # which is what the former "last match wins" loop did.
        results_by_text = {
            str(source_text): list(zip(found['url'], found['text']))
            for source_text, found in results.items()
        }

        # Build the column in one go: element-wise chained assignment
        # (df[col][i] = ...) may write to a temporary copy and is not
        # reliable in pandas. Rows without results get an empty list rather
        # than an integer placeholder, so the loop below can iterate them.
        self.df['google-search'] = [
            results_by_text.get(str(text), [])
            for text in self.df['cleaned_text']
        ]

        q = []
        for gs in self.df['google-search']:
            types = {el: [] for el in self.domains}
            for result in gs:
                # result[0] is the url; an empty type means "no domain".
                _type = self.in_domain(result[0])
                if _type != "":
                    types[_type].append(result)
            q.append(types)

        self.df['types'] = q
        return self.df
コード例 #13
0
 def test_clueweb_search(self):
     """The ClueWeb search returns 50 texts; the first contains both words."""
     se = SearchEngine()
     se.hint_word = '大学'
     se.action_word = '入学'
     se.set_solr_query()
     texts = se.clue_web_search(se.solr_query)
     self.assertEqual(len(texts), 50)
     # `'大学' and '入学' in texts[0]` only tested the second word, because
     # the non-empty literal '大学' is always truthy. Check each word itself.
     self.assertIn('大学', texts[0])
     self.assertIn('入学', texts[0])
コード例 #14
0
def test_search_engine_3():
    """Checks the fields and lookups of the engine built from test_dir3."""
    engine = SearchEngine('test_dir3')
    term = 'Samsung'

    # Tests Empty Directory
    assert_equals(1, engine._num_docs)
    assert_equals({}, engine._docs)
    assert_equals(0, engine._calculate_idf(term))
    assert_equals(None, engine.search(term))
コード例 #15
0
def search(query, count):
    """Run *query* on the search engine named in the request form.

    Args:
        query: The search query.
        count: Passed through to the engine's search method.

    Returns:
        The pages returned by ``google_search`` or ``bing_search``.

    Raises:
        ValueError: If the ``search_engine`` form field is neither
            ``'google'`` nor ``'bing'`` (this used to surface as an
            UnboundLocalError on the return statement).
    """
    search_engine = request.forms.get('search_engine')
    engine = SearchEngine()
    if search_engine == 'google':
        return engine.google_search(query, count)
    if search_engine == 'bing':
        return engine.bing_search(query, count)
    raise ValueError(
        'unsupported search engine: {0!r}'.format(search_engine))
コード例 #16
0
def search_by_google_or_bing(request):
    """Search the submitted query with Google when asked for, else with Bing.

    Reads ``query`` and ``search_engine`` from ``request.form`` and returns
    the pages of the chosen engine, called with 1 as its second argument.
    """
    query = request.form["query"]
    engine_name = request.form['search_engine']
    engine = SearchEngine()
    # Anything other than 'google' falls back to Bing.
    run_search = (engine.google_search if engine_name == 'google'
                  else engine.bing_search)
    return run_search(query, 1)
コード例 #17
0
ファイル: search_tests.py プロジェクト: anakru07/proga
 def setUp(self):
     """Create the engine, update its database and write two fixture files."""
     self.engine = SearchEngine('database')
     self.engine.database.update(database)
     # `with` closes each file even when the write fails.
     with open("test1.txt", 'w') as test:
         test.write(test1)
     with open("test2.txt", 'w') as test:
         test.write(test2)
コード例 #18
0
def query_db(query, tables, Descriptions):
    """Search the documents generated from *tables* and *Descriptions*.

    Returns whatever ``SearchEngine.search`` returns for *query*.
    """
    code_files = get_codes(tables)
    description_files = get_descriptions(Descriptions)
    documents = generate_documents(code_files, description_files)
    return SearchEngine(documents).search(query)
コード例 #19
0
def start():
    """Answer search requests typed by the user until ``exit`` is entered."""
    searchEngine = SearchEngine("data/")

    print("For finish write 'exit'")
    # The original prompt/loop header was garbled into one invalid line;
    # this restores reading a request and looping until it equals "exit".
    request = input("User: ")
    while request != "exit":
        answer = searchEngine.search(request)
        print_result(answer)
        request = input("User: ")
コード例 #20
0
def main():
    """Create four test documents and an engine, then run the test groups."""
    document_paths = (
        'test_docs/test1.txt',
        'test_docs/test2.txt',
        'test_docs/test3.txt',
        'test_docs/test4.txt',
    )
    documents = [Document(path) for path in document_paths]
    engine = SearchEngine('test_docs')

    test_document(*documents)
    test_single(engine)
    test_mulit(engine)
コード例 #21
0
ファイル: hw4_test.py プロジェクト: AdamK42/cse163-homework
def test_searchengine_calculate_idf():
    '''
    Checks the values returned by _calculate_idf for four terms.
    '''
    engine = SearchEngine(DIRECTORY)

    # (term, expected idf) pairs, checked in this order.
    cases = [
        ('croissant', 0),
        ('i', 0),
        ('chocolate', math.log(1.5)),
        ('apple', math.log(3)),
    ]
    for term, expected in cases:
        assert_equals(expected, engine._calculate_idf(term))
コード例 #22
0
def find_related_action_words():
    """Find, count and sort related action words for the submitted words.

    Reads ``action_word`` and ``hint_word`` from the request form, stores an
    ``expanded_query`` on every sorted action word and renders the
    ``find_related_action_words.tmpl`` template.
    """
    engine = SearchEngine()
    engine.action_word = request.form['action_word']
    engine.hint_word = request.form['hint_word']
    engine.find_related_action_words()
    engine.count_action_words()
    engine.sort_action_words_count()
    for entry in engine.sorted_action_words:
        # Expanded query: action word, hint word and the related word.
        entry['expanded_query'] = ' '.join(
            [engine.action_word, engine.hint_word, entry['word']])
    return render_template(
        'find_related_action_words.tmpl',
        items=engine.result_pages,
        sorted_action_words=engine.sorted_action_words,
        found_pages=engine.material_pages,
        query=engine.actual_query)
コード例 #23
0
def searchidlist(key, selected=0):
    """Search for *key* and refresh the global id list and page numbers.

    Sets the module globals ``doc_id`` (ids taken from the search result)
    and ``page`` (page numbers starting at 1) and returns ``(flag, page)``,
    where ``flag`` is the first value returned by the search.
    """
    global page
    global doc_id
    engine = SearchEngine('../config.ini', 'utf-8')
    flag, id_scores = engine.search(key, selected)
    # Keep the document ids of the (id, score) pairs.
    doc_id = [found_id for found_id, _score in id_scores]
    page = list(range(1, len(doc_id) // 10 + 2))
    return flag, page
コード例 #24
0
def test_search_engine_2():
    """Checks document count, idf and search results for test_dir2."""
    engine = SearchEngine('test_dir2')
    samsung_hits = ['ChromeBook.html', 'att.html', 'facebook.html']

    # Tests large number of words
    assert_equals(4, engine._num_docs)
    assert_equals(.287, engine._calculate_idf('Samsung'))
    assert_equals(list(samsung_hits), engine.search('Samsung'))
    assert_equals(list(samsung_hits), engine.search('Samsung companies!'))
    assert_equals(None, engine.search('adksamfk'))
コード例 #25
0
ファイル: hw4_test.py プロジェクト: AdamK42/cse163-homework
def test_searchengine_search():
    '''
    Checks search for a one-word hit, a miss and a two-word query.
    '''
    engine = SearchEngine(DIRECTORY)

    assert_equals([FILE1], engine.search('super'))
    assert_equals(None, engine.search('croissant'))
    assert_equals([FILE1, FILE2], engine.search('Apple pie'))
コード例 #26
0
    def __init__(self,
                 sche_path,
                 log_path='prj2.log',
                 search_path='search.txt'):
        """Create the collaborators used by this object.

        Args:
            sche_path: Passed to ``ScheduleParser``.
            log_path: Passed to both ``RecoveryManagement`` and ``LogWriter``.
            search_path: Stored unchanged as ``self.search_path``.
        """
        self.search_path = search_path

        self.db = Datasource()
        self.scheduler = ScheduleParser(sche_path)
        # Both objects below receive the same log path.
        # NOTE(review): the construction order may matter - confirm before
        # reordering.
        self.recovery = RecoveryManagement(log_path)
        self.log_writer = LogWriter(log_path)
        self.search = SearchEngine()
        self.generator = SearchEngineGenerator()
コード例 #27
0
 def setUp(self):
     """Write two fixture files, index them and open the search engine."""
     index = indexer.Indexer('dbase')
     # `with` closes each file even when the write fails.
     with open('test.txt', 'w') as f:
         f.write('this is\ntest')
     with open('tst.txt', 'w') as t:
         t.write('test')
     index.indexing_with_lines('test.txt')
     index.indexing_with_lines('tst.txt')
     # Drop the indexer before the engine is created on the same name.
     del index
     self.s = SearchEngine('dbase')
コード例 #28
0
def test_http_error_calling_keyword():
    """
    Checks that a status code 400 from the keywords endpoint is raised again
    as requests.HTTPError and that exactly one request was sent.
    """
    responses.add(responses.POST, constants.MOCK_URL_KEYWORDS, status=400)

    endpoint_patch = mock.patch('env.get_keyword_endpoint',
                                return_value=constants.MOCK_URL_KEYWORDS)
    with endpoint_patch, pytest.raises(requests.HTTPError):
        SearchEngine().query("Some super real query")

    assert len(responses.calls) == 1
    recorded_call = responses.calls[0]
    assert recorded_call.request.url == constants.MOCK_URL_KEYWORDS
    assert recorded_call.response.status_code == 400
コード例 #29
0
def test_search_engine_class():
    """
    Checks _calculate_idf and search of the SearchEngine class
    against the documents in test-files.
    """
    engine = SearchEngine('test-files')
    assert_equals(math.log(3/2), engine._calculate_idf('dogs'))

    # (query, expected result) pairs, searched in this order.
    searches = [
        ('dogs', ['test-files/doc3.txt', 'test-files/doc1.txt']),
        ('Cats very cute', ['test-files/doc2.txt']),
        ('happy', None),
    ]
    for query, expected in searches:
        found = engine.search(query)
        assert_equals(expected, found)
コード例 #30
0
 def setUp(self):
     """Write two fixture files, index them and open the search engine."""
     index = indexer.Indexer('dbase')
     # `with` closes each file even when a write fails.
     with open('test.txt', 'w') as f:
         f.write('this is a test required for helping students create a test\n')
         f.write(' professor required to write a test first')
     with open('tst.txt', 'w') as t:
         t.write('test is required. On the other hand...')
     index.indexing_with_lines('test.txt')
     index.indexing_with_lines('tst.txt')
     # Drop the indexer before the engine is created on the same name.
     del index
     self.s = SearchEngine('dbase')