def search(self, key_search):
    import re

    data = self.select('content')
    searcher = SearchEngine(key_search)
    result = []
    for d in data:
        content = d.get('content', ' ')
        # Skip near-empty documents.
        if len(content.strip()) < 4:
            continue
        # Skip documents containing none of the query words.
        pattern = '|'.join(key_search.lower().split())
        if len(re.findall(pattern, content.lower())) == 0:
            continue
        titles = d.get('tieu_de', ' ').split('|')
        score = 0
        try:
            # Title matches are weighted so that later (more specific)
            # titles count more, and doubled relative to content matches.
            for i, title in enumerate(titles):
                score_tieu_de, _ = searcher.LCS4Sentence(title)
                score += (i + 1) / len(titles) * score_tieu_de * 2
            # The content is scored by its best-matching sentence.
            score_content = 0
            for sentence in content.split('.'):
                s_content, _ = searcher.LCS4Sentence(sentence)
                if s_content > score_content:
                    score_content = s_content
            score += 2 * score_content
            # Character spans of the match within the whole content.
            _, index = searcher.LCS4Sentence(content)
            if score < 0.3:
                continue
            index = ';'.join('{0}-{1}'.format(s, e) for s, e in index)
            result.append({
                'reference': d.get('reference'),
                'title': titles[-1],
                'content': content,
                'score': score,
                'index': index
            })
        except Exception as e:
            # Log the failure, reset the searcher, and move on.
            print(e)
            searcher = SearchEngine(key_search)
    # Keep the 20 highest-scoring matches, best first.
    result = [r for r in result if r['score'] > 0.05]
    result = sorted(result, key=lambda r: r['score'], reverse=True)
    return result[:20]
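# A minimal, self-contained sketch of the scoring scheme used in search()
# above, with a stub score_fn standing in for SearchEngine.LCS4Sentence
# (assumed to return a similarity score for a piece of text). The helper name
# and data are hypothetical illustrations, not part of the original module.
def rank_document(titles, sentences, score_fn):
    # Later titles (deeper, more specific headings) get proportionally
    # more weight, and title matches count double.
    score = sum((i + 1) / len(titles) * score_fn(t) * 2
                for i, t in enumerate(titles))
    # The body contributes twice the score of its best-matching sentence.
    score += 2 * max((score_fn(s) for s in sentences), default=0)
    return score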
def __init__(self):
    if exists('database.dat'):
        # Deserializing the database is much faster.
        print('deserialize the QA database...')
        self.search_engine = SearchEngine('cppjieba/dict', 'database.dat')
    else:
        # Loading the database from txt is slower.
        print('load the QA database from txt format...')
        self.search_engine = SearchEngine('cppjieba/dict')
        self.search_engine.loadFromTxt('question_answer.txt')
        self.search_engine.save('database.dat')
    self.predictor = Predictor()
def search_in_clueweb_with_expanded_query():
    search_engine = SearchEngine()
    search_engine.action_word = request.form['action_word']
    search_engine.hint_word = request.form['hint_word']
    search_engine.find_related_action_words_with_google()
    search_engine.count_action_words()
    search_engine.sort_action_words_count()
    search_engine.pick_sorted_action_words_more_than_1_count()
    results = []
    for elem in search_engine.sorted_action_words_more_than_1_count:
        elem['expanded_query'] = (search_engine.action_word + ' '
                                  + search_engine.hint_word + ' ' + elem['word'])
        url = ('http://karen.dl.local:8983/solr/ClueWeb09ja/select?q='
               + elem['expanded_query'] + '&wt=xml')
        web_page = WebPage(url)
        web_page.fetch_xml()
        web_page.pick_texts_to_result_pages()
        # Each query has its own result XML page; wrap each page's contents in
        # a WebPage object and keep them in result_pages for that query.
        for result_page in web_page.result_pages:
            result_page.set_lines_from_texts()
            result_page.set_line_nums_with_word(search_engine.action_word)
            result_page.set_line_nums_around_action_word()
            result_page.set_line_clusters_around_action_word()
        results.append({'pages': web_page.result_pages,
                        'expanded_query': elem['expanded_query']})
    return render_template('search_in_clueweb_with_expanded_query.tmpl', results=results)
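# The Solr URL above embeds the raw expanded query in the query string; this
# is a small sketch of the same request string built with proper URL encoding
# (urllib.parse is standard library). The host and core names come from the
# code above; the helper itself is hypothetical.
from urllib.parse import quote

def solr_query_url(expanded_query):
    base = 'http://karen.dl.local:8983/solr/ClueWeb09ja/select'
    return base + '?q=' + quote(expanded_query) + '&wt=xml'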
def test_searchengine_fields():
    '''
    Tests the fields of a SearchEngine after construction.
    '''
    test = SearchEngine(DIRECTORY)
    doc1 = Document(FILE1)
    doc2 = Document(FILE2)
    doc3 = Document(FILE3)
    assert_equals(3, test._total_documents)
    expected = {
        'i': [doc1, doc2, doc3],
        'like': [doc1, doc2],
        'apple': [doc1],
        'pie': [doc1, doc2],
        'is': [doc1, doc3],
        'super': [doc1],
        'duper': [doc1],
        'cool': [doc1, doc3],
        'also': [doc2],
        'chocolate': [doc2, doc3],
        'cake': [doc3],
        'guess': [doc3]
    }
    assert_equals(expected, test._all_terms)
def setUp(self):
    '''
    Create an indexer, write several text files, index two of them,
    then delete the indexer and create a SearchEngine instance.
    '''
    indexer = ToIndex('database')
    self.maxDiff = None
    text = open('test_text.txt', 'w')
    text.write('Ах, не говорите мне про Австрию! Я ничего не понимаю, может быть')
    text.close()
    another_text = open('another_test_text.txt', 'w')
    another_text.write('но Ах Австрия никогда не хотела и не хочет войны. Она предает нас')
    another_text.close()
    text1 = open('test_text1.txt', 'w')
    text1.write('ooh la la мама мыла раму123 frf34')
    text1.close()
    text2 = open('test_text2.txt', 'w')
    text2.write('мама мыла окно')
    text2.close()
    text3 = open('test_text3.txt', 'w')
    text3.write('мама мыла еще что-нибудь')
    text3.close()
    indexer.index_by_line('test_text.txt')
    indexer.index_by_line('another_test_text.txt')
    del indexer
    self.search_eng = SearchEngine('database')
def test_find_related_action_words_from_clueweb(self):
    se = SearchEngine()
    se.hint_word = '大学'
    se.action_word = '入学'
    se.set_solr_query()
    se.find_related_action_words_from_clueweb()
    self.assertEqual(len(se.result_pages), 1)
def test_find_pages(self):
    page_1 = WebPage('http://tradein.nissan.co.jp/')
    page_1.title = '自動車の下取りと売却'
    page_1.snippet = '自動車には下取りをする方法がけっこうある。'
    page_2 = WebPage('http://www.link-nexus.com/')
    page_2.title = '自動車の下取りと販売'
    page_2.snippet = 'あばばばばば'
    page_3 = WebPage('http://toyota.jp/service/tradein/dc/top')
    page_3.title = '下取り参考価格情報'
    page_3.snippet = '下取りと販売ですよプロデューサーさん'
    search_engine = SearchEngine()
    search_engine.material_pages = [page_1, page_2, page_3]
    search_engine.hint_word = '自動車'
    search_engine.action_word = '下取り'
    search_engine.find_pages_including_related_words()
    self.assertEqual(search_engine.result_pages[0], page_1)
    self.assertEqual(search_engine.result_pages[1], page_2)
    self.assertEqual(search_engine.result_pages[2], page_3)
    search_engine.count_action_words()
    self.assertEqual(search_engine.action_words_count, {'販売': 2, '売却': 1})
    search_engine.sort_action_words_count()
    self.assertEqual(search_engine.sorted_action_words,
                     [{'word': '販売', 'count': 2},
                      {'word': '売却', 'count': 1}])
def print_result(self):
    self.query = self.entry.get()
    my_search_engine = SearchEngine(self.db)
    self.top_level = Toplevel(self.tk)
    self.top_level.geometry("500x500")
    self.top_level.title("The Result")
    result = my_search_engine.print_results(self.query)
    return_doc = my_search_engine.get_return_doc()
    if len(self.query) == 0:
        label = Label(self.top_level, text="Please enter your text again")
    elif len(result) == 0:
        label = Label(self.top_level, text="No result found")
    else:
        label = Label(self.top_level,
                      text="These are the top 20 results for query " + self.query + ": ")
    label.pack()
    # Show each result with its snippet and bind a click handler to it.
    for i, single_result in enumerate(result):
        single_link_and_snippet = Label(
            self.top_level,
            text=single_result + " Its snippet: "
                 + self.snippet_dict[return_doc[i]] + "...",
            fg="blue",
            cursor="hand2")
        single_link_and_snippet.pack()
        single_link_and_snippet.bind("<Button-1>", self.show_content)
def test_search_engine():
    """Tests the SearchEngine class"""
    engine = SearchEngine('test_search')
    assert_equals(None, engine.search('asdf'))
    assert_equals(['test_search/file.txt'], engine.search('dog'))
    assert_equals(['test_search/file.txt', 'test_search/file1.txt'],
                  engine.search('test'))
def searchidlist(key, selected=0):
    global page
    global doc_id
    se = SearchEngine('../config.ini', 'utf-8')
    # search() returns a flag and a list of (docid, score) pairs.
    flag, id_scores = se.search(key, selected)
    doc_id = [i for i, s in id_scores]
    # TODO: filter out some results according to the user's rank.
    global dir_path, db_path, rank
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    doc_id_rank = []
    for id in doc_id:
        c.execute("SELECT * FROM news WHERE id=?", (id,))
        fetch = c.fetchone()
        if fetch[5] < rank:
            doc_id_rank.append(id)
    doc_id = doc_id_rank
    print("**rank**: ", rank)
    print("**doc_id**: ", doc_id)
    page = []
    for i in range(1, (len(doc_id) // 10 + 2)):
        page.append(i)
    return flag, page
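# A sketch of the same rank filter done with one query instead of one SELECT
# per id. It assumes, as the code above implies, that column 0 of news is the
# id and column 5 is the required rank; the helper name is hypothetical.
import sqlite3

def filter_ids_by_rank(db_path, ids, max_rank):
    if not ids:
        return []
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    placeholders = ','.join('?' * len(ids))
    c.execute('SELECT * FROM news WHERE id IN ({0})'.format(placeholders),
              tuple(ids))
    kept = {row[0] for row in c.fetchall() if row[5] < max_rank}
    conn.close()
    # Preserve the original ranking order.
    return [i for i in ids if i in kept]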
def main():
    # Instances
    generator = SearchEngineGenerator()
    search = SearchEngine()
    # Build the inverted index on request.
    ans = input("Do you want to build scores? [type 'y' to build or pass]: ")
    if ans == 'y':
        generator()
    while True:
        query = input('2018-26161> ')
        if query == '/quit':
            print("Okay bye!")
            break
        elif query.startswith('-run'):
            path = query.split()[1].strip()
            if not os.path.exists(path):
                print("The path is not valid.")
                continue
            simulator = ScheduleSimulator(path)
            simulator()
        results = search(query)
        for doc in results:
            print(search.result_formatting(doc))
def enhance(self, num=10):
    from search_engine import SearchEngine
    se = SearchEngine()
    print("*[Data Organizer] Downloading Google Search Results")
    results = se.get_data(queries=self.df['cleaned_text'])
    # Pair each source text with its (url, text) result tuples.
    results_t = []
    for source_text, r in results.items():
        tu = list(zip(r['url'], r['text']))
        results_t.append({'source_text': source_text, 'result': tu})
    # Attach the search results to the matching rows.
    self.df['google-search'] = [i for i in range(len(self.df))]
    for d, i in zip(self.df['cleaned_text'], range(len(self.df))):
        for r in results_t:
            if str(r['source_text']) == str(d):
                self.df['google-search'][i] = r['result']
    # Bucket each row's results by the domain of the result URL.
    q = []
    for gs in self.df['google-search']:
        types = dict((el, []) for el in list(self.domains))
        for result in gs:
            _type = self.in_domain(result[0])
            if _type != "":
                types[_type].append(result)
        q.append(types)
    self.df['types'] = q
    return self.df
def test_clueweb_search(self):
    se = SearchEngine()
    se.hint_word = '大学'
    se.action_word = '入学'
    se.set_solr_query()
    texts = se.clue_web_search(se.solr_query)
    self.assertEqual(len(texts), 50)
    self.assertTrue('大学' in texts[0] and '入学' in texts[0])
def test_search_engine_3():
    # Tests an empty directory.
    test = SearchEngine('test_dir3')
    assert_equals(1, test._num_docs)
    assert_equals({}, test._docs)
    assert_equals(0, test._calculate_idf('Samsung'))
    assert_equals(None, test.search('Samsung'))
def search(query, count):
    search_engine = request.forms.get('search_engine')
    engine = SearchEngine()
    if search_engine == 'google':
        pages = engine.google_search(query, count)
    elif search_engine == 'bing':
        pages = engine.bing_search(query, count)
    else:
        # Default to an empty result for an unrecognized engine name.
        pages = []
    return pages
def search_by_google_or_bing(request):
    query = request.form['query']
    search_engine_name = request.form['search_engine']
    search_engine = SearchEngine()
    if search_engine_name == 'google':
        pages = search_engine.google_search(query, 1)
    else:
        pages = search_engine.bing_search(query, 1)
    return pages
def setUp(self):
    self.engine = SearchEngine('database')
    self.engine.database.update(database)
    with open('test1.txt', 'w') as test:
        test.write(test1)
    with open('test2.txt', 'w') as test:
        test.write(test2)
def query_db(query, tables, Descriptions):
    files = get_codes(tables)
    desc_files = get_descriptions(Descriptions)
    docs = generate_documents(files, desc_files)
    engine = SearchEngine(docs)
    search_results = engine.search(query)
    return search_results
def start():
    search_engine = SearchEngine("data/")
    print("To finish, type 'exit'")
    request = input("User: ")
    while request != "exit":
        answer = search_engine.search(request)
        print_result(answer)
        request = input("User: ")
def main():
    test1 = Document('test_docs/test1.txt')
    test2 = Document('test_docs/test2.txt')
    test3 = Document('test_docs/test3.txt')
    test4 = Document('test_docs/test4.txt')
    test_search1 = SearchEngine('test_docs')
    test_document(test1, test2, test3, test4)
    test_single(test_search1)
    test_mulit(test_search1)
def test_searchengine_calculate_idf():
    '''
    Tests the _calculate_idf function.
    '''
    test = SearchEngine(DIRECTORY)
    assert_equals(0, test._calculate_idf('croissant'))
    assert_equals(0, test._calculate_idf('i'))
    assert_equals(math.log(1.5), test._calculate_idf('chocolate'))
    assert_equals(math.log(3), test._calculate_idf('apple'))
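# The assertions above pin down the IDF formula: the natural log of the total
# document count over the number of documents containing the term, and 0 for
# unseen terms ('croissant') or terms in every document ('i', since
# ln(3/3) == 0). A standalone sketch, with an index shaped like _all_terms:
import math

def calculate_idf(all_terms, total_documents, term):
    docs_with_term = len(all_terms.get(term, []))
    if docs_with_term == 0:
        return 0
    return math.log(total_documents / docs_with_term)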
def find_related_action_words():
    search_engine = SearchEngine()
    search_engine.action_word = request.form['action_word']
    search_engine.hint_word = request.form['hint_word']
    search_engine.find_related_action_words()
    search_engine.count_action_words()
    search_engine.sort_action_words_count()
    for elem in search_engine.sorted_action_words:
        elem['expanded_query'] = (search_engine.action_word + ' '
                                  + search_engine.hint_word + ' ' + elem['word'])
    return render_template('find_related_action_words.tmpl',
                           items=search_engine.result_pages,
                           sorted_action_words=search_engine.sorted_action_words,
                           found_pages=search_engine.material_pages,
                           query=search_engine.actual_query)
def searchidlist(key, selected=0):
    global page
    global doc_id
    se = SearchEngine('../config.ini', 'utf-8')
    # search() returns a flag and a list of (docid, score) pairs.
    flag, id_scores = se.search(key, selected)
    doc_id = [i for i, s in id_scores]
    page = []
    for i in range(1, (len(doc_id) // 10 + 2)):
        page.append(i)
    return flag, page
def test_search_engine_2():
    # Tests a large number of words.
    test = SearchEngine('test_dir2')
    assert_equals(4, test._num_docs)
    assert_equals(.287, test._calculate_idf('Samsung'))
    assert_equals(['ChromeBook.html', 'att.html', 'facebook.html'],
                  test.search('Samsung'))
    assert_equals(['ChromeBook.html', 'att.html', 'facebook.html'],
                  test.search('Samsung companies!'))
    assert_equals(None, test.search('adksamfk'))
def test_searchengine_search():
    '''
    Tests the search function.
    '''
    test = SearchEngine(DIRECTORY)
    expected = [FILE1]
    assert_equals(expected, test.search('super'))
    assert_equals(None, test.search('croissant'))
    expected = [FILE1, FILE2]
    assert_equals(expected, test.search('Apple pie'))
def __init__(self, sche_path, log_path='prj2.log', search_path='search.txt'):
    self.search_path = search_path
    self.db = Datasource()
    self.scheduler = ScheduleParser(sche_path)
    self.recovery = RecoveryManagement(log_path)
    self.log_writer = LogWriter(log_path)
    self.search = SearchEngine()
    self.generator = SearchEngineGenerator()
def setUp(self):
    index = indexer.Indexer('dbase')
    with open('test.txt', 'w') as f:
        f.write('this is\ntest')
    with open('tst.txt', 'w') as t:
        t.write('test')
    index.indexing_with_lines('test.txt')
    index.indexing_with_lines('tst.txt')
    del index
    self.s = SearchEngine('dbase')
def test_http_error_calling_keyword():
    """
    Tests re-raising of status code 400 when calling the keywords endpoint.
    """
    responses.add(responses.POST, constants.MOCK_URL_KEYWORDS, status=400)
    with mock.patch('env.get_keyword_endpoint',
                    return_value=constants.MOCK_URL_KEYWORDS):
        with pytest.raises(requests.HTTPError):
            SearchEngine().query("Some super real query")
    assert len(responses.calls) == 1
    assert responses.calls[0].request.url == constants.MOCK_URL_KEYWORDS
    assert responses.calls[0].response.status_code == 400
def test_search_engine_class():
    """
    Tests the correctness of the functions implemented in the
    SearchEngine class.
    """
    search_engine = SearchEngine('test-files')
    assert_equals(math.log(3 / 2), search_engine._calculate_idf('dogs'))
    fir_search = search_engine.search('dogs')
    assert_equals(['test-files/doc3.txt', 'test-files/doc1.txt'], fir_search)
    sec_search = search_engine.search('Cats very cute')
    assert_equals(['test-files/doc2.txt'], sec_search)
    thir_search = search_engine.search('happy')
    assert_equals(None, thir_search)
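# A self-contained sketch of the ranking behavior these tests exercise: score
# each document by summed TF-IDF over the query terms and return the matching
# paths best-first, or None when nothing matches. The index layout
# (term -> {path: term frequency}) is an assumption for illustration.
import math

def tfidf_search(index, total_docs, query):
    scores = {}
    for term in query.lower().split():
        postings = index.get(term, {})
        if not postings:
            continue
        idf = math.log(total_docs / len(postings))
        for path, tf in postings.items():
            scores[path] = scores.get(path, 0) + tf * idf
    if not scores:
        return None
    return sorted(scores, key=scores.get, reverse=True)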
def setUp(self):
    index = indexer.Indexer('dbase')
    with open('test.txt', 'w') as f:
        f.write('this is a test required for helping students create a test\n')
        f.write(' professor required to write a test first')
    with open('tst.txt', 'w') as t:
        t.write('test is required. On the other hand...')
    index.indexing_with_lines('test.txt')
    index.indexing_with_lines('tst.txt')
    del index
    self.s = SearchEngine('dbase')