def setUp(self):
    '''
    Create an indexer and several text files, index two of the files,
    then delete the indexer. Finally create a SearchEngine() object.
    '''
    indexer = ToIndex('database')
    self.maxDiff = None
    text = open('test_text.txt', 'w')
    text.write('Ах, не говорите мне про Австрию! \
Я ничего не понимаю, может быть')
    text.close()
    another_text = open('another_test_text.txt', 'w')
    another_text.write('но Ах Австрия никогда не хотела и не хочет войны.\
Она предает нас')
    another_text.close()
    text1 = open('test_text1.txt', 'w')
    text1.write('ooh la la мама мыла раму123 frf34')
    text1.close()
    text2 = open('test_text2.txt', 'w')
    text2.write('мама мыла окно')
    text2.close()
    text3 = open('test_text3.txt', 'w')
    text3.write('мама мыла еще что-нибудь')
    text3.close()
    indexer.index_by_line('test_text.txt')
    indexer.index_by_line('another_test_text.txt')
    del indexer
    self.search_eng = SearchEngine('database')
def print_result(self):
    self.query = self.entry.get()
    my_search_engine = SearchEngine(self.db)
    self.top_level = Toplevel(self.tk)
    self.top_level.geometry("500x500")
    self.top_level.title("The Result")
    result = my_search_engine.print_results(self.query)
    return_doc = my_search_engine.get_return_doc()
    if len(self.query) == 0:
        label = Label(self.top_level, text="Please enter your text again")
    elif len(result) == 0:
        label = Label(self.top_level, text="No results found")
    else:
        label = Label(self.top_level,
                      text="These are the top 20 results for query "
                           + self.query + ": ")
    label.pack()
    # one clickable label per result, each showing the link and its snippet
    for i, single_result in enumerate(result):
        single_link_and_snippet = Label(
            self.top_level,
            text=single_result + " Its snippet: "
                 + self.snippet_dict[return_doc[i]] + "...",
            fg="blue", cursor="hand2")
        single_link_and_snippet.pack()
        single_link_and_snippet.bind("<Button-1>", self.show_content)
def __main__(argv):
    logger = logging.getLogger(__name__)
    logger.info("VECTOR MODEL INFORMATION RETRIEVAL SYSTEM START")

    gli = InvertedIndexGenerator(GLI_CONFIG_FILE)
    gli.run()
    gli.write_output()

    index = Indexer(INDEX_CONFIG_FILE, TfidfVectorizer)
    index.run()
    index.write_output()

    pc = QueryProcessor(PC_CONFIG_FILE)
    pc.run()
    pc.write_output()

    buscador = SearchEngine(BUSCA_CONFIG_FILE, TfidfVectorizer)
    buscador.run()
    buscador.write_output()

    avaliador = Evaluator(AVAL_CONFIG_FILE)
    avaliador.run()
    avaliador.write_output()

    logger.info("VECTOR MODEL INFORMATION RETRIEVAL SYSTEM DONE")
def test_search_engine():
    """Tests the SearchEngine class"""
    engine = SearchEngine('test_search')
    assert_equals(None, engine.search('asdf'))
    assert_equals(['test_search/file.txt'], engine.search('dog'))
    assert_equals(['test_search/file.txt', 'test_search/file1.txt'],
                  engine.search('test'))
def test_find_pages(self):
    page_1 = WebPage("http://tradein.nissan.co.jp/")
    page_1.title = "自動車の下取りと売却"
    page_1.snippet = "自動車には下取りをする方法がけっこうある。"
    page_2 = WebPage("http://www.link-nexus.com/")
    page_2.title = "自動車の下取りと販売"
    page_2.snippet = "あばばばばば"
    page_3 = WebPage("http://toyota.jp/service/tradein/dc/top")
    page_3.title = "下取り参考価格情報"
    page_3.snippet = "下取りと販売ですよプロデューサーさん"
    search_engine = SearchEngine()
    search_engine.material_pages = [page_1, page_2, page_3]
    search_engine.hint_word = "自動車"
    search_engine.action_word = "下取り"
    search_engine.find_pages_including_related_words()
    self.assertEqual(search_engine.result_pages[0], page_1)
    self.assertEqual(search_engine.result_pages[1], page_2)
    self.assertEqual(search_engine.result_pages[2], page_3)
    search_engine.count_action_words()
    self.assertEqual(search_engine.action_words_count, {"販売": 2, "売却": 1})
    search_engine.sort_action_words_count()
    self.assertEqual(search_engine.sorted_action_words,
                     [{"word": "販売", "count": 2}, {"word": "売却", "count": 1}])
def searchidlist(key, selected=0):
    global page
    global doc_id
    se = SearchEngine('../config.ini', 'utf-8')
    flag, id_scores = se.search(key, selected)
    # collect the doc ids from the (id, score) pairs
    doc_id = [i for i, s in id_scores]
    # filter out documents above the current user's rank
    global dir_path, db_path, rank
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    doc_id_rank = []
    for id in doc_id:
        c.execute("SELECT * FROM news WHERE id=?", (id, ))
        fetch = c.fetchone()
        if fetch[5] < rank:
            doc_id_rank.append(id)
    conn.close()
    doc_id = doc_id_rank
    print("**rank**: ", rank)
    print("**doc_id**: ", doc_id)
    # ten results per page, and always at least one page
    page = []
    for i in range(1, (len(doc_id) // 10 + 2)):
        page.append(i)
    return flag, page
def main():
    # instances
    generator = SearchEngineGenerator()
    search = SearchEngine()

    # optionally (re)build the inverted index scores
    ans = input('Do you want to build scores? [type \'y\' to build or pass]: ')
    if ans == 'y':
        generator()

    while True:
        query = input('2018-26161> ')
        if query == '/quit':
            print("Okay bye!")
            break
        elif query.startswith('-run'):
            path = query.split()[1].strip()
            if not os.path.exists(path):
                print("The path is not valid.")
                continue
            simulator = ScheduleSimulator(path)
            simulator()
        else:
            results = search(query)
            for doc in results:
                print(search.result_formatting(doc))
def enhance(self, num=10):
    from search_engine import SearchEngine
    se = SearchEngine()
    print("*[Data Organizer] Downloading Google Search Results")
    results = se.get_data(queries=self.df['cleaned_text'])
    # flatten the raw results into (url, text) tuples per source text
    results_t = []
    for r in results.items():
        tu = []
        for url, text in zip(r[1]['url'], r[1]['text']):
            tu.append((url, text))
        results_t.append({'source_text': r[0], 'result': tu})
    # attach each result list to the dataframe row with the matching text
    self.df['google-search'] = [i for i in range(len(self.df))]
    for d, i in zip(self.df['cleaned_text'], range(len(self.df))):
        for r in results_t:
            if str(r['source_text']) == str(d):
                self.df['google-search'][i] = r['result']
    # bucket each row's results by the domain type of their urls
    q = []
    for gs in self.df['google-search']:
        types = dict((el, []) for el in list(self.domains))
        for result in gs:
            _type = self.in_domain(result[0])
            if _type != "":
                types[_type].append(result)
        q.append(types)
    self.df['types'] = q
    return self.df
def search(query, count):
    search_engine = request.forms.get('search_engine')
    engine = SearchEngine()
    if search_engine == 'google':
        pages = engine.google_search(query, count)
    elif search_engine == 'bing':
        pages = engine.bing_search(query, count)
    else:
        # avoid an unbound 'pages' when no known engine is selected
        pages = []
    return pages
def test_search_engine_3():
    # tests an empty directory
    test = SearchEngine('test_dir3')
    assert_equals(1, test._num_docs)
    assert_equals({}, test._docs)
    assert_equals(0, test._calculate_idf('Samsung'))
    assert_equals(None, test.search('Samsung'))
def query_db(query, tables, Descriptions):
    files = get_codes(tables)
    desc_files = get_descriptions(Descriptions)
    docs = generate_documents(files, desc_files)
    engine = SearchEngine(docs)
    search_results = engine.search(query)
    return search_results
def start():
    searchEngine = SearchEngine("data/")
    print("To finish, type 'exit'")
    # reconstructed loop: read queries until the user types "exit"
    request = input("User: ")
    while request != "exit":
        answer = searchEngine.search(request)
        print_result(answer)
        request = input("User: ")
def search_by_google_or_bing(request):
    query = request.form["query"]
    search_engine_name = request.form['search_engine']
    search_engine = SearchEngine()
    if search_engine_name == 'google':
        pages = search_engine.google_search(query, 1)
    else:
        pages = search_engine.bing_search(query, 1)
    return pages
def setUp(self):
    self.engine = SearchEngine('database')
    self.engine.database.update(database)
    test = open("test1.txt", 'w')
    test.write(test1)
    test.close()
    test = open("test2.txt", 'w')
    test.write(test2)
    test.close()
class TestSearchEngine(unittest.TestCase):

    def setUp(self):
        '''
        Create an indexer and a text file, index the file, then delete
        the indexer. Finally create a SearchEngine() object.
        '''
        indexer = ToIndex('database')
        self.maxDiff = None
        text = open('test_text.txt', 'w')
        text.write('mama мыла ramu')
        text.close()
        indexer.index_by_line('test_text.txt')
        del indexer
        self.search_eng = SearchEngine('database')

    def tearDown(self):
        '''
        Destroy the SearchEngine() object, delete the 'database' files
        and remove the test text file.
        '''
        del self.search_eng
        for single_file in os.listdir():
            if single_file == "database" or single_file.startswith('database.'):
                os.remove(single_file)
        os.remove('test_text.txt')

    def test_empty_query(self):
        '''
        Test that ValueError is raised if the query is an empty string.
        '''
        with self.assertRaises(ValueError):
            self.search_eng.search("")

    def test_query_is_a_number(self):
        '''
        Test that TypeError is raised if the query is a number.
        '''
        with self.assertRaises(TypeError):
            self.search_eng.search(42)

    def test_program_runs_okay(self):
        '''
        Test that the program runs as expected given one word in the
        query and one file in the database.
        '''
        search_res = self.search_eng.search('мыла')
        ref_dict = {'test_text.txt': [PositionByLine(5, 9, 0)]}
        self.assertEqual(ref_dict, search_res)
def find_related_action_words():
    search_engine = SearchEngine()
    search_engine.action_word = request.form['action_word']
    search_engine.hint_word = request.form['hint_word']
    search_engine.find_related_action_words()
    search_engine.count_action_words()
    search_engine.sort_action_words_count()
    for elem in search_engine.sorted_action_words:
        elem['expanded_query'] = (search_engine.action_word + ' '
                                  + search_engine.hint_word + ' '
                                  + elem['word'])
    return render_template('find_related_action_words.tmpl',
                           items=search_engine.result_pages,
                           sorted_action_words=search_engine.sorted_action_words,
                           found_pages=search_engine.material_pages,
                           query=search_engine.actual_query)
class QASystem(object):

    def __init__(self):
        if exists('database.dat'):
            # deserializing the database is much faster.
            print('deserialize the QA database...')
            self.search_engine = SearchEngine('cc/cppjieba/dict', 'database.dat')
        else:
            # loading the database from txt is slower.
            print('load the QA database from txt format...')
            self.search_engine = SearchEngine('cc/cppjieba/dict')
            self.search_engine.loadFromTxt('question_answer.txt')
            self.search_engine.save('database.dat')
        self.predictor = Predictor()

    def query(self, question, count=3):
        answer_scores = self.search_engine.query(question, count)
        answer_totalscores = dict()
        for answer, match in answer_scores.items():
            _, relevance = self.predictor.predict(question, answer)
            answer_totalscores[answer] = (
                log(max(match[0], sys.float_info.min)) * relevance,
                match[1],
            )
        return answer_totalscores

    def updateDB(self, file):
        assert type(file) is str
        self.search_engine.loadFromTxt(file)
        self.search_engine.save('database.dat')
class QASystem(object):

    def __init__(self):
        if exists('database.dat'):
            # deserializing the database is much faster.
            print('deserialize the QA database...')
            self.search_engine = SearchEngine('cppjieba/dict', 'database.dat')
        else:
            # loading the database from txt is slower.
            print('load the QA database from txt format...')
            self.search_engine = SearchEngine('cppjieba/dict')
            self.search_engine.loadFromTxt('question_answer.txt')
            self.search_engine.save('database.dat')
        self.predictor = Predictor()

    def query(self, question, count=3):
        answer_scores = self.search_engine.query(question, count)
        answer_totalscores = dict()
        for answer, match in answer_scores.items():
            _, relevance = self.predictor.predict(question, answer)
            answer_totalscores[answer] = exp(match) + exp(relevance)
        # sort in descending order of total score
        answer_totalscores = dict(sorted(answer_totalscores.items(),
                                         key=operator.itemgetter(1),
                                         reverse=True))
        return answer_totalscores

    def updateDB(self, file):
        assert type(file) is str
        self.search_engine.loadFromTxt(file)
        self.search_engine.save('database.dat')
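# A minimal usage sketch for the QASystem above, assuming the cppjieba
# dictionary directory and 'question_answer.txt' are in place; the question
# string here is purely hypothetical.
if __name__ == '__main__':
    qa = QASystem()
    # query() returns a dict mapping each candidate answer to its total score
    for answer, score in qa.query('how do I reset my password?', count=3).items():
        print(score, answer)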
def searchidlist(key, selected=0):
    global page
    global doc_id
    se = SearchEngine('../config.ini', 'utf-8')
    flag, id_scores = se.search(key, selected)
    # collect the doc ids from the (id, score) pairs
    doc_id = [i for i, s in id_scores]
    # ten results per page, and always at least one page
    page = []
    for i in range(1, (len(doc_id) // 10 + 2)):
        page.append(i)
    return flag, page
def test_searchengine_search():
    '''
    Tests the search function.
    '''
    test = SearchEngine(DIRECTORY)
    expected = [FILE1]
    assert_equals(expected, test.search('super'))
    assert_equals(None, test.search('croissant'))
    expected = [FILE1, FILE2]
    assert_equals(expected, test.search('Apple pie'))
def setUp(self):
    index = indexer.Indexer('dbase')
    f = open('test.txt', 'w')
    f.write('this is\ntest')
    f.close()
    t = open('tst.txt', 'w')
    t.write('test')
    t.close()
    index.indexing_with_lines('test.txt')
    index.indexing_with_lines('tst.txt')
    del index
    self.s = SearchEngine('dbase')
def setUp(self):
    index = indexer.Indexer('dbase')
    f = open('test.txt', 'w')
    f.write('this is a test required for helping students create a test\n')
    f.write(' professor required to write a test first')
    f.close()
    t = open('tst.txt', 'w')
    t.write('test is required. On the other hand...')
    t.close()
    index.indexing_with_lines('test.txt')
    index.indexing_with_lines('tst.txt')
    del index
    self.s = SearchEngine('dbase')
def __init__(self):
    if exists('database.dat'):
        # deserializing the database is much faster.
        print('deserialize the QA database...')
        self.search_engine = SearchEngine('cppjieba/dict', 'database.dat')
    else:
        # loading the database from txt is slower.
        print('load the QA database from txt format...')
        self.search_engine = SearchEngine('cppjieba/dict')
        self.search_engine.loadFromTxt('question_answer.txt')
        self.search_engine.save('database.dat')
    self.predictor = Predictor()
def test_find_related_action_words_from_clueweb(self):
    se = SearchEngine()
    se.hint_word = '大学'
    se.action_word = '入学'
    se.set_solr_query()
    se.find_related_action_words_from_clueweb()
    self.assertEqual(len(se.result_pages), 1)
def setUp(self):
    '''
    Create an indexer and a text file, index the file, then delete
    the indexer. Finally create a SearchEngine() object.
    '''
    indexer = ToIndex('database')
    self.maxDiff = None
    text = open('test_text.txt', 'w')
    text.write('mama мыла ramu')
    text.close()
    indexer.index_by_line('test_text.txt')
    del indexer
    self.search_eng = SearchEngine('database')
def test_searchengine_fields():
    '''
    Tests the fields of a search engine after construction.
    '''
    test = SearchEngine(DIRECTORY)
    doc1 = Document(FILE1)
    doc2 = Document(FILE2)
    doc3 = Document(FILE3)
    assert_equals(3, test._total_documents)
    expected = {
        'i': [doc1, doc2, doc3],
        'like': [doc1, doc2],
        'apple': [doc1],
        'pie': [doc1, doc2],
        'is': [doc1, doc3],
        'super': [doc1],
        'duper': [doc1],
        'cool': [doc1, doc3],
        'also': [doc2],
        'chocolate': [doc2, doc3],
        'cake': [doc3],
        'guess': [doc3]
    }
    assert_equals(expected, test._all_terms)
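# For illustration only: the FILE1..FILE3 contents are not shown anywhere in
# this suite, but fixture files like the following (hypothetical wording)
# would be consistent with the expected _all_terms index above:
#   FILE1: "I like apple pie. Pie is super duper cool."
#   FILE2: "I also like chocolate pie."
#   FILE3: "Chocolate cake is cool, I guess."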
class TestSearchEngine(unittest.TestCase):

    def setUp(self):
        self.engine = SearchEngine('database')
        self.engine.database.update(database)
        test = open("test1.txt", 'w')
        test.write(test1)
        test.close()
        test = open("test2.txt", 'w')
        test.write(test2)
        test.close()

    def test_empty(self):
        result = self.engine.single_token_search('')
        self.assertEqual(result, {})

    def test_search_one(self):
        result = self.engine.single_token_search('for')
        self.assertEqual(result, {'test2.txt': [(15, 18)]})

    def test_search_many_one(self):
        result = self.engine.multiple_tokens_search('testing')
        self.assertEqual(
            result, {
                'test1.txt': [Position_with_lines(11, 18, 0)],
                'test2.txt': [Position_with_lines(0, 7, 0)]
            })

    def test_search_many_two(self):
        result = self.engine.multiple_tokens_search('testing ground')
        self.assertEqual(
            result, {
                'test1.txt': [
                    Position_with_lines(11, 18, 0),
                    Position_with_lines(19, 24, 0)
                ],
                'test2.txt': [
                    Position_with_lines(0, 7, 0),
                    Position_with_lines(8, 14, 0)
                ]
            })

    def tearDown(self):
        if 'test1.txt' in os.listdir(os.getcwd()):
            os.remove('test1.txt')
        if 'test2.txt' in os.listdir(os.getcwd()):
            os.remove('test2.txt')
def search(self, key_search):
    data = self.select('content')
    searcher = SearchEngine(key_search)
    result = []
    for d in data:
        content = d.get('content', ' ')
        if len(content.strip()) < 4:
            continue
        # skip documents that contain none of the query words
        if len(re.findall('|'.join(key_search.lower().split()),
                          content.lower())) == 0:
            continue
        titles = d.get('tieu_de', ' ').split('|')
        score = 0
        try:
            # LCS score against each title; later titles weigh more
            for i, title in enumerate(titles):
                score_tieu_de, _ = searcher.LCS4Sentence(u'' + title)
                score += (i + 1) / len(titles) * score_tieu_de * 2
            # best LCS score over the individual sentences of the content
            score_content = 0
            for sentence in content.split('.'):
                s_content, index = searcher.LCS4Sentence(u'' + sentence)
                if s_content > score_content:
                    score_content = s_content
            score_content = 2 * score_content
            # match indices for highlighting come from the whole content
            _, index = searcher.LCS4Sentence(u'' + content)
            score += score_content
            if score < 0.3:
                continue
            index = ';'.join(['{0}-{1}'.format(s, e) for s, e in index])
            result.append({
                'reference': d.get('reference'),
                'title': titles[-1],
                'content': content,
                'score': score,
                'index': index
            })
        except Exception as e:
            print(e)
            print('error')
            # rebuild the searcher in case it was left in a bad state
            searcher = SearchEngine(key_search)
    result = [r for r in result if r['score'] > 0.05]
    result = sorted(result, key=lambda r: r['score'], reverse=True)
    return result[:20]
def test_clueweb_search(self):
    se = SearchEngine()
    se.hint_word = '大学'
    se.action_word = '入学'
    se.set_solr_query()
    texts = se.clue_web_search(se.solr_query)
    self.assertEqual(len(texts), 50)
    # both the hint word and the action word should appear in the first text
    self.assertTrue('大学' in texts[0] and '入学' in texts[0])
def get_all_search_results(self, request, **values):
    if 'q' not in request.args:
        raise BadRequest('please provide a search query \'q\'')
    engine = SearchEngine()
    try:
        query = request.args['q']
        limit = int(request.args.get('limit', 10))
        service = request.args.get('service', 'all')
    except ValueError:
        return BadRequest('error parsing request. make sure limit is an integer')
    sc_res = []
    yt_res = []
    # query each of the services provided by the engine
    if service in ('all', 'soundcloud'):
        sc_res = engine.soundcloud_query(query, limit)
    if service in ('all', 'youtube'):
        yt_res = engine.youtube_query(query, limit)
    return self.render_template('results.txt', results=sc_res + yt_res)
def main(): print("Building SearchEngine") engine = SearchEngine() answer = 'y' while answer == 'y': term = input('Enter Search Term:') ranking = engine.search(term) print("Displaying results for " + "'" + term + "':") if ranking is None: print("No results") rank = 1 for doc in ranking: print(' ' + str(rank) + '. ' + doc) rank += 1 print() answer = '' while not (answer == 'y' or answer == 'n'): answer = input('Would you like to search another term (y/n) ')
def main():
    test1 = Document('test_docs/test1.txt')
    test2 = Document('test_docs/test2.txt')
    test3 = Document('test_docs/test3.txt')
    test4 = Document('test_docs/test4.txt')
    test_search1 = SearchEngine('test_docs')
    test_document(test1, test2, test3, test4)
    test_single(test_search1)
    test_mulit(test_search1)
def search_service(request):
    """
    Parses an http json request and returns the list of articles with their
    scores for the input query.

    Args:
        request: http POST request with a json body

    Returns:
        list of articles with their respective scores
    """
    json = request.get_json()
    logging.info(json)
    if "query" not in json:
        message = "ValueError: Expected 'query' field in json body missing"
        error = {"error": {"message": message}}
        logging.error(message)
        return error
    query = json["query"]
    search = SearchEngine(keywords_weight=constants.KEYWORDS_WEIGHT)
    try:
        score_per_article = search.query(query)
    except Exception as e:
        error = {
            "error": {
                "message": getattr(e, 'message', str(e)),
                "trace": traceback.format_exc()
            }
        }
        logging.error(error['error'])
        return error
    # sort the article ids by score in descending order
    articles_sorted = [
        k for k, v in sorted(
            score_per_article.items(), key=lambda item: item[1], reverse=True)
    ]
    response = {"articles": articles_sorted}
    logging.info(response)
    return response
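# A hedged sketch of the request/response shapes search_service handles,
# derived only from the code above (the query and article names are
# hypothetical examples):
#   request body:   {"query": "solar panel subsidies"}
#   success:        {"articles": ["article-17", "article-3", "article-8"]}
#   missing field:  {"error": {"message": "ValueError: Expected 'query' field in json body missing"}}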
def search_and_fetch_30_pages():
    search_engine = SearchEngine()
    pages = search_engine.google_search(QUERY, 3)
    # fetch the html of every returned page
    for page in pages:
        page.fetch_html()
    return pages
from search_engine import SearchEngine

if __name__ == '__main__':
    query = '花粉症対策'
    engine = SearchEngine()
    pages = engine.google_search('"' + 'で' + query + '"', 3)
    for page in pages:
        # show some context around the phrase, preferring the title over the
        # snippet; clamp the start so a hit near the front still prints fully
        try:
            i = page.title.index('で' + query)
            print(page.title[max(i - 10, 0):])
        except ValueError:
            try:
                i = page.snippet.index('で' + query)
                print(page.snippet[max(i - 10, 0):])
            except ValueError:
                pass
from search_engine import SearchEngine

search = SearchEngine()
welcome = "Search by \"Title\", \"Call Number\", \"Subjects\", \"Other\" or \"Quit\": "
while True:
    choice = raw_input(welcome).upper().strip()
    if choice == "QUIT":
        break
    search_string = raw_input("Please enter a search string: ")
    if choice == "TITLE":
        search.search_by_title(search_string)
    elif choice in ("CALL NUMBER", "CALLNUMBER"):
        search.search_by_call_number(search_string)
    elif choice in ("SUBJECTS", "SUBJECT"):
        search.search_by_subjects(search_string)
    elif choice == "OTHER":
        search.search_by_other(search_string)
def search_in_clueweb_with_expanded_query():
    search_engine = SearchEngine()
    search_engine.action_word = request.form['action_word']
    search_engine.hint_word = request.form['hint_word']
    search_engine.find_related_action_words_with_google()
    search_engine.count_action_words()
    search_engine.sort_action_words_count()
    search_engine.pick_sorted_action_words_more_than_1_count()
    results = []
    for elem in search_engine.sorted_action_words_more_than_1_count:
        elem['expanded_query'] = (search_engine.action_word + ' '
                                  + search_engine.hint_word + ' '
                                  + elem['word'])
        url = ('http://karen.dl.local:8983/solr/ClueWeb09ja/select?q='
               + elem['expanded_query'] + '&wt=xml')
        web_page = WebPage(url)
        web_page.fetch_xml()
        web_page.pick_texts_to_result_pages()
        # each query yields one result xml page; every entry of that page is
        # wrapped in a WebPage object and kept in result_pages for the query
        for result_page in web_page.result_pages:
            result_page.set_lines_from_texts()
            result_page.set_line_nums_with_word(search_engine.action_word)
            result_page.set_line_nums_around_action_word()
            result_page.set_line_clusters_around_action_word()
        results.append({'pages': web_page.result_pages,
                        'expanded_query': elem['expanded_query']})
    return render_template('search_in_clueweb_with_expanded_query.tmpl',
                           results=results)
def search():
    keyword = request.args.get('keyword', '')
    engine = SearchEngine()
    pages_searched = engine.search(keyword)
    return render_template(
        'results.html', keyword=keyword, pages_searched=pages_searched)
def google_search(self):
    engine = SearchEngine()
    pages = engine.google_search(self.query, self.search_num)
    return pages
""" fetchnetapps = fetchNetApps(applications) fetchnetapps.download_apps() fetchnetapps.decompress_apps() dbhandle = dbHandle(constants, functions, structures, applications) apps_in_analysis_db = dbhandle.apps_analysis_is_done() search_engine = SearchEngine(all_socket_api) #count all socket APIs under applications directory apps_dir = os.path.join(os.environ['PWD'],'applications') #buggy Here #for name in os.listdir(apps_dir): for conf_name in applications: #check whether it is the right application download based on configuration file # FIX ME: not try to get #for conf_name in applications: for name in os.listdir(apps_dir): path = os.path.join(apps_dir, name) #print name #print conf_name
def do_something():
    query = request.args.get('query', '')
    se = SearchEngine()
    results = se.make_query(query)
    return render_template('result.html', data=results)