Example no. 1
    def __init__(self, seeds=[]):
        """ init with seeds
        
        Init with seeds
        Create/Open a file for storing progress
        """
        #self.settings = settings
        url = next(iter(SEEDS))
        parsed_uri = urlparse(url)
        self.url_base = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

        self.searchengine = SearchEngine()
        self.searchengine.db_connect()

        self.crawl_book = CrawlWorkbook(path=WWW_DIR, url=seeds[0].url)
        self.crawl_book.wb_open()

        # /! Will have to go in a Frontera Middleware at some point
        # retrieve weighted_links, weighted_links_done...
        self.weighted_links = self.crawl_book.weighted_links
        self.weighted_links_done = self.crawl_book.weighted_links_done
        self.ignore_seeds = self.crawl_book.ignore_seeds
        self.ignored_pages = self.crawl_book.ignored_pages

        self.add_seeds(seeds)

        # build requests from weighted_links
        for wl in self.weighted_links:
            self.requests.append(requests.Request(url=wl.url))
        for wl in self.weighted_links_done:
            self.requests_done.append(requests.Request(url=wl.url))

        # ignore list
        ignore_suffixes = [
            '/es/', '/fr/', '/ca/', '/newsletters', '/2021/', '/2020/01/',
            '/2020/02/', '/2020/03/', '/2020/04/', '/2020/05/', '/2020/06/',
            '/2020/07/', '/2020/08/', '/2020/09/', '/2020/10/', '/2019/',
            '/2018/', '/2017/', '/2016/', '/2015/', '/2014/', '/section/world',
            '/video/world', '/section/food', '/section/arts',
            '/section/sports', '/section/science', '/section/books',
            '/section/travel', '/section/realestate', '/section/fashion',
            '/section/technology', '/section/politics', '/section/business',
            '/section/style', '/section/well', '/section/style/love',
            '/section/us', '/section/video', '/section/interactive',
            '/section/magazine', '/international', '/section/t-magazine',
            '/section/live', '/live', '/video', '/interactive',
            '/issue/fashion', '/subscription', '/subscriptions',
            '/section/business/dealbook', '/pages/business/dealbook',
            '/privacy'
        ]
        if not self.ignore_seeds:
            self.ignore_seeds = [
                WeightedLink(url=urljoin(self.url_base, suffix))
                for suffix in ignore_suffixes
            ]
            self.crawl_book.ws_writerows(
                WORKBOOK['crawler']['worksheet']['ignoreseeds']['TITLE'],
                self.ignore_seeds)
Example no. 2
def index():
    if request.method == 'POST':
        itemName = request.form['nm']
        se = SearchEngine()
        bigw = se.searchBigW(itemName)
        dj = se.searchDavidJones(itemName)
        return render_template('search.html', bigwItems=bigw, djItems=dj)
    else:
        return render_template('search.html', bigwItems=[], djItems=[])
 def test_already_extended_window(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_already_extended_window.txt', 'w')
     test_file_one.write('Alina Zakharova is a student!!')
     test_file_one.close()
     self.indexator.get_index_with_line('test_already_extended_window.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window = windows.Context_Window.get_window(
         'test_already_extended_window.txt', Position_Plus(0, 16, 18), 2)
     os.remove('test_already_extended_window.txt')
Example no. 4
class SearchController:
    def __init__(self):
        self.searchEngine = SearchEngine()

    def search(self, params):
        print(params)
        search_input = params.get('search_input')
        stop_words = params.get('stop_words')
        lemmatize = bool(params.get('lemmatize'))

        if stop_words is not None:
            search_input = self.searchEngine.deleteStopWords(
                search_input, int(stop_words))

        if lemmatize:
            result = self.searchEngine.search_lemmatized(search_input, 20)
        else:
            result = self.searchEngine.search_unlemmatized(search_input, 20)

        return result
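A minimal usage sketch for SearchController above; the params mapping carries the keys search() actually reads (search_input, stop_words, lemmatize), and the concrete values are illustrative only.

# Hypothetical driver for SearchController.
controller = SearchController()
result = controller.search({
    'search_input': 'alina zakharova student',
    'stop_words': '2',  # drop the 2 most frequent stop words first
    'lemmatize': '',    # empty string is falsy, so the unlemmatized path runs
})
print(result)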
 def test_myError_str_not_found(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_window_five.txt', 'w')
     test_file_one.write('Alina Zakharova is a student')
     test_file_one.close()
     self.indexator.get_index_with_line('test_window_five.txt')
     del self.indexator
     self.search = SearchEngine('database')
     with self.assertRaises(TypeError):
         result = windows.Context_Window.get_window(
             'test_window_five.txt', Position_Plus(3, 21, 28), 3)
     os.remove('test_window_five.txt')
Example no. 6
class Input:
    s = SearchEngine()

    def __init__(self):
        with open('inputLinks.txt', 'r') as f:
            links = f.read().replace('\n', '').replace('\r', '')
        links = links.split(',')
        self.s.init(links)

    def search(self, query):
        # run the query once, mirror the result into output.txt, then return it
        result = self.s.searchString(query)
        with open('output.txt', 'w+') as out:
            out.write(str(result))
        return result
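A short usage sketch for the Input wrapper above, assuming an inputLinks.txt with comma-separated seed links already sits next to the script; the query string is illustrative.

# Hypothetical driver; inputLinks.txt must already exist.
inp = Input()                 # reads the seed links and initialises the engine
hits = inp.search('student')  # also mirrors the result into output.txt
print(hits)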
 def test_highlight_window_one(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_highlight_window.txt', 'w')
     test_file_one.write('Alina Zakharova is a student')
     test_file_one.close()
     self.indexator.get_index_with_line('test_highlight_window.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window = windows.Context_Window.get_window('test_highlight_window.txt',
                                                Position_Plus(0, 6, 15), 1)
     result = window.highlight_window()
     output_string = 'Alina <b>Zakharova</b> is'
     self.assertEqual(result, output_string)
     os.remove('test_highlight_window.txt')
 def test_extend_window_rus_one(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_extend_window_rus.txt', 'w')
     test_file_one.write('Пьер с грустью слышал над собою насмешки.')
     test_file_one.close()
     self.indexator.get_index_with_line('test_extend_window_rus.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window = windows.Context_Window.get_window(
         'test_extend_window_rus.txt', Position_Plus(0, 0, 4), 1)
     window.extend_window()
     extended_window = Context_Window(
         'Пьер с грустью слышал над собою насмешки.',
         [Position_Plus(0, 0, 4)], 0, 41)
     self.assertEqual(window, extended_window)
     os.remove('test_extend_window_rus.txt')
 def test_extend_window(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_extend_window.txt', 'w')
     test_file_one.write('Alina Zakharova is a student!!')
     test_file_one.close()
     self.indexator.get_index_with_line('test_extend_window.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window = windows.Context_Window.get_window('test_extend_window.txt',
                                                Position_Plus(0, 6, 15), 1)
     window.extend_window()
     extended_window = Context_Window('Alina Zakharova is a student!!',
                                      [Position_Plus(0, 6, 15)], 0, 30)
     self.assertEqual(window, extended_window)
     os.remove('test_extend_window.txt')
 def test_not_crossed(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_not_crossed_window.txt', 'w')
     test_file_one.write('The girl named Alina Zakharova is a student')
     test_file_one.close()
     self.indexator.get_index_with_line('test_not_crossed_window.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window_A = windows.Context_Window.get_window(
         'test_not_crossed_window.txt', Position_Plus(0, 31, 33), 1)
     window_B = windows.Context_Window.get_window(
         'test_not_crossed_window.txt', Position_Plus(0, 8, 14), 1)
     crossed_AB = window_A.is_crossed(window_B)
     self.assertEqual(False, crossed_AB)
     os.remove('test_not_crossed_window.txt')
Example no. 11
 def page_save_to_file(self, request, soup):
     """
     Save page to a #.html file where # is a hash of the URL
     (via SearchEngine.hash_url).
     TODO: save request
     """
     # e.g. /var/www/html/apple.com/256.html
     url_hash = SearchEngine.hash_url(request.url)
     file_name = os.path.join(HTML_DIR,
                              urlsplit(request.url).netloc,
                              str(url_hash) + '.html')
     print(file_name)
     os.makedirs(os.path.join(HTML_DIR,
                              urlsplit(request.url).netloc),
                 exist_ok=True)
     with open(file_name, 'w') as f:
         f.write(soup.prettify())
 def test_extend_window_rus_two(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_extend_window_rus.txt', 'w')
     test_file_one.write(
         'С разных сторон виднелись пожары. Пьер тогда еще не понимал значения сожженной Москвы и с ужасом смотрел на эти пожары.'
     )
     test_file_one.close()
     self.indexator.get_index_with_line('test_extend_window_rus.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window = windows.Context_Window.get_window(
         'test_extend_window_rus.txt', Position_Plus(0, 34, 38), 1)
     window.extend_window()
     extended_window = Context_Window(
         'С разных сторон виднелись пожары. Пьер тогда еще не понимал значения сожженной Москвы и с ужасом смотрел на эти пожары.',
         [Position_Plus(0, 34, 38)], 0, 119)
     self.assertEqual(window, extended_window)
     os.remove('test_extend_window_rus.txt')
 def test_extend_window_rus(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_extend_window_rus.txt', 'w')
     test_file_one.write(
         'Прогать очень сложно! Алина Захарова студент лингвист!! Аня любит немецкий. В Петербурге идет дождь.'
     )
     test_file_one.close()
     self.indexator.get_index_with_line('test_extend_window_rus.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window = windows.Context_Window.get_window(
         'test_extend_window_rus.txt', Position_Plus(0, 28, 36), 1)
     window.extend_window()
     extended_window = Context_Window(
         'Прогать очень сложно! Алина Захарова студент лингвист!! Аня любит немецкий. В Петербурге идет дождь.',
         [Position_Plus(0, 28, 36)], 22, 55)
     self.assertEqual(window, extended_window)
     os.remove('test_extend_window_rus.txt')
 def test_get_window_begin(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_window_three.txt', 'w')
     test_file_one.write('Alina Zakharova is a student')
     test_file_one.close()
     self.indexator.get_index_with_line('test_window_three.txt')
     del self.indexator
     self.search = SearchEngine('database')
     result = windows.Context_Window.get_window('test_window_three.txt',
                                                Position_Plus(0, 0, 5), 1)
     self.win = Context_Window('string', 'positions', 'win_start',
                               'win_end')
     self.win.string = 'Alina Zakharova is a student'
     self.win.positions = [Position_Plus(0, 0, 5)]
     self.win.win_start = 0
     self.win.win_end = 15
     self.assertEqual(result.string, self.win.string)
     self.assertEqual(result.positions, self.win.positions)
     self.assertEqual(result.win_start, self.win.win_start)
     self.assertEqual(result.win_end, self.win.win_end)
     self.assertEqual(result, self.win)
     os.remove('test_window_three.txt')
 def test_get_window_simple_plus(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_window_two.txt', 'w')
     test_file_one.write('Little Alina Zakharova is a linguist student)))')
     test_file_one.close()
     self.indexator.get_index_with_line('test_window_two.txt')
     del self.indexator
     self.search = SearchEngine('database')
     result = windows.Context_Window.get_window('test_window_two.txt',
                                                Position_Plus(0, 23, 25), 2)
     self.win = Context_Window('string', 'positions', 'win_start',
                               'win_end')
     self.win.string = 'Little Alina Zakharova is a linguist student)))'
     self.win.positions = [Position_Plus(0, 23, 25)]
     self.win.win_start = 7
     self.win.win_end = 36
     self.assertEqual(result.string, self.win.string)
     self.assertEqual(result.positions, self.win.positions)
     self.assertEqual(result.win_start, self.win.win_start)
     self.assertEqual(result.win_end, self.win.win_end)
     self.assertEqual(result, self.win)
     os.remove('test_window_two.txt')
 def test_united_window(self):
     self.indexator = Indexer('database')
     test_file_one = open('test_united_window.txt', 'w')
     test_file_one.write('The girl named Alina Zakharova is a student')
     test_file_one.close()
     self.indexator.get_index_with_line('test_united_window.txt')
     del self.indexator
     self.search = SearchEngine('database')
     window_A = windows.Context_Window.get_window('test_united_window.txt',
                                                  Position_Plus(0, 4, 20),
                                                  1)
     window_B = windows.Context_Window.get_window('test_united_window.txt',
                                                  Position_Plus(0, 9, 30),
                                                  1)
     window_A.get_united_window(window_B)
     self.win = windows.Context_Window(
         'The girl named Alina Zakharova is a student',
         [Position_Plus(0, 4, 20),
          Position_Plus(0, 9, 30)], 9, 20)
     self.assertEqual(window_A.string, self.win.string)
     self.assertEqual(window_A.win_start, self.win.win_start)
     self.assertEqual(window_A.win_end, self.win.win_end)
     os.remove('test_united_window.txt')
class BookInventory(object):

    _BOOK_META_ID_INDEX = 0
    _BOOK_META_TITLE_INDEX = 1  #question
    _BOOK_META_AUTHOR_INDEX = 2  #answer
    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()

    @timed
    def load_books(self):
        processor = BookDataPreprocessor()
        with open(self.filename) as catalog:
            for entry in catalog:
                book_desc = processor.preprocess(entry)
                metadata = ' '.join(book_desc[self._BOOK_META_TITLE_INDEX:])

                iid = book_desc[self._BOOK_META_ID_INDEX].strip()
                title = book_desc[self._BOOK_META_TITLE_INDEX].strip()
                author = book_desc[self._BOOK_META_AUTHOR_INDEX].strip()

                book = Book(iid, title, author, metadata)
                self.engine.add_object(book)

        self.engine.start()

    @timed
    def search_books(self, query, n_results=10):

        result = ''
        if len(query) > 0:
            result = self.engine.search(query, n_results)

        if len(result) > 0:
            return '\n'.join([str(indexable) for indexable in result])
        return self._NO_RESULTS_MESSAGE

    def books_count(self):
        return self.engine.count()
Example no. 18
def anytime_gbfs(initial_state, heur_fn, timebound=10):
    """Anytime greedy best-first search, as described in the HW1 handout.

    INPUT: a rush hour state that represents the start state and a timebound (number of seconds)
    OUTPUT: a goal state (if a goal is found), else False
    """
    search_engine = SearchEngine("best_first", "full")
    search_engine.init_search(initial_state, rushhour_goal_fn, heur_fn)

    gval_cost_bound = float("inf")
    time_left = timebound

    init_time = os.times()[0]
    solution = search_engine.search(
        timebound=time_left, costbound=(gval_cost_bound, float("inf"), float("inf"))
    )
    finish_time = os.times()[0]

    time_left -= finish_time - init_time

    if solution:
        gval_cost_bound = solution.gval
    else:
        return False

    while time_left > 0:
        init_time = os.times()[0]
        improved_solution = search_engine.search(
            timebound=time_left, costbound=(gval_cost_bound, float("inf"), float("inf"))
        )
        time_left -= os.times()[0] - init_time
        if improved_solution:
            gval_cost_bound = improved_solution.gval
            solution = improved_solution
        else:
            break

    return solution
Example no. 19
def anytime_weighted_astar(initial_state, heur_fn, weight=1.0, timebound=10):
    """Anytime weighted A*, as described in the HW1 handout.

    INPUT: a rush hour state that represents the start state and a timebound (number of seconds)
    OUTPUT: a goal state (if a goal is found), else False
    """
    time_left = timebound
    wrapped_fval_function = lambda sN: fval_function(sN, weight)
    se = SearchEngine("custom", "full")
    se.init_search(initial_state, rushhour_goal_fn, heur_fn, wrapped_fval_function)
    cost_bound = float("inf")
    init_time = os.times()[0]
    solution = se.search(
        timebound=time_left, costbound=(float("inf"), float("inf"), cost_bound)
    )
    finish_time = os.times()[0]
    time_left -= finish_time - init_time

    if solution:
        cost_bound = solution.gval + heur_fn(solution)
    else:
        return False

    while time_left > 0:
        init_time = os.times()[0]
        improved_solution = se.search(
            timebound=time_left, costbound=(float("inf"), float("inf"), cost_bound)
        )
        finish_time = os.times()[0]
        time_left -= finish_time - init_time
        if improved_solution:
            cost_bound = improved_solution.gval + heur_fn(improved_solution)
            solution = improved_solution
        else:
            break

    return solution
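A usage sketch for the two anytime searches above; make_init_state and heur_min_moves are hypothetical stand-ins for whatever the rush hour harness actually provides, and print_path() is assumed from the course's StateSpace API.

# Hypothetical driver for anytime_gbfs / anytime_weighted_astar.
state = make_init_state()          # hypothetical problem setup
goal = anytime_gbfs(state, heur_fn=heur_min_moves, timebound=5)
if goal:
    goal.print_path()              # assumed StateSpace helper
goal = anytime_weighted_astar(state, heur_fn=heur_min_moves,
                              weight=4.0, timebound=5)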
Example no. 20
 def do_POST(self):
     '''
     This function sends search results
     '''
     # TIME HAS STARTED
     start_time = time.time()
     form = FieldStorage(fp=self.rfile, headers=self.headers, environ={'REQUEST_METHOD':'POST'})
     query = str(form.getvalue('query'))
     limit = form.getvalue("limit")
     if not limit:
         limit = 3
     else:
         limit = int(limit)
     offset = form.getvalue("offset")
     if not offset or int(offset) < 0:
         offset = 0
     else:
         offset = int(offset)
     doc_act = form.getvalue("action")    
     if doc_act == "back" and offset != 0:
         offset = offset - limit   
     elif doc_act  == "forward":
         offset = offset + limit
     elif doc_act == "to the beginning":
         offset = 0
     # field, button and query
     self.send_response(200)
     self.send_header("Content-type", "text/html; charset=utf-8")
     self.end_headers()
     self.wfile.write(bytes("""
             <html>
                 <body>
                     <form method="post">
                         <input type="text" name="query" value="%s"/>
                         <input type="submit" name="search"  value="Search"/>
                         <br>
                         <br>
                         <label for="limit">
                         Docs per page
                         <input type="number" name="limit" placeholder="limit" value="%d"/>
                         </label>
                         <input type="hidden" name="offset" placeholder="offset"value="%d"/>
             """ % (query, limit, offset), encoding="utf-8"))
     # my list of (doclim,docset) pairs
     docs_list = []
     for num in range(limit + 1):
         # read the per-document offsets and limits for quotes
         quote_act = form.getvalue("action%d" % num)
         doclim = form.getvalue('doc%dlim' % num)
         docset = form.getvalue('doc%dset' % num)
         if not doclim or doclim == "None":
             doclim = 3
         else:
             doclim = int(doclim)
         if not docset or docset == "None":
             docset = 0
         else:
             docset = int(docset)
         if docset < 0:
             docset = 0
         # when paging to another document page, the quote offset and limit
         # must reset to their defaults: if I paged through quotes on page 1,
         # the quote "back" button became active, but after moving to page 2
         # the "back" button for quotes on the new page (which I have NOT
         # paged yet) must NOT be active
         if doc_act == "forward":
             docset = 0
         if quote_act == "back" and docset != 0:
             docset = docset - doclim
         elif quote_act == "forward":
             docset = docset + doclim
         elif quote_act == "to the beginning":
             docset = 0
         # add one to the limit so quotes can be paged: if one more quote
         # exists past the limit, forward paging is possible; otherwise the
         # button stays disabled
         docs_list.append((doclim + 1, docset))
     print(docs_list, 'docs_list')
     my_search = SearchEngine('TolstoyDataBase')
     # one more pair, used when looking ahead for the next document
     doc_limof = []
     for pair in docs_list:
         doc_limof.append(pair)
     doc_limof.append((3, 0))
     print(doc_limof, 'doc_limof')
     # quote limit + 1 here
     final = my_search.qulim_search_modified(query, 1, limit + 1, offset, doc_limof)
     # enable/disable conditions for the document buttons
     print(offset, 'offset')
     if offset == 0:
         self.wfile.write(bytes(""" <input type="submit" name="action" value="to the beginning" disabled/>
                                    <input type="submit" name="action" value="back" disabled/>""", encoding="UTF-8"))
     else:
         self.wfile.write(bytes(""" <input type="submit" name="action" value="to the beginning"/>
                                    <input type="submit" name="action" value="back"/>""", encoding="UTF-8"))
     print(len(final), 'len of final')
     if len(final.keys()) < limit + 1:
         self.wfile.write(bytes(""" <input type="submit" name="action" value="forward" disabled/>""", encoding="UTF-8"))
     else:
         self.wfile.write(bytes(""" <input type="submit" name="action" value="forward"/>""", encoding="UTF-8"))
     # the beginning of the ordered list
     self.wfile.write(bytes('<ol>', encoding="utf-8"))
     if not final:
         self.wfile.write(bytes('NOT FOUND, SORRY', encoding="utf-8"))
     # slice so only the first `limit` results are shown; the limit+1-th
     # result is only used to decide whether forward paging is possible
     for number, filename in enumerate(sorted(final)[:limit]):
         # give each document its own personal limit and offset
         quote_lim = doc_limof[number][0]
         quote_offset = doc_limof[number][1]
         self.wfile.write(bytes('<li><p>%s</p>' % filename, encoding="utf-8"))
         self.wfile.write(bytes("""
                                   <label for="doc%dlim"> Quotes per doc
                                   <input type="number" name="doc%dlim" value="%d"/>
                                   </label>
                                   <input type="hidden" name="doc%dset" value="%d"/>
                               """ % (number, number, quote_lim - 1, number, quote_offset), encoding="utf-8"))
         
         # enable/disable conditions for the quote buttons
         print(quote_offset, 'quote_offset')
         if quote_offset == 0:
             self.wfile.write(bytes(""" <input type="submit" name="action%d" value="to the beginning" disabled/>
                                    <input type="submit" name="action%d" value="back" disabled/>""" % (number, number), encoding="UTF-8"))
         else:
             self.wfile.write(bytes(""" <input type="submit" name="action%d" value="to the beginning"/>
                                    <input type="submit" name="action%d" value="back"/>""" % (number, number), encoding="UTF-8"))
         print(len(final[filename]), 'len(final[filename])')
         print(quote_lim, 'quote_lim')
         print(limit, 'limit')
         print(offset, 'offset')
         # quote_lim already includes the +1, so do not add it again
         if len(final[filename]) < quote_lim:
             self.wfile.write(bytes(""" <input type="submit" name="action%d" value="forward" disabled/>""" % number, encoding="UTF-8"))
         else:
             self.wfile.write(bytes(""" <input type="submit" name="action%d" value="forward"/>""" % number, encoding="UTF-8"))
         # the beginning of the unordered list
         self.wfile.write(bytes('<ul>', encoding="utf-8"))
         # print quotes up to the quote limit minus one
         for num, quote in enumerate(final[filename][:-1]):
             self.wfile.write(bytes('<li><p>%s</p></li>' % quote, encoding="utf-8"))
         self.wfile.write(bytes('</ul>', encoding="utf-8"))
     self.wfile.write(bytes("""</ol></form></body></html>""", encoding="utf-8"))
     print('time:', time.time() - start_time)
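do_POST above only runs inside an HTTP request handler; a minimal wiring sketch, assuming the method lives on a BaseHTTPRequestHandler subclass (the handler class name and the port are hypothetical).

from http.server import HTTPServer

# RequestHandler is assumed to be the BaseHTTPRequestHandler subclass
# that defines the do_POST method shown above.
server = HTTPServer(('localhost', 8090), RequestHandler)
server.serve_forever()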
 def setUp(self):
     """
     Setup search engine that will be subjected to the tests.
     """
     self.engine = SearchEngine()
Example no. 22
class BookInventory(object):
    """Class representing a inventory of books.

    Args:
      filename (str): File name containing book inventory data.

    Attributes:
      filename (str): File name containing book inventory data.
      engine (SearchEngine): Object responsible for indexing book inventory data.

    """

    _BOOK_META_ID_INDEX = 0
    _BOOK_META_TITLE_INDEX = 1
    _BOOK_META_AUTHOR_INDEX = 2
    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()

    @timed
    def load_books(self):
        """Load books from a file name.

        This method leverages the iterable behavior of file objects,
        which use buffered I/O and memory management to handle large
        files efficiently.

        """
        logger.info('Loading books from file...')
        processor = BookDataPreprocessor()
        with open(self.filename) as catalog:
            for entry in catalog:
                book_desc = processor.preprocess(entry)
                metadata = ' '.join(book_desc[self._BOOK_META_TITLE_INDEX:])

                iid = book_desc[self._BOOK_META_ID_INDEX].strip()
                title = book_desc[self._BOOK_META_TITLE_INDEX].strip()
                author = book_desc[self._BOOK_META_AUTHOR_INDEX].strip()

                book = Book(iid, title, author, metadata)
                self.engine.add_object(book)

        self.engine.start()

    @timed
    def search_books(self, query, n_results=10):
        """Search books according to provided query of terms.

        The query is executed against the indexed books, and a list of books
        compatible with the provided terms is returned along with their tf-idf
        score.

        Args:
          query (str): Query string with one or more terms.
          n_results (int): Desired number of results.

        Returns:
          list of IndexableResult: List containing books and their respective
            tf-idf scores.

        """
        result = ''
        if len(query) > 0:
            result = self.engine.search(query, n_results)

        if len(result) > 0:
            return '\n'.join([str(indexable) for indexable in result])
        return self._NO_RESULTS_MESSAGE

    def books_count(self):
        """Return number of books already in the index.

        Returns:
          int: Number of books indexed.

        """
        return self.engine.count()
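A usage sketch for BookInventory; the catalog file name is illustrative, and its lines are assumed to be in whatever format BookDataPreprocessor expects (id, title and author fields).

# Hypothetical driver; books.csv is an assumed catalog file.
inventory = BookInventory('books.csv')
inventory.load_books()
print(inventory.books_count())
print(inventory.search_books('crime punishment', n_results=5))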
Example no. 23
 def __init__(self, filename):
     self.filename = filename
     self.engine = SearchEngine()
class SearchEngineTests(unittest.TestCase):
    """
    Test case for SearchEngine class.
    """

    def setUp(self):
        """
        Setup search engine that will be subjected to the tests.
        """
        self.engine = SearchEngine()

    def test_indexed_doc_count(self):
        """
        Test if the number of indexed objects is retrieved correctly.
        """
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")
        self.build_sample_index([sample1, sample2, sample3])
        self.assertEqual(self.engine.count(), 3)

    def test_existent_term_search(self):
        """
        Test if search is correctly performed.
        """
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")
        self.build_sample_index([sample1, sample2, sample3])

        expected_results = [
            IndexableResult(1.414214, sample1),
            IndexableResult(0.906589, sample2),
            IndexableResult(0.906589, sample3),
        ]

        results = self.engine.search("indexable metadata")
        self.assertListEqual(results, expected_results)

    def test_non_existent_term_search(self):
        """
        Test if search is correctly performed.
        """
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")
        self.build_sample_index([sample1, sample2, sample3])

        expected_results = []

        results = self.engine.search("asdasdasdas")
        self.assertListEqual(results, expected_results)

    def test_search_result_limit(self):
        """
        Test if search results can be limited.
        """
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")
        self.build_sample_index([sample1, sample2, sample3])

        expected_results = [
            IndexableResult(1.414214, sample1),
        ]

        results = self.engine.search("indexable metadata", 1)
        self.assertListEqual(results, expected_results)

    def build_sample_index(self, objects):
        for indexable in objects:
            self.engine.add_object(indexable)
        self.engine.start()
Example no. 26
class wordInventory(object):
    """Class representing a inventory of words.

    Args:
      filename (str): File name containing word inventory data.

    Attributes:
      filename (str): File name containing word inventory data.
      engine (SearchEngine): Object responsible for indexing word inventory data.

    """
    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()
        # self.engine2 = SearchEngine()

    @timed
    def init_engine(self, isFromFile=True, isBinaryWord=False):
        """Load words from a file name.

        This method leverages the iterable behavior of file objects,
        which use buffered I/O and memory management to handle large
        files efficiently.

        """
        if isFromFile:
            self.loadFromFile(isBinaryWord)
        else:
            logger.info('Loading words from file...')
            iid = 1
            for parent, dirnames, fnames in os.walk(self.filename):
                for fname in fnames:
                    fname2 = './Reuters/' + fname
                    # print fname
                    word = open(fname2).read()
                    # temp = fname.rstrip('.html').split('-')
                    # if len(temp)<=1:
                    # continue
                    # singer = temp[0]
                    # title = temp[1]
                    # metadata = singer + ' ' + title

                    # wordobject = Word(iid, title, singer,word)
                    wordobject = Word(iid, word, isBinaryWord)
                    # songobject  = SongInfo(iid,title,singer,metadata)
                    self.engine.add_object(wordobject)
                    # self.engine2.add_object(songobject)
                    iid += 1

            self.engine.start(isBinaryWord)
            # self.engine2.start()
            self.saveToFile(isBinaryWord)

    @timed
    def search_words(self, query, n_results=10, choice=2, SYNONYM=False):
        """Search words according to provided query of terms.

        The query is executed against the indexed words, and a list of words
        compatible with the provided terms is returned along with their tf-idf
        score.

        Args:
          query (str): Query string with one or more terms.
          n_results (int): Desired number of results.

        Returns:
          list of IndexableResult: List containing words and their respective
            tf-idf scores.

        """
        result = ''
        # dictionary = self.engine.index.term_index.keys()
        if len(query) > 0:
            # checkSpelling(query, dictionary)
            parent, dirnames, fnames = list(os.walk(self.filename))[0]
            if choice == 1:
                result = self.engine.search_bool(query, n_results, SYNONYM)
                for res in result:
                    print(res, " ", fnames[res])
            elif choice == 2:
                result = self.engine.search(query, n_results, SYNONYM)
                for res in result:
                    print(res.indexable.iid - 1, " ",
                          fnames[res.indexable.iid - 1], " ", res.score)

        if len(result) > 0:
            # return '\n'.join([str(indexable) for indexable in result])
            return
        return self._NO_RESULTS_MESSAGE

    # def search_info(self, query, n_results=10):
    #     """Search song information according to provided query of terms.

    #     The query is executed against the indexed words, and a list of words
    #     compatible with the provided terms is return along with their tf-idf
    #     score.

    #     Args:
    #       query (str): Query string with one or more terms.
    #       n_results (int): Desired number of results.

    #     Returns:
    #       list of IndexableResult: List containing words and their respective
    #         tf-idf scores.

    #     """
    #     result = ''
    #     if len(query) > 0:
    #         result = self.engine2.search(query, n_results)

    #     if len(result) > 0:
    #         return '\n'.join([str(indexable) for indexable in result])
    #     return self._NO_RESULTS_MESSAGE

    def saveToFile(self, isBinaryWord):
        # pickle needs a binary file handle
        if isBinaryWord:
            file_name = 'test.engine'
        else:
            file_name = 'test_noBinary.engine'
        with open(file_name, 'wb') as fileObject:
            pickle.dump(self.engine, fileObject)

    # @timed
    def loadFromFile(self, isBinaryWord=False):
        if isBinaryWord:
            file_name = 'test.engine'
        else:
            file_name = 'test_noBinary.engine'
        with open(file_name, 'rb') as fileObject:
            self.engine = pickle.load(fileObject)

    def words_count(self):
        """
        Returns:
          int: Number of words indexed.
        """
        return self.engine.count()
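A usage sketch for wordInventory; the corpus directory is illustrative. Passing isFromFile=False builds the index from the documents and pickles it, so later runs can reload it with the default isFromFile=True.

# Hypothetical driver; ./Reuters is the assumed corpus directory.
inv = wordInventory('./Reuters')
inv.init_engine(isFromFile=False)            # build once, cache to disk
inv.search_words('oil price', n_results=10, choice=2)
print(inv.words_count())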
Example no. 27
class FrontierManager():
    """
    Frontier Manager
    
    seeds in request form
    """
    # seeds to start crawling
    seeds = []
    # links to crawl
    links = []
    # links crawled
    links_done = []

    # /! Will have to go in a Frontera Middleware at some point
    # weighted links to crawl
    weighted_links = None
    # weighted links crawled
    weighted_links_done = []
    # weighted
    ignore_seeds = []
    # weighted
    ignored_pages = []

    requests = []
    requests_done = []

    max_n_requests = 10

    searchengine = None

    crawl_book = None

    url_base = ''

    # /! def __init__(self, settings=SETTINGS, seeds=SETTINGS['SEEDS']):
    def __init__(self, seeds=[]):
        """ init with seeds
        
        Init with seeds
        Create/Open a file for storing progress
        """
        #self.settings = settings
        url = next(iter(SEEDS))
        parsed_uri = urlparse(url)
        self.url_base = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

        self.searchengine = SearchEngine()
        self.searchengine.db_connect()

        self.crawl_book = CrawlWorkbook(path=WWW_DIR, url=seeds[0].url)
        self.crawl_book.wb_open()

        # /! Will have to go in a Frontera Middleware at some point
        # retrieve weighted_links, weighted_links_done...
        self.weighted_links = self.crawl_book.weighted_links
        self.weighted_links_done = self.crawl_book.weighted_links_done
        self.ignore_seeds = self.crawl_book.ignore_seeds
        self.ignored_pages = self.crawl_book.ignored_pages

        self.add_seeds(seeds)

        # build requests from weighted_links
        for wl in self.weighted_links:
            self.requests.append(requests.Request(url=wl.url))
        for wl in self.weighted_links_done:
            self.requests_done.append(requests.Request(url=wl.url))

        # ignore list
        ignore_suffixes = [
            '/es/', '/fr/', '/ca/', '/newsletters', '/2021/', '/2020/01/',
            '/2020/02/', '/2020/03/', '/2020/04/', '/2020/05/', '/2020/06/',
            '/2020/07/', '/2020/08/', '/2020/09/', '/2020/10/', '/2019/',
            '/2018/', '/2017/', '/2016/', '/2015/', '/2014/', '/section/world',
            '/video/world', '/section/food', '/section/arts',
            '/section/sports', '/section/science', '/section/books',
            '/section/travel', '/section/realestate', '/section/fashion',
            '/section/technology', '/section/politics', '/section/business',
            '/section/style', '/section/well', '/section/style/love',
            '/section/us', '/section/video', '/section/interactive',
            '/section/magazine', '/international', '/section/t-magazine',
            '/section/live', '/live', '/video', '/interactive',
            '/issue/fashion', '/subscription', '/subscriptions',
            '/section/business/dealbook', '/pages/business/dealbook',
            '/privacy'
        ]
        if not self.ignore_seeds:
            self.ignore_seeds = [
                WeightedLink(url=urljoin(self.url_base, suffix))
                for suffix in ignore_suffixes
            ]
            self.crawl_book.ws_writerows(
                WORKBOOK['crawler']['worksheet']['ignoreseeds']['TITLE'],
                self.ignore_seeds)

    def add_seeds(self, seeds):
        """
        add seeds
        
        /! not append
        """
        self.seeds = seeds
        if not self.weighted_links:
            self.weighted_links = [
                WeightedLink(url=seed.url) for seed in self.seeds
            ]

    def request_error(self, request, error_code):
        """
        TODO
        """
        pass

    def start(self):
        # should open workbook as well
        self.searchengine.db_connect()

    def stop(self):
        # should save workbook, maybe init values with as well
        self.searchengine.db_close()

    def finished(self):
        """
        Quick check if crawling is finished. Called pretty often, please make sure calls are lightweight.
        """
        return not self.weighted_links

    def page_save_to_file(self, request, soup):
        """
        Save page to a #.html file where # is a hash of the URL
        (via SearchEngine.hash_url).
        TODO: save request
        """
        # e.g. /var/www/html/apple.com/256.html
        url_hash = SearchEngine.hash_url(request.url)
        file_name = os.path.join(HTML_DIR,
                                 urlsplit(request.url).netloc,
                                 str(url_hash) + '.html')
        print(file_name)
        os.makedirs(os.path.join(HTML_DIR,
                                 urlsplit(request.url).netloc),
                    exist_ok=True)
        with open(file_name, 'w') as f:
            f.write(soup.prettify())

    def page_crawled(self, response):
        """
        This method is called every time a page has been crawled.
        """
        self.requests_done.append(response.request)
        self.requests = [
            req for req in self.requests if req.url != response.request.url
        ]

        html_doc = response.text
        soup = BeautifulSoup(html_doc, 'html.parser')
        title = u''
        if soup is not None:
            titles = soup.find('title')
            if titles is not None:
                title = titles.string

        # extract the first weighted link matching response.request.url
        wl = next(
            (x for x in self.weighted_links if x.url == response.request.url),
            None)
        if wl:
            self.crawl_book.ws_writeln(
                WORKBOOK['crawler']['worksheet']['crawledpages']['TITLE'], wl)
            self.crawl_book.wb_save()

        # update weighted_links from requests
        self.weighted_links = [
            wl for wl in self.weighted_links if wl.url != response.request.url
        ]

        self.crawl_book.ws_writerows(
            WORKBOOK['crawler']['worksheet']['tocrawlpages']['TITLE'],
            self.weighted_links)

        self.page_save_to_file(request=response.request, soup=soup)

        print('Frontier: ', len(self.requests), 'pages to crawl -',
              len(self.requests_done), 'crawled pages -',
              len(self.ignored_pages), 'ignored pages')

    def get_next_requests(self, max_n_requests=MAX_N_REQUESTS):
        """
        Returns a list of next urls to be crawled.
        Parameters:	

        max_next_requests (int) – Maximum number of urls to be returned by this method.

        Returns:	

        list of weighted links.
        """
        # return first max_n_requests links
        return self.requests[:max_n_requests]
        #return self.weighted_links[:max_n_requests]

    def in_ignore_seeds(self, link):
        """
        returns True if link (request) is in self.ignore_seeds
        """
        return next(
            (x for x in self.ignore_seeds if link.url.startswith(x.url)), None)

    def in_ignored_pages(self, link):
        """
        returns True if link (request) is in self.ignored_pages
        """
        return next((x for x in self.ignored_pages if x.url == link.url), None)

    def links_extracted(self, request, links):
        """
        add links to crawl found in response (from request)
        """
        print('Frontier: links_extracted')
        for req in links:
            already_there = False
            if self.in_ignore_seeds(req):
                if not self.in_ignored_pages(req):
                    self.ignored_pages.append(WeightedLink(url=req.url))
            else:
                # extract first request matchinq request.url
                inreqs = next((x for x in self.requests if x.url == req.url),
                              None)
                if not inreqs:
                    # extract first request matchinq request.url
                    inreqsdone = next(
                        (x for x in self.requests_done if x.url == req.url),
                        None)
                    if not inreqsdone:
                        self.requests.append(req)
                        self.weighted_links.append(WeightedLink(url=req.url))

        wbwsname = WORKBOOK['crawler']['worksheet']['tocrawlpages']['TITLE']
        self.crawl_book.ws_writerows(wbwsname, self.weighted_links)
        wbwsname = WORKBOOK['crawler']['worksheet']['ignoredpages']['TITLE']
        self.crawl_book.ws_writerows(wbwsname, self.ignored_pages)
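Taken together, the frontier is meant to be driven by a fetch loop along these lines; Seed, fetch() and extract_links() are hypothetical stand-ins for the crawler's actual seed type, downloader and link extractor.

# Hypothetical crawl loop around FrontierManager.
frontier = FrontierManager(seeds=[Seed(url='https://www.nytimes.com/')])
frontier.start()
while not frontier.finished():
    for req in frontier.get_next_requests(max_n_requests=10):
        response = fetch(req)                  # hypothetical downloader
        frontier.page_crawled(response)
        frontier.links_extracted(req, extract_links(response))
frontier.stop()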
Example no. 28
from flask import Flask, render_template, request
from flask_bootstrap import Bootstrap
from data_source import DataSource
from search import SearchEngine
import urllib.parse

app = Flask(__name__)
bootstrap = Bootstrap(app)

ds = DataSource()
se = SearchEngine(ds)


@app.route('/', methods=['GET'])
def index():
    return render_template('index_jinja.html')


@app.route('/search', methods=['GET'])
def search():

    # get query
    query = request.args.get('query')

    if query is None:
        query = ''

    query = urllib.parse.unquote(query)

    # search result
    search_result = se.search(query)
Example no. 29
from flask import Flask, send_from_directory
from recipe import Recipe, load_recipe_from_json
from pathlib import Path
from recipe_index import RecipeIndex
from search import SearchEngine
from suggestions import SuggestionEngine
import traceback

app = Flask(__name__, static_url_path=None)

# TODO How can we make constants configurable?
REACT_BUILD_DIR = 'react-js/build'
DEFAULT_IMAGE_DIR = f'{str(Path.home())}/data/recipe-data/images'
DEFAULT_CONFIG_DIR = f'{str(Path.home())}/data/recipe-data/config'

INDEX = RecipeIndex(DEFAULT_CONFIG_DIR)
SEARCHER = SearchEngine(INDEX)
SUGGESTER = SuggestionEngine(INDEX)


@app.route('/', defaults={'path': 'index.html'})
@app.route('/<path:path>')
def catch_all(path):
    return send_from_directory(REACT_BUILD_DIR, path)


@app.route('/recipe/<id>')
def render_recipe(id):
    return send_from_directory(REACT_BUILD_DIR, 'index.html')


@app.route('/images/<image_name>')
def serve_image(image_name):
    # assumed continuation (the original snippet breaks off here):
    # serve recipe images from the configured image directory
    return send_from_directory(DEFAULT_IMAGE_DIR, image_name)
Example no. 30
    def run(self):
        self.m_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.m_socket.bind((self.m_ip, self.m_port))

        while True:
            self.m_socket.listen(3)
            connect, address = self.m_socket.accept()
            data = connect.recv(1024)
            request_lists = self.handle_http_request(data)
            source = '/index.html'
            if request_lists[0][1] != '/':
                source = request_lists[0][1]
                if '.html' in source:
                    self.content += 'Content-Type: text/html\r\n\r\n'
                elif '.css' in source:
                    self.content += 'Content-Type: text/css\r\n\r\n'
                elif '.js' in source:
                    self.content += 'Content-Type: application/javascript\r\n\r\n'
                elif '.jpg' in source:
                    self.content += 'Content-Type: image/jpeg\r\n\r\n'
                elif '.png' in source:
                    self.content += 'Content-Type: image/png\r\n\r\n'
                else:
                    self.content += 'Content-Type: text/html\r\n\r\n'
            else:
                self.content += 'Content-Type: text/html\r\n\r\n'
                source = '/index.html'
            print(source)

            string = ""
            if "?" in source:
                key_words = source[4:]
                print key_words
                key_words = key_words.replace("%", ' ')
                key_words = key_words.split(" ")
                key_words = key_words[1:]
                print key_words
                k = "".join(map(lambda x: chr(int(x, 16)), key_words))
                print k
                a = SearchEngine('F:\search_engine\json_data\\').run(k)
                url = "http://job.cqupt.edu.cn/#rec:"
                print a

                for i in a:
                    string += "<p>" + "<a href=" + url + i[0][:3] + ">" + i[
                        0][:3] + "</a>" + "</p>"
                string = '<html><title>result</title>' + string + "</html>"
                source = '/index.html'

            try:
                print(os.getcwd() + '/www' + source)
                fp = open(os.getcwd() + '/www' + source, 'r')
                f = fp.read()
                fp.close()
                if len(string) > 1:
                    f = string
                # sockets want bytes under Python 3
                connect.sendall((self.content + f).encode())
            except IOError:
                print("not found")
                fp = open(os.getcwd() + '/www' + '/404.html', 'r')
                f = fp.read()
                fp.close()
                if len(string) > 1:
                    f = string
                connect.sendall((self.content + f).encode())
            self.content = 'HTTP/1.x 200 OK\r\n'
            connect.close()
Example no. 31
from django.shortcuts import render_to_response
from django.http import HttpResponse
from search import SearchEngine
import json

# Create your views here.

g_Se = SearchEngine()


def home(request):
    return render_to_response("index.html")


def search(request):
    keyword = request.GET.get('keyword', '')
    tweets = g_Se.search(keyword)
    return HttpResponse(json.dumps({"keyword": keyword, "tweets": tweets}))


def search_range(request):
    lat = request.GET.get('lat', '')
    lon = request.GET.get('lon', '')
    tweets = g_Se.search_range(float(lat), float(lon))
    return HttpResponse(json.dumps({"tweets": tweets}))
 def setUp(self):
     """
     Setup search engine that will be subjected to the tests.
     """
     self.engine = SearchEngine()
class wordInventory(object):
    """Class representing a inventory of words.

    Args:
      filename (str): File name containing word inventory data.

    Attributes:
      filename (str): File name containing word inventory data.
      engine (SearchEngine): Object responsible for indexing word inventory data.

    """
    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()
        self.engine2 = SearchEngine()

    @timed
    def load_words(self):
        """Load words from a file name.

        This method leverages the iterable behavior of file objects,
        which use buffered I/O and memory management to handle large
        files efficiently.

        """
        logger.info('Loading words from file...')
        iid = 1
        for parent, dirnames, fnames in os.walk(self.filename):
            for fname in fnames:
                fname2 = './Reuters/' + fname
                word = open(fname2).read()
                # temp = fname.rstrip('.html').split('-')
                # if len(temp) <= 1:
                #     continue
                # singer = temp[0]
                # title = temp[1]
                # metadata = singer + ' ' + title

                # wordobject = Word(iid, title, singer, word)
                wordobject = Word(iid, word)
                # songobject = SongInfo(iid, title, singer, metadata)
                self.engine.add_object(wordobject)
                # self.engine2.add_object(songobject)
                iid += 1

        self.engine.start()
        # self.engine2.start()
        self.saveToFile()

    @timed
    def search_words(self, query, n_results=10):
        """Search words according to provided query of terms.

        The query is executed against the indexed words, and a list of words
        compatible with the provided terms is returned along with their tf-idf
        score.

        Args:
          query (str): Query string with one or more terms.
          n_results (int): Desired number of results.

        Returns:
          list of IndexableResult: List containing words and their respective
            tf-idf scores.

        """
        result = ''
        # dictionary = self.engine.index.term_index.keys()
        if len(query) > 0:
            # checkSpelling(query, dictionary)
            result = self.engine.search(query, n_results)
            print(result)

        if len(result) > 0:
            # return '\n'.join([str(indexable) for indexable in result])
            return
        return self._NO_RESULTS_MESSAGE

    # def search_info(self, query, n_results=10):
    #     """Search song information according to provided query of terms.

    #     The query is executed against the indexed words, and a list of words
    #     compatible with the provided terms is return along with their tf-idf
    #     score.

    #     Args:
    #       query (str): Query string with one or more terms.
    #       n_results (int): Desired number of results.

    #     Returns:
    #       list of IndexableResult: List containing words and their respective
    #         tf-idf scores.

    #     """
    #     result = ''
    #     if len(query) > 0:
    #         result = self.engine2.search(query, n_results)

    #     if len(result) > 0:
    #         return '\n'.join([str(indexable) for indexable in result])
    #     return self._NO_RESULTS_MESSAGE


    def saveToFile(self):
        # pickle needs a binary file handle
        with open('test.engine', 'wb') as fileObject:
            pickle.dump(self.engine, fileObject)

    def words_count(self):
        """
        Returns:
          int: Number of words indexed.
        """
        return self.engine.count()
 def __init__(self, filename):
     self.filename = filename
     self.engine = SearchEngine()
     self.engine2 = SearchEngine()
class SearchEngineTests(unittest.TestCase):
    """
    Test case for SearchEngine class.
    """
    def setUp(self):
        """
        Setup search engine that will be subjected to the tests.
        """
        self.engine = SearchEngine()

    def test_indexed_doc_count(self):
        """
        Test if the number of indexed objects is retrieved correctly.
        """
        sample1 = Indexable(1, 'this is an indexable metadata')
        sample2 = Indexable(2, 'this is an indexable super metadata')
        sample3 = Indexable(3, 'this is another indexable metadata')
        self.build_sample_index([sample1, sample2, sample3])
        self.assertEqual(self.engine.count(), 3)

    def test_existent_term_search(self):
        """
        Test if search is correctly performed.
        """
        sample1 = Indexable(1, 'this is an indexable metadata')
        sample2 = Indexable(2, 'this is an indexable super metadata')
        sample3 = Indexable(3, 'this is another indexable metadata')
        self.build_sample_index([sample1, sample2, sample3])

        expected_results = [
            IndexableResult(1.414214, sample1),
            IndexableResult(0.906589, sample2),
            IndexableResult(0.906589, sample3),
        ]

        results = self.engine.search('indexable metadata')
        self.assertListEqual(results, expected_results)

    def test_non_existent_term_search(self):
        """
        Test if search is correctly performed.
        """
        sample1 = Indexable(1, 'this is an indexable metadata')
        sample2 = Indexable(2, 'this is an indexable super metadata')
        sample3 = Indexable(3, 'this is another indexable metadata')
        self.build_sample_index([sample1, sample2, sample3])

        expected_results = []

        results = self.engine.search('asdasdasdas')
        self.assertListEqual(results, expected_results)

    def test_search_result_limit(self):
        """
        Test if search results can be limited.
        """
        sample1 = Indexable(1, 'this is an indexable metadata')
        sample2 = Indexable(2, 'this is an indexable super metadata')
        sample3 = Indexable(3, 'this is another indexable metadata')
        self.build_sample_index([sample1, sample2, sample3])

        expected_results = [
            IndexableResult(1.414214, sample1),
        ]

        results = self.engine.search('indexable metadata', 1)
        self.assertListEqual(results, expected_results)

    def build_sample_index(self, objects):
        for indexable in objects:
            self.engine.add_object(indexable)
        self.engine.start()