Code Example #1
def busquedaPorAsuntoCuerpo(aBuscar):
    # Search the "asunto" and "cuerpo" fields of the index for the given text.
    ix = open_dir("index")
    qp = MultifieldParser(["asunto", "cuerpo"], schema=ix.schema)
    q = qp.parse(str(aBuscar))  # unicode() existed only on Python 2; str() is enough here
    s = ix.searcher()
    results = s.search(q)
    # The searcher is left open so the returned Results can still be iterated by the caller.
    return results
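The snippets on this page assume an index that was built elsewhere. Below is a minimal sketch of the setup Example #1 relies on; only the "index" directory and the asunto/cuerpo field names come from the example above, while the id field and the sample document are illustrative, and Whoosh 2.x is assumed.

import os

from whoosh.fields import ID, TEXT, Schema
from whoosh.index import create_in
from whoosh.qparser import MultifieldParser

# Schema with the two fields queried above; the id field is illustrative.
schema = Schema(id=ID(stored=True, unique=True),
                asunto=TEXT(stored=True),
                cuerpo=TEXT(stored=True))

os.makedirs("index", exist_ok=True)
ix = create_in("index", schema)
with ix.writer() as writer:
    writer.add_document(id=u"1", asunto=u"Factura", cuerpo=u"Texto de ejemplo")

# Query both fields, as busquedaPorAsuntoCuerpo() does.
with ix.searcher() as searcher:
    query = MultifieldParser(["asunto", "cuerpo"], schema=ix.schema).parse(u"ejemplo")
    for hit in searcher.search(query):
        print(hit["id"], hit["asunto"])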
Code Example #2
    def search(self, query):
        qp = MultifieldParser(['title', 'content'], schema=self.idx.schema)
        q = qp.parse(query)

        # Classify the query text and keep the confidence score of the predicted label.
        query_result = self.trainModel.predict(query)
        query_label = query_result['outputLabel']
        query_output_list = query_result['outputMulti']
        query_score = 0.0

        for output in query_output_list:
            if output['label'] == query_label:
                query_score = float(output['score'])

        #print("Query result : " + query_label + "\nQuery score : " + str(query_score))

        with self.idx.searcher() as searcher:
            search_results = searcher.search(q, limit=None)
            article_ids = self.find_matching_political_bias(
                search_results, query_label, query_score)

        for article_id in article_ids:
            #print(article_id)
            print(self.get_article_url(article_id))
            print(self.get_article_title(article_id))
            print(self.get_article_snippet(article_id))
            print(self.get_article_date(article_id))
Code Example #3
    def search(self, query_string):
        ix = open_dir(indexdir)

        with ix.searcher() as searcher:
            parser = MultifieldParser(['keywords', 'content', 'title'],
                                      ix.schema)
            query = parser.parse(query_string)

            results = searcher.search(query)

            # Collect the stored field values of each hit before the searcher closes.
            return [list(result.values()) for result in results]
Code Example #4
def apartado_a(palabras):
    ix = open_dir("Index")
    # Join the search words into one query string; OrGroup lets any of them match.
    query = " ".join(palabras)
    with ix.searcher() as searcher:
        parser = MultifieldParser(["titulo", "sinopsis"],
                                  ix.schema,
                                  group=qparser.OrGroup)
        query = parser.parse(query)
        print(query)
        results = searcher.search(query)
        for r in results:
            print(r)
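The group=qparser.OrGroup argument above switches the parser's default AND combination to OR. A small sketch of the difference, assuming the parser accepts schema=None so the parsed query tree can be inspected without an index; the field names are reused from the example.

from whoosh.qparser import MultifieldParser, OrGroup

# Without a schema the parser still builds a query tree, which is enough to compare grouping.
and_parser = MultifieldParser(["titulo", "sinopsis"], schema=None)
or_parser = MultifieldParser(["titulo", "sinopsis"], schema=None, group=OrGroup)

print(and_parser.parse(u"guerra paz"))  # every word must match (default AndGroup)
print(or_parser.parse(u"guerra paz"))   # any single word may match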
Code Example #5
File: main.py Project: adamyi/Geegle3
def search():
    start_time = time.time()
    form = request.form
    qstr = request.args.get('q')
    page = int(request.args.get('p', "1"))
    parser = MultifieldParser(['title', 'content'],
                              schema=ix.schema,
                              group=OrGroup)
    #termclass=MyFuzzyTerm)
    query = parser.parse(qstr)
    notes = []
    with ix.searcher() as searcher:
        corrected = searcher.correct_query(query, qstr)
        results = searcher.search(query)
        rel = results.estimated_length()
        if corrected.string.lower() != qstr.lower():
            crs = searcher.search(corrected.query)
            if crs.estimated_length() > rel:
                notes.append("Did you mean: " + corrected.string)
        results = searcher.search_page(query, page, terms=True)
        my_cf = ContextFragmenter(maxchars=20, surround=30, charlimit=256)
        results.order = SCORE
        results.fragmenter = my_cf
        # results.formatter = HtmlFormatter()
        rsp = [{
            "url": item["url"],
            "content": item.highlights("content"),
            "title": item["title"]
        } for item in results]
    # return json.dumps(rsp)
    # print(json.dumps(rsp))
    if rel == 0:
        notes.append("Sorry, no result for your query")
    else:
        elapsed_time = time.time() - start_time
        notes.append("%d results found in %.2f seconds" % (rel, elapsed_time))
    return render_template("result.html",
                           result=rsp,
                           query=qstr,
                           notes=notes,
                           nextpage=page + 1,
                           urlquery=urllib.quote_plus(qstr))
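Example #5 combines query correction ("did you mean") with paged results. The sketch below reproduces that flow on a throwaway in-memory index, assuming Whoosh 2.x; the field name, the sample document, and the misspelled input are illustrative.

from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

schema = Schema(content=TEXT(stored=True, spelling=True))  # spelling=True makes the field correctable
ix = RamStorage().create_index(schema)
with ix.writer() as writer:
    writer.add_document(content=u"python whoosh search engine")

qstring = u"pyton whoosh"  # user input with a misspelling (illustrative)
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse(qstring)
    corrected = searcher.correct_query(query, qstring)
    if corrected.query != query:
        print("Did you mean:", corrected.string)
    # search_page() takes a 1-based page number, like the `p` request argument above.
    page = searcher.search_page(corrected.query, 1, pagelen=10)
    for hit in page:
        print(hit["content"])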
Code Example #6
def run_query(query, index):
    """
      Queries the index for data with the given text query

        @param  query   The text query to perform on the indexed data
        @return			A list of HTMl string snippets to return
    """

    # Create a searcher object for this index
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents
    field_boosts = {
        'content': 1.0,
        'title': 3.0
    }
    query_parser = MultifieldParser(['content', 'title'], schema=index_schema, fieldboosts=field_boosts, group=OrGroup)

    # Build a query object from the query string
    query_object = query_parser.parse(query)

    # Build a spell checker in this index and add the "content" field to the spell checker
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the 'terms' that were found in the query string. This data can be used for highlighting the results
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short (rebuild the list; removing items while
    # iterating over the same list would skip elements)
    search_terms = [search_term for search_term in search_terms if len(search_term) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts. This fragmenter will split up excerpts
    #   by 'context' in the content
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which will dictate how to highlight the excerpts. In this case, we want to use HTML to
    #   highlight the results
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting the results
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result
        results.append({
            'content': highlight(search_result['content'], search_terms, analyzer, fragmenter, formatter),
            'url': search_result['url'],
            'title': search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count
Code Example #7
File: search.py Project: okute/ProfessionalWebsite
def run_query(query, index):
    """
      Queries the index for data with the given text query

        @param  query   The text query to perform on the indexed data
        @return			A list of HTMl string snippets to return
    """

    # Create a searcher object for this index
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents
    field_boosts = {'content': 1.0, 'title': 3.0}
    query_parser = MultifieldParser(['content', 'title'],
                                    schema=index_schema,
                                    fieldboosts=field_boosts,
                                    group=OrGroup)

    # Build a query object from the query string
    query_object = query_parser.parse(query)

    # Build a spell checker in this index and add the "content" field to the spell checker
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the 'terms' that were found in the query string. This data can be used for highlighting the results
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short (rebuild the list; removing items while
    # iterating over the same list would skip elements)
    search_terms = [search_term for search_term in search_terms if len(search_term) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts. This fragmenter will split up excerpts
    #   by 'context' in the content
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which will dictate how to highlight the excerpts. In this case, we want to use HTML to
    #   highlight the results
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting the results
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result
        results.append({
            'content': highlight(search_result['content'], search_terms,
                                 analyzer, fragmenter, formatter),
            'url': search_result['url'],
            'title': search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count
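The whoosh.spelling.SpellChecker class used in the last two examples belongs to Whoosh's older spelling API; newer releases replace it with per-field correctors obtained from the searcher. A hedged sketch of that replacement on an in-memory index; the field name and the misspelled words are illustrative.

from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage

schema = Schema(content=TEXT(stored=True))
ix = RamStorage().create_index(schema)
with ix.writer() as writer:
    writer.add_document(content=u"professional website search results")

with ix.searcher() as searcher:
    corrector = searcher.corrector("content")  # suggests from the terms indexed in "content"
    for word in [u"serch", u"resluts"]:        # illustrative misspellings
        print(word, "->", corrector.suggest(word, limit=3))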
Code Example #8
File: views.py Project: stikkas/rfss
def page_detail(request, pk):
    page = Page.objects.get(pk=pk)
    menu = page.menu
    comments = Comment.objects.filter(page=page).filter(
        visible=True).select_related('user')

    if menu.get_children():
        menu = menu.get_children()[0]

    menus = {'level2': [], 'level3': []}
    if menu.level == 2:
        for m in menu.get_siblings(include_self=True):
            if m == menu:
                m.active = True
            menus['level2'].append(m)
        menus['level3'] = menu.get_children()
    elif menu.level == 3:
        for m in menu.parent.get_siblings(include_self=True):
            if m == menu.parent:
                m.active = True
            menus['level2'].append(m)

        for m in menu.get_siblings(include_self=True):
            if m == menu:
                m.active = True
            menus['level3'].append(m)

    if page.keywords:
        # Open index dir
        ix = whoosh_index.open_dir(settings.PAGE_SEARCH_INDEX)
        # Make parser
        parser = MultifieldParser(['content', 'name'], schema=ix.schema)
        # Configure filter: restrict hits to the current page's region
        region_filter = Term('region', page.region.id)
        # Make query string from the page keywords
        qstr = page.keywords.replace('+', ' AND ').replace(' -', ' NOT ').replace(' | ', ' OR ')
        # Parse query string
        query = parser.parse(qstr)
        # And... search in index!
        with ix.searcher() as searcher:
            hits = searcher.search(query, filter=region_filter, limit=None)
            similar_pages = Page.objects.filter(
                pk__in=[h.get('id') for h in hits if h.get('id') != page.id]
            ).order_by('?')[:5]

    context = {
        'page': page,
        'comments': comments,
        'menus': menus,
        'menu': menu,
        'similar_pages': locals().get('similar_pages')  # None when the page has no keywords
    }

    # Comments
    if request.method == 'POST' and request.POST.get('submit_comment'):
        comment_form = CommentForm(request.POST)
        if comment_form.is_valid() and request.user.is_authenticated():
            comment_form.add_comment()
            context['comment_success'] = True
            # Clear form
            initial = {'user': request.user.id, 'page': page.id}
            comment_form = CommentForm(initial=initial)
    else:
        initial = {'user': request.user.id, 'page': page.id}
        comment_form = CommentForm(initial=initial)

    try:
        context['star_rating'] = StarRating.objects.get(page=page)
    except StarRating.DoesNotExist:
        context['star_rating'] = StarRating.objects.create(page=page)

    context['comment_form'] = comment_form

    return render(request, 'page_detail.html', context)
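The filter= argument in the view above narrows the hits without changing the parsed query. The following self-contained sketch mirrors that idea; the content/name/region field names echo the view, while the documents and region values are illustrative.

from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import MultifieldParser
from whoosh.query import Term

schema = Schema(name=TEXT(stored=True), content=TEXT(stored=True), region=ID(stored=True))
ix = RamStorage().create_index(schema)
with ix.writer() as writer:
    writer.add_document(name=u"Page A", content=u"tax rules", region=u"77")
    writer.add_document(name=u"Page B", content=u"tax rules", region=u"50")

with ix.searcher() as searcher:
    query = MultifieldParser(["content", "name"], schema=ix.schema).parse(u"tax")
    # Only documents whose region term matches the filter query are returned.
    for hit in searcher.search(query, filter=Term("region", u"77"), limit=None):
        print(hit["name"])  # -> Page A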
Code Example #9
def search_whoosh_files(filename_in):
    ngt1 = RegexTokenizer() | NgramFilter(4)
    l_aux_i = 0
    filename_aux = "dataset_match_" + filename_in
    ix1 = open_dir("index_" + filename_in)

    # aux max value for the progress bar (currently the same cap for every dataset)
    max_val = 3000000

    widgets = [
        'Progress Searching ' + filename_in + ': ',
        Percentage(), ' ',
        Bar(marker='0', left='[', right=']'), ' ',
        ETA(), ' '
    ]
    pbar = ProgressBar(widgets=widgets, maxval=max_val)  #454000
    pbar.start()

    with ix1.searcher() as searcher:
        parser = MultifieldParser(['title'], ix1.schema)
        parser.remove_plugin_class(qparser.WildcardPlugin)
        parser.remove_plugin_class(qparser.PlusMinusPlugin)
        with open("dataset_non_match_" + filename_in + ".csv_tmp",
                  'w',
                  encoding="utf-8") as inW2:
            with open("dataset_match_" + filename_in + ".csv",
                      encoding="utf8") as csvfile:
                for row in csvfile:
                    l_aux_i = l_aux_i + 1
                    if l_aux_i % 20000 == 0:
                        print("Index search" + str(l_aux_i))
                        pbar.update(l_aux_i)
                    l_row_idx = row.split('|')[0]
                    l_row_aux = row.split('|')[1]
                    search_list = [token.text for token in ngt1(l_row_aux)]
                    if len(search_list) > 0:
                        l_row_str = random.sample(search_list, 1)
                        query = parser.parse(l_row_str[0])
                        results = searcher.search(query)
                        results_aux = []
                        for result in results:
                            if result['id'] != l_row_idx:
                                results_aux.append(
                                    [result['id'], result['title']])
                        if results_aux:
                            shuffle(results_aux)
                            # Write up to three candidate matches, skipping a candidate whose
                            # id repeats the id of the candidate immediately before it.
                            previous_id = None
                            for candidate_id, candidate_title in results_aux[:3]:
                                if candidate_id != previous_id:
                                    line_new = "|".join(
                                        [l_row_idx, l_row_aux, candidate_id, candidate_title])
                                    inW2.write(line_new.strip() + '\n')
                                previous_id = candidate_id
        pbar.finish()
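The RegexTokenizer() | NgramFilter(4) chain defined at the top of this function is what produces the candidate search tokens. A short sketch of what that pipeline emits; the sample string is illustrative.

from whoosh.analysis import NgramFilter, RegexTokenizer

# RegexTokenizer splits on word characters; NgramFilter(4) keeps the 4-character n-grams of each token.
analyzer = RegexTokenizer() | NgramFilter(4)
print([token.text for token in analyzer(u"Johann Sebastian")])
# -> ['Joha', 'ohan', 'hann', 'Seba', 'ebas', 'bast', 'asti', 'stia', 'tian']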