Code example #1
def busquedaPorAsuntoCuerpo(aBuscar):
    # Search the "asunto" (subject) and "cuerpo" (body) fields of the index.
    ix = open_dir("index")
    qp = MultifieldParser(["asunto", "cuerpo"], schema=ix.schema)
    q = qp.parse(unicode(str(aBuscar)))  # Python 2: coerce the search input to unicode
    s = ix.searcher()
    results = s.search(q)
    return results
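
This snippet assumes a Whoosh index directory named "index" whose schema contains the "asunto" and "cuerpo" fields. Below is a minimal setup sketch under that assumption; the field types and the sample document are illustrative and not taken from the original project.

import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in, open_dir   # open_dir is what the snippet above calls
from whoosh.qparser import MultifieldParser    # likewise assumed by the snippet above

# Build the "index" directory that busquedaPorAsuntoCuerpo() expects to open.
schema = Schema(id=ID(stored=True), asunto=TEXT(stored=True), cuerpo=TEXT(stored=True))
if not os.path.exists("index"):
    os.mkdir("index")
ix = create_in("index", schema)
writer = ix.writer()
writer.add_document(id=u"1", asunto=u"Reunion semanal", cuerpo=u"Agenda de la reunion del lunes")
writer.commit()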
Code example #2
File: searcher.py, Project: svankie/clew
class EventSearcher(object):
    def search(self, query):

        data = None
        try:
            q = self.parser.parse(query)
            result = self.searcher.search(q)

            data = [{
                "title": res["title"],
                "description": res["description"],
                "id": res["id"],
                "date": res['date']
            } for res in result]
        finally:
            # better to close on __del__?
            self.searcher.close()

        return data

    def __init__(self):
        index = open_dir("search/index")
        self.searcher = index.searcher()
        self.parser = MultifieldParser(["title", "description", "date"],
                                       index.schema)
        self.parser.add_plugin(DateParserPlugin())
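
The DateParserPlugin added in __init__ lets the query string carry dates in the "date" field, which implies the schema under search/index defines "date" as a DATETIME field. A hedged usage sketch follows; the query text and index contents are assumptions.

# Hypothetical call against the index assumed above.
searcher = EventSearcher()
events = searcher.search(u"concert date:yesterday")  # DateParserPlugin interprets "yesterday"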
Code example #3
    def search(self, query):
        qp = MultifieldParser(['title', 'content'], schema=self.idx.schema)
        q = qp.parse(query)

        query_result = self.trainModel.predict(query)
        query_label = query_result['outputLabel']
        query_output_list = query_result['outputMulti']
        query_score = float()

        for output in query_output_list:
            if output['label'] == query_label:
                query_score = float(output['score'])

        #print("Query result : " + query_label + "\nQuery score : " + str(query_score))

        with self.idx.searcher() as searcher:
            search_results = searcher.search(q, limit=None)
            article_ids = self.find_matching_political_bias(
                search_results, query_label, query_score)

        for article_id in article_ids:
            #print(article_id)
            print(self.get_article_url(article_id))
            print(self.get_article_title(article_id))
            print(self.get_article_snippet(article_id))
            print(self.get_article_date(article_id))
Code example #4
    def search(self, query_string):
        ix = open_dir(indexdir)

        with ix.searcher() as searcher:
            parser = MultifieldParser(['keywords', 'content', 'title'],
                                      ix.schema)
            query = parser.parse(query_string)

            results = searcher.search(query)

            return [list(result.values()) for result in results]
Code example #5
    def buildQueryParser(self):

        # Set numerical scoring parameters
        CoordinateDescentScorer.baselineScoreWeight = self.baselineScoreWeight
        CoordinateDescentScorer.pageRankWeight = self.pageRankWeight
        CoordinateDescentScorer.pageRankScalingWeight = self.pageRankScalingWeight

        # Build parser
        keywordsQueryParser = MultifieldParser(['content','title', 'description', 'keywords', 'headers', 'yqlKeywords', 'expandedYqlKeywords'],
                                                    self.indexSchema, fieldboosts=self.weights, group=OrGroup)
        keywordsQueryParser.add_plugin(PlusMinusPlugin)
        
        return keywordsQueryParser
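
The fieldboosts=self.weights argument expects a mapping from field name to boost factor (compare the field_boosts dict in the run_query example further down). A sketch of what self.weights might hold for the fields listed above; the values are assumptions.

# Hypothetical contents of self.weights (field name -> boost factor).
weights = {
    'content': 1.0,
    'title': 3.0,
    'description': 2.0,
    'keywords': 2.0,
    'headers': 1.5,
    'yqlKeywords': 1.0,
    'expandedYqlKeywords': 1.0,
}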
Code example #6
def apartado_a(palabras):
    # Search "titulo" and "sinopsis" for any of the given words (OR query).
    ix = open_dir("Index")
    query = " ".join(palabras)
    with ix.searcher() as searcher:
        parser = MultifieldParser(["titulo", "sinopsis"],
                                  ix.schema,
                                  group=qparser.OrGroup)
        query = parser.parse(query)
        print(query)
        results = searcher.search(query)
        for r in results:
            print(r)
Code example #7
File: main.py, Project: adamyi/Geegle3
def search():
    start_time = time.time()
    form = request.form
    qstr = request.args.get('q')
    page = int(request.args.get('p', "1"))
    parser = MultifieldParser(['title', 'content'],
                              schema=ix.schema,
                              group=OrGroup)
    #termclass=MyFuzzyTerm)
    query = parser.parse(qstr)
    notes = []
    with ix.searcher() as searcher:
        corrected = searcher.correct_query(query, qstr)
        results = searcher.search(query)
        rel = results.estimated_length()
        if corrected.string.lower() != qstr.lower():
            crs = searcher.search(corrected.query)
            if crs.estimated_length() > rel:
                notes.append("Did you mean: " + corrected.string)
        results = searcher.search_page(query, page, terms=True)
        my_cf = ContextFragmenter(maxchars=20, surround=30, charlimit=256)
        results.order = SCORE
        results.fragmenter = my_cf
        # results.formatter = HtmlFormatter()
        rsp = [{
            "url": item["url"],
            "content": item.highlights("content"),
            "title": item["title"]
        } for item in results]
    # return json.dumps(rsp)
    # print(json.dumps(rsp))
    if rel == 0:
        notes.append("Sorry, no result for your query")
    else:
        elapsed_time = time.time() - start_time
        notes.append("%d results found in %.2f seconds" % (rel, elapsed_time))
    return render_template("result.html",
                           result=rsp,
                           query=qstr,
                           notes=notes,
                           nextpage=page + 1,
                           urlquery=urllib.quote_plus(qstr))
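
This view relies on names defined elsewhere in main.py. Below is a hedged guess at the module-level imports and index it assumes; the web framework (Flask) and the index path are assumptions, not confirmed by the snippet.

# Assumed context for search() above; adjust the names and path to the real project.
import time
import urllib                                   # Python 2: urllib.quote_plus
from flask import request, render_template      # assumption: a Flask app
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser, OrGroup
from whoosh.highlight import ContextFragmenter, SCORE

ix = open_dir("indexdir")                       # hypothetical index location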
Code example #8
    def search(self, query):
        ix = open_dir("news")
        if "s" in query.keys():
            s = query["s"][0]
        else:
            s = ""

        if "w" in query.keys():
            if query["w"] == "c":
                w = scoring.TF_IDF()
            elif query["w"] == "b":
                w = scoring.BM25F()
            else:
                w = TimeWeight()
        else:
            w = OurWeight()

        ret = {"r": [], "s": {}}
        with ix.searcher() as searcher:

            parser = MultifieldParser(["t", "d"], ix.schema,
                                      group=OrGroup).parse(unicode(s, "UTF-8"))
            results = searcher.search(parser, limit=100)
            for r in results:

                ret["r"].append({
                    "t": r["t"],
                    "d": r["d"],
                    "p": r["time"],
                    "l": r["link"],
                    "e": r["tags"],
                    "r": r["tags"]
                })

            corrector = searcher.corrector("d")
            for m in s.split():
                sug = corrector.suggest(m, limit=3)
                for suggestion in sug:
                    if m not in ret["s"]:
                        ret["s"][m] = []
                    ret["s"][m].append(suggestion)

            print ret["s"]
        f = StringIO()
        f.write(json.dumps(ret, indent=4, separators=(',', ': ')))
        length = f.tell()
        f.seek(0)
        self.send_response(200)
        encoding = sys.getfilesystemencoding()
        self.send_header("Content-type", "text/html; charset=%s" % encoding)
        self.send_header("Content-Length", str(length))
        self.end_headers()
        return f
Code example #9
    def buildQueryParser(self):

        headersQueryParser = MultifieldParser(['content', 'headers'], schema=self.indexSchema, group=OrGroup)
        headersQueryParser.add_plugin(PlusMinusPlugin)

        return headersQueryParser
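
Several of these parsers add PlusMinusPlugin, which lets the query string mark required terms with '+' and prohibited terms with '-' inside an otherwise OR-grouped query. A small self-contained sketch follows; the schema fields are assumptions.

from whoosh.fields import Schema, TEXT
from whoosh.qparser import MultifieldParser, OrGroup
from whoosh.qparser.plugins import PlusMinusPlugin

# Illustration of the plugin behaviour used by the buildQueryParser() variants.
schema = Schema(content=TEXT(), headers=TEXT())
parser = MultifieldParser(['content', 'headers'], schema=schema, group=OrGroup)
parser.add_plugin(PlusMinusPlugin())
print(parser.parse(u"+whoosh -lucene indexing"))  # '+' requires a term, '-' excludes it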
Code example #10
    def buildQueryParser(self):

        expandedYahooKeywordsQueryParser = MultifieldParser(['content', 'expandedYqlKeywords'], schema=self.indexSchema, group=OrGroup)
        expandedYahooKeywordsQueryParser.add_plugin(PlusMinusPlugin)

        return expandedYahooKeywordsQueryParser
Code example #11
File: search.py, Project: okute/ProfessionalWebsite
def run_query(query, index):
    """
      Queries the index for data with the given text query

        @param  query   The text query to perform on the indexed data
        @return         A list of HTML string snippets to return
    """

    # Create a searcher object for this index
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents
    field_boosts = {'content': 1.0, 'title': 3.0}
    query_parser = MultifieldParser(['content', 'title'],
                                    schema=index_schema,
                                    fieldboosts=field_boosts,
                                    group=OrGroup)

    # Build a query object from the query string
    query_object = query_parser.parse(query)

    # Build a spell checker in this index and add the "content" field to the spell checker
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the 'terms' that were found in the query string. This data can be used for highlighting the results
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short (build a new list; removing items while
    # iterating over the same list would skip elements)
    search_terms = [term for term in search_terms if len(term) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts. This fragmenter will split up excerpts
    #   by 'context' in the content
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which will dictate how to highlight the excerpts. In this case, we want to use HTML to
    #   highlight the results
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting the results
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result
        results.append({
            'content':
            highlight(search_result['content'], search_terms, analyzer,
                      fragmenter, formatter),
            'url':
            search_result['url'],
            'title':
            search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count
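
whoosh.spelling.SpellChecker is the older spelling API; newer Whoosh releases expose suggestions through the searcher instead (as the corrector("d") example above does). A hedged sketch of the equivalent suggestion step, reusing the same searcher and search_terms:

# Alternative to the SpellChecker block, using the searcher-based corrector
# available in newer Whoosh releases (sketch, not part of the original project).
suggestions = []
corrector = searcher.corrector('content')
for term in search_terms:
    suggestions.append(corrector.suggest(term, limit=3))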
Code example #12
File: views.py, Project: stikkas/rfss
def page_detail(request, pk):
    page = Page.objects.get(pk=pk)
    menu = page.menu
    comments = Comment.objects.filter(page=page).filter(
        visible=True).select_related('user')

    if menu.get_children():
        menu = menu.get_children()[0]

    menus = {'level2': [], 'level3': []}
    if menu.level == 2:
        for m in menu.get_siblings(include_self=True):
            if m == menu:
                m.active = True
            menus['level2'].append(m)
        menus['level3'] = menu.get_children()
    elif menu.level == 3:
        for m in menu.parent.get_siblings(include_self=True):
            if m == menu.parent:
                m.active = True
            menus['level2'].append(m)

        for m in menu.get_siblings(include_self=True):
            if m == menu:
                m.active = True
            menus['level3'].append(m)

    if page.keywords:
        # Open index dir
        ix = whoosh_index.open_dir(settings.PAGE_SEARCH_INDEX)
        # Make parser
        parser = MultifieldParser(['content', 'name'], schema=ix.schema)
        # Configure filter
        filter = Term('region', page.region.id)
        # Make query string
        qstr = page.keywords.replace('+', ' AND ').replace(' -', ' NOT ').replace(' | ', ' OR ')
        # Parse query string
        query = parser.parse(qstr)
        # And... search in index!
        with ix.searcher() as searcher:
            hits = searcher.search(query, filter=filter, limit=None)
            similar_pages = Page.objects.filter(
                pk__in=[h.get('id') for h in hits if h.get('id') != page.id]
            ).order_by('?')[:5]

    context = {
        'page': page,
        'comments': comments,
        'menus': menus,
        'menu': menu,
        'similar_pages': locals().get('similar_pages')
    }

    # Comments
    if request.method == 'POST' and request.POST.get('submit_comment'):
        comment_form = CommentForm(request.POST)
        if comment_form.is_valid() and request.user.is_authenticated():
            comment_form.add_comment()
            context['comment_success'] = True
            # Clear form
            initial = {'user': request.user.id, 'page': page.id}
            comment_form = CommentForm(initial=initial)
    else:
        initial = {'user': request.user.id, 'page': page.id}
        comment_form = CommentForm(initial=initial)

    try:
        context['star_rating'] = StarRating.objects.get(page=page)
    except StarRating.DoesNotExist:
        context['star_rating'] = StarRating.objects.create(page=page)

    context['comment_form'] = comment_form

    return render(request, 'page_detail.html', context)
Code example #13
    def buildQueryParser(self):

        descriptionQueryParser = MultifieldParser(['content', 'description'], schema=self.indexSchema, group=OrGroup)
        descriptionQueryParser.add_plugin(PlusMinusPlugin)
        
        return descriptionQueryParser
Code example #14
    def buildQueryParser(self):

        keywordsQueryParser = MultifieldParser(["content", "keywords"], schema=self.indexSchema, group=OrGroup)
        keywordsQueryParser.add_plugin(PlusMinusPlugin)

        return keywordsQueryParser
Code example #15
    def buildQueryParser(self):

        titleQueryParser = MultifieldParser(['content', 'title'], schema=self.indexSchema, group=OrGroup)
        titleQueryParser.add_plugin(PlusMinusPlugin)

        return titleQueryParser
Code example #16
File: searcher.py, Project: svankie/clew
    def __init__(self):
        index = open_dir("search/index")
        self.searcher = index.searcher()
        self.parser = MultifieldParser(["title", "description", "date"],
                                       index.schema)
        self.parser.add_plugin(DateParserPlugin())
Code example #17
def search_whoosh_files(filename_in):
    ngt1 = RegexTokenizer() | NgramFilter(4)
    l_aux_i = 0
    filename_aux = "dataset_match_" + filename_in
    ix1 = open_dir("index_" + filename_in)

    #aux max val for progress bar
    if filename_in == "jrc_person":
        max_val = 3000000
    else:
        max_val = 3000000

    widgets = [
        'Progress Searching ' + filename_in + ': ',
        Percentage(), ' ',
        Bar(marker='0', left='[', right=']'), ' ',
        ETA(), ' '
    ]
    pbar = ProgressBar(widgets=widgets, maxval=max_val)  #454000
    pbar.start()

    with ix1.searcher() as searcher:
        parser = MultifieldParser(['title'], ix1.schema)
        parser.remove_plugin_class(qparser.WildcardPlugin)
        parser.remove_plugin_class(qparser.PlusMinusPlugin)
        with open("dataset_non_match_" + filename_in + ".csv_tmp",
                  'w',
                  encoding="utf-8") as inW2:
            with open("dataset_match_" + filename_in + ".csv",
                      encoding="utf8") as csvfile:
                for row in csvfile:
                    l_aux_i = l_aux_i + 1
                    if l_aux_i % 20000 == 0:
                        print("Index search" + str(l_aux_i))
                        pbar.update(l_aux_i)
                    l_row_idx = row.split('|')[0]
                    l_row_aux = row.split('|')[1]
                    search_list = [token.text for token in ngt1(l_row_aux)]
                    if len(search_list) > 0:
                        l_row_str = random.sample(search_list, 1)
                        query = parser.parse(l_row_str[0])
                        results = searcher.search(query)
                        results_aux = []
                        for result in results:
                            if result['id'] != l_row_idx:
                                results_aux.append(
                                    [result['id'], result['title']])
                        if len(results_aux) > 0:
                            shuffle(results_aux)
                            line_new = (l_row_idx + "|" + l_row_aux + "|" +
                                        results_aux[0][0] + "|" + results_aux[0][1])
                            inW2.write(line_new.strip() + '\n')
                            if len(results_aux) > 1:
                                if results_aux[1][0] != results_aux[0][0]:
                                    line_new = (l_row_idx + "|" + l_row_aux + "|" +
                                                results_aux[1][0] + "|" + results_aux[1][1])
                                    inW2.write(line_new.strip() + '\n')
                            if len(results_aux) > 2:
                                if results_aux[2][0] != results_aux[1][0]:
                                    line_new = (l_row_idx + "|" + l_row_aux + "|" +
                                                results_aux[2][0] + "|" + results_aux[2][1])
                                    inW2.write(line_new.strip() + '\n')
        pbar.finish()
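
For reference, the RegexTokenizer() | NgramFilter(4) pipeline built at the top of this function splits each word into its 4-character n-grams, one of which is then sampled as the search term. A small standalone illustration:

from whoosh.analysis import RegexTokenizer, NgramFilter

# Token stream produced by the analyzer used in search_whoosh_files().
ngt1 = RegexTokenizer() | NgramFilter(4)
print([token.text for token in ngt1(u"whoosh")])  # ['whoo', 'hoos', 'oosh']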