Code Example #1
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser


def busquedaPorAsuntoCuerpo(aBuscar):
    # Open the existing index and search both the "asunto" and "cuerpo" fields.
    ix = open_dir("index")
    qp = MultifieldParser(["asunto", "cuerpo"], schema=ix.schema)
    q = qp.parse(unicode(str(aBuscar)))
    # The searcher is left open so the returned results stay readable.
    s = ix.searcher()
    results = s.search(q)
    return results
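Every example on this page opens a pre-built index with open_dir and queries it with a MultifieldParser; none of them shows how that index was created. As a rough sketch only (the schema and the sample document below are assumptions; just the "asunto"/"cuerpo" field names and the "index" directory come from Code Example #1), the index behind this snippet could be built like this:

import os

from whoosh.fields import ID, TEXT, Schema
from whoosh.index import create_in

# Assumed schema: an identifier plus the two text fields the snippet searches.
schema = Schema(id=ID(stored=True, unique=True),
                asunto=TEXT(stored=True),
                cuerpo=TEXT(stored=True))

if not os.path.exists("index"):
    os.mkdir("index")
ix = create_in("index", schema)

writer = ix.writer()
writer.add_document(id=u"1",
                    asunto=u"Reunion semanal",
                    cuerpo=u"Agenda y notas de la reunion del equipo.")
writer.commit()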
Code Example #2
    def search(self, query):
        # Parse the query against both the title and content fields.
        qp = MultifieldParser(['title', 'content'], schema=self.idx.schema)
        q = qp.parse(query)

        # Classify the query text to get its predicted political-bias label
        # and the score the model assigned to that label.
        query_result = self.trainModel.predict(query)
        query_label = query_result['outputLabel']
        query_output_list = query_result['outputMulti']
        query_score = 0.0

        for output in query_output_list:
            if output['label'] == query_label:
                query_score = float(output['score'])

        # Search the index, then keep only the articles whose bias matches
        # the predicted label/score of the query.
        with self.idx.searcher() as searcher:
            search_results = searcher.search(q, limit=None)
            article_ids = self.find_matching_political_bias(
                search_results, query_label, query_score)

        for article_id in article_ids:
            print(self.get_article_url(article_id))
            print(self.get_article_title(article_id))
            print(self.get_article_snippet(article_id))
            print(self.get_article_date(article_id))
Code Example #3
    def search(self, query_string):
        # Search the keywords, content and title fields of the index.
        ix = open_dir(indexdir)

        with ix.searcher() as searcher:
            parser = MultifieldParser(['keywords', 'content', 'title'],
                                      ix.schema)
            query = parser.parse(query_string)

            results = searcher.search(query)

            # Copy the stored field values out before the searcher is closed.
            return [list(result.values()) for result in results]
Code Example #4
    def search(self, query):
        ix = open_dir("news")
        # "s" carries the search string, "w" selects the weighting model.
        if "s" in query.keys():
            s = query["s"][0]
        else:
            s = ""

        if "w" in query.keys():
            if query["w"][0] == "c":
                w = scoring.TF_IDF()
            elif query["w"][0] == "b":
                w = scoring.BM25F()
            else:
                w = TimeWeight()
        else:
            w = OurWeight()

        # "r" collects result records, "s" maps query words to suggestions.
        ret = {"r": [], "s": {}}
        with ix.searcher(weighting=w) as searcher:

            parser = MultifieldParser(["t", "d"], ix.schema,
                                      group=OrGroup).parse(unicode(s, "UTF-8"))
            results = searcher.search(parser, limit=100)
            for r in results:

                ret["r"].append({
                    "t": r["t"],
                    "d": r["d"],
                    "p": r["time"],
                    "l": r["link"],
                    "e": r["tags"],
                    "r": r["tags"]
                })

            # Suggest corrections for each query word from the "d" field's terms.
            corrector = searcher.corrector("d")
            for m in s.split():
                for suggestion in corrector.suggest(m, limit=3):
                    if m not in ret["s"]:
                        ret["s"][m] = []
                    ret["s"][m].append(suggestion)

            print ret["s"]

        # Serialise the response as JSON and return it as a file-like body.
        f = StringIO()
        f.write(json.dumps(ret, indent=4, separators=(',', ': ')))
        length = f.tell()
        f.seek(0)
        self.send_response(200)
        encoding = sys.getfilesystemencoding()
        self.send_header("Content-type", "text/html; charset=%s" % encoding)
        self.send_header("Content-Length", str(length))
        self.end_headers()
        return f
Code Example #5
from whoosh import qparser
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser


def apartado_a(palabras):
    ix = open_dir("Index")
    # Join the given words into one query string; with OrGroup a document
    # matching any of the words in "titulo" or "sinopsis" is returned.
    query = " ".join(palabras)
    with ix.searcher() as searcher:
        parser = MultifieldParser(["titulo", "sinopsis"],
                                  ix.schema,
                                  group=qparser.OrGroup)
        query = parser.parse(query)
        print(query)
        results = searcher.search(query)
        for r in results:
            print(r)
Code Example #6
File: main.py  Project: adamyi/Geegle3
def search():
    start_time = time.time()
    form = request.form
    qstr = request.args.get('q')
    page = int(request.args.get('p', "1"))
    parser = MultifieldParser(['title', 'content'],
                              schema=ix.schema,
                              group=OrGroup)
    query = parser.parse(qstr)
    notes = []
    with ix.searcher() as searcher:
        # Offer a "did you mean" suggestion only when the corrected query
        # would return more results than the original one.
        corrected = searcher.correct_query(query, qstr)
        results = searcher.search(query)
        rel = results.estimated_length()
        if corrected.string.lower() != qstr.lower():
            crs = searcher.search(corrected.query)
            if crs.estimated_length() > rel:
                notes.append("Did you mean: " + corrected.string)
        # Fetch the requested page and highlight the matched terms.
        results = searcher.search_page(query, page, terms=True)
        my_cf = ContextFragmenter(maxchars=20, surround=30, charlimit=256)
        results.order = SCORE
        results.fragmenter = my_cf
        rsp = [{
            "url": item["url"],
            "content": item.highlights("content"),
            "title": item["title"]
        } for item in results]
    if rel == 0:
        notes.append("Sorry, no result for your query")
    else:
        elapsed_time = time.time() - start_time
        notes.append("%d results found in %.2f seconds" % (rel, elapsed_time))
    return render_template("result.html",
                           result=rsp,
                           query=qstr,
                           notes=notes,
                           nextpage=page + 1,
                           urlquery=urllib.quote_plus(qstr))
Code Example #7
File: search.py  Project: okute/ProfessionalWebsite
def run_query(query, index):
    """
      Queries the index for data with the given text query

        @param  query   The text query to perform on the indexed data
        @return			A list of HTMl string snippets to return
    """

    # Create a searcher object for this index
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents
    field_boosts = {'content': 1.0, 'title': 3.0}
    query_parser = MultifieldParser(['content', 'title'],
                                    schema=index_schema,
                                    fieldboosts=field_boosts,
                                    group=OrGroup)

    # Build a query object from the query string
    query_object = query_parser.parse(query)

    # Build a spell checker in this index and add the "content" field to the spell checker
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the 'terms' that were found in the query string. This data can be used for highlighting the results
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short (filtering into a new list avoids
    # skipping elements, which happens when removing while iterating)
    search_terms = [term for term in search_terms if len(term) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts. This fragmenter will split up excerpts
    #   by 'context' in the content
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which will dictate how to highlight the excerpts. In this case, we want to use HTML to
    #   highlight the results
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting the results
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result
        results.append({
            'content':
            highlight(search_result['content'], search_terms, analyzer,
                      fragmenter, formatter),
            'url':
            search_result['url'],
            'title':
            search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count
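Code Example #7 relies on the older SpellChecker API from whoosh.spelling, which later Whoosh releases replaced with the corrector-based spelling support already seen in Code Examples #4 and #6. A minimal sketch of getting the same per-term suggestions through the newer API (the helper name suggest_terms and the default field/limit are made up; only searcher.corrector and corrector.suggest come from the snippets above):

def suggest_terms(searcher, search_terms, fieldname="content", limit=3):
    # Map each query term to up to `limit` suggestions drawn from the
    # terms actually indexed in `fieldname`.
    corrector = searcher.corrector(fieldname)
    return {term: corrector.suggest(term, limit=limit) for term in search_terms}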
Code Example #8
import random

from progressbar import ETA, Bar, Percentage, ProgressBar
from whoosh import qparser
from whoosh.analysis import NgramFilter, RegexTokenizer
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser


def search_whoosh_files(filename_in):
    # Analyzer that splits each name into 4-grams; one of them is picked at
    # random and used as the search term.
    ngt1 = RegexTokenizer() | NgramFilter(4)
    l_aux_i = 0
    ix1 = open_dir("index_" + filename_in)

    # Approximate maximum value for the progress bar.
    max_val = 3000000

    widgets = [
        'Progress Searching ' + filename_in + ': ',
        Percentage(), ' ',
        Bar(marker='0', left='[', right=']'), ' ',
        ETA(), ' '
    ]
    pbar = ProgressBar(widgets=widgets, maxval=max_val)
    pbar.start()

    with ix1.searcher() as searcher:
        parser = MultifieldParser(['title'], ix1.schema)
        # Treat the n-grams literally: disable wildcard and +/- query syntax.
        parser.remove_plugin_class(qparser.WildcardPlugin)
        parser.remove_plugin_class(qparser.PlusMinusPlugin)
        with open("dataset_non_match_" + filename_in + ".csv_tmp",
                  'w',
                  encoding="utf-8") as inW2:
            with open("dataset_match_" + filename_in + ".csv",
                      encoding="utf8") as csvfile:
                for row in csvfile:
                    l_aux_i += 1
                    if l_aux_i % 20000 == 0:
                        print("Index search " + str(l_aux_i))
                        pbar.update(l_aux_i)
                    l_row_idx = row.split('|')[0]
                    l_row_aux = row.split('|')[1]
                    search_list = [token.text for token in ngt1(l_row_aux)]
                    if len(search_list) > 0:
                        # Query the index with one random 4-gram of the name.
                        query = parser.parse(random.choice(search_list))
                        results = searcher.search(query)
                        # Keep only hits that refer to a different record.
                        results_aux = [[r['id'], r['title']] for r in results
                                       if r['id'] != l_row_idx]
                        if len(results_aux) > 0:
                            random.shuffle(results_aux)
                            # Write up to three non-matching candidates per
                            # row, skipping consecutive duplicate ids.
                            prev_id = None
                            for cand_id, cand_title in results_aux[:3]:
                                if cand_id != prev_id:
                                    line_new = (l_row_idx + "|" + l_row_aux +
                                                "|" + cand_id + "|" + cand_title)
                                    inW2.write(line_new.strip() + '\n')
                                prev_id = cand_id
        pbar.finish()
Code Example #9
File: searcher.py  Project: svankie/clew
    def __init__(self):
        # Keep a long-lived searcher and a parser over the three fields;
        # DateParserPlugin lets the "date" field accept date expressions.
        index = open_dir("search/index")
        self.searcher = index.searcher()
        self.parser = MultifieldParser(["title", "description", "date"],
                                       index.schema)
        self.parser.add_plugin(DateParserPlugin())
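With the DateParserPlugin in place, the date field in a query string accepts date expressions rather than plain terms. A minimal usage sketch, assuming the schema's date field is a DATETIME field, that title is stored, and that obj is an instance of the class whose __init__ is shown above (the query string and limit are illustrative):

# Restrict results to documents whose date falls in 2011.
query = obj.parser.parse(u"date:2011")
results = obj.searcher.search(query, limit=10)
for hit in results:
    print(hit["title"])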