Example #1
def search(query_text, expansion=False):
    with ix.searcher() as searcher:
        og = qparser.OrGroup.factory(0.8)
        parser = MultifieldParser(["question", "answer"], ix.schema, group=og)
        # parser.add_plugin(qparser.FuzzyTermPlugin())
        if expansion:
            query_expanded = ''
            # get synonyms for the query text
            syns = get_syns(query_text)

            for word in query_text.split():
                boosted_word = word + '^3'
                query_expanded = query_expanded + ' ' + boosted_word
            query_expanded = query_expanded + syns
            print(f'Search for: {query_expanded}')
            query = parser.parse(query_expanded)
        else:
            query = parser.parse(query_text)
            print(f'Search for: {query_text}')

        results = searcher.search(query, limit=1)

        # runtime = results.runtime
        # join the hit's fields into a single string so it can be used after the reader is closed
        answer = ''
        for passage in results:
            answer += ''.join([passage['question'], '\n', passage['answer']])
        return answer
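This snippet assumes a prebuilt Whoosh index `ix` over "question" and "answer" fields and a `get_syns` helper, neither of which is shown. A minimal sketch of that setup, with a purely hypothetical `get_syns` based on NLTK's WordNet, might look like this:

# Hypothetical setup for Example #1: `ix` and `get_syns` are not part of the
# snippet above; this is just one plausible way to provide them.
import os
from whoosh import index
from whoosh.fields import Schema, TEXT
from nltk.corpus import wordnet  # requires the WordNet corpus to be downloaded

schema = Schema(question=TEXT(stored=True), answer=TEXT(stored=True))
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = index.create_in("indexdir", schema)

def get_syns(query_text):
    # collect WordNet synonyms for each query word, returned as a
    # space-prefixed string so it can be appended to the expanded query
    syns = set()
    for word in query_text.split():
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                if lemma.name().lower() != word.lower():
                    syns.add(lemma.name().replace('_', ' '))
    return ' ' + ' '.join(sorted(syns)) if syns else ''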
Example #2
def search(query_text, expansion=True):
    with ix.searcher() as searcher:
        og = qparser.OrGroup.factory(0.8)
        parser = MultifieldParser(["title", "content"], ix.schema, group=og)
        # parser.add_plugin(qparser.FuzzyTermPlugin())
        if expansion:
            query_expanded = ''
            # get synonyms for the query text
            syns = get_syns(query_text)

            for word in query_text.split():
                boosted_word = word + '^3'
                query_expanded = query_expanded + ' ' + boosted_word
            query_expanded = query_expanded + syns
            print(f'Search for: {query_expanded}')
            query = parser.parse(query_expanded)
        else:
            query = parser.parse(query_text)
            print(f'Search for: {query_text}')

        results = searcher.search(query, limit=20)
        # print(results[0:2])

        runtime = results.runtime
        # transform the results into a list of dictionaries so they can be accessed after the reader is closed
        result_list = []
        for passage in results:
            result_list.append({
                'title': passage['title'],
                'url': passage['url'],
                'content': passage['content']
            })
        return result_list, runtime
Example #3
def search(querytext, request, pagenum=1, maxresults=30, staff=False, scope=None,
           orderby='-creation_date'):

    search_engine = get_search_engine('resource')
    search_result = {}

    if pagenum < 1:
        pagenum = 1

    with search_engine.searcher() as searcher:

        parser = MultifieldParser(search_engine.default_search_fields, searcher.schema)

        user_q = querytext and parser.parse(querytext) or Every()
        user_q, search_kwargs = build_search_kwargs(user_q, request, scope, staff, orderby)
        hits = searcher.search(user_q, limit=(pagenum * maxresults) + 1, **search_kwargs)

        if querytext and hits.is_empty():

            correction_q = parser.parse(querytext)
            corrected = searcher.correct_query(correction_q, querytext)

            if corrected.query != correction_q:
                querytext = corrected.string
                search_result['corrected_q'] = querytext

                user_q, search_kwargs = build_search_kwargs(corrected.query, request, scope, staff, orderby)
                hits = searcher.search(user_q, limit=(pagenum * maxresults), **search_kwargs)

        search_engine.prepare_search_response(search_result, hits, pagenum, maxresults)
        search_result['results'] = add_other_versions(searcher, search_result['results'], request.user, staff)
        add_absolute_urls(search_result['results'], request)

    return search_result
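The fallback branch above relies on Whoosh's built-in spelling suggestions. Stripped of the surrounding framework code, the `correct_query` pattern is roughly the following sketch, assuming an open index `ix` with a "content" field:

# Standalone sketch of the "did you mean" fallback; `ix` is assumed to be
# an open Whoosh index with a "content" field.
from whoosh.qparser import QueryParser

with ix.searcher() as searcher:
    qstring = "pythn serach"
    q = QueryParser("content", ix.schema).parse(qstring)
    corrected = searcher.correct_query(q, qstring)
    if corrected.query != q:
        hits = searcher.search(corrected.query)  # re-run with the corrected query
        print("Did you mean:", corrected.string)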
Example #4
def find(cmd, criteria, reindex=False):
    from whoosh.qparser import MultifieldParser
    if reindex:
        _create_index(cmd.cli_ctx)

    try:
        ix = _get_index(cmd.cli_ctx)
    except ValueError:
        # got a pickle error because the index was written by a different python version
        # recreate the index and proceed
        _create_index(cmd.cli_ctx)
        ix = _get_index(cmd.cli_ctx)

    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=_get_schema())

    if 'OR' in criteria or 'AND' in criteria:
        # looks more advanced, let's trust them to make a great query
        q = qp.parse(" ".join(criteria))
    else:
        # let's help out with some OR's to provide a less restrictive search
        expanded_query = " OR ".join(criteria) + " OR '{}'".format(criteria)
        q = qp.parse(expanded_query)

    with ix.searcher() as searcher:
        from whoosh.highlight import UppercaseFormatter, ContextFragmenter
        results = searcher.search(q)
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)
Example #5
class Searcher(object):
    """
    Assigned to a Model class as ``search_query``, which enables text-querying.
    """
    def __init__(self, model_class, primary, index, session=None):
        self.model_class = model_class
        self.primary = primary
        self.index = index
        self.session = session
        self.searcher = index.searcher()
        fields = set(index.schema._fields.keys()) - set([self.primary])
        self.parser = MultifieldParser(list(fields), index.schema)

    def __call__(self, query, limit=20, pagenum=1, pagelen=20):
        session = self.session
        # When using Flask, get the session from the query attached to the model class.
        if not session:
            session = self.model_class.query.session

        if not pagenum:
            results = self.index.searcher().search(self.parser.parse(query),
                                                   limit=limit)
        else:
            results = self.index.searcher().search_page(
                self.parser.parse(query), pagenum=pagenum,
                pagelen=pagelen)
        keys = [x[self.primary] for x in results]
        primary_column = getattr(self.model_class, self.primary)
        return session.query(self.model_class).filter(primary_column.in_(keys))
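The class is meant to be attached to a model. A hypothetical wiring, assuming a Flask-SQLAlchemy model `Post` and an open Whoosh index `post_index` whose schema stores the primary key in an "id" field:

# Hypothetical usage of the Searcher class above
Post.search_query = Searcher(Post, primary="id", index=post_index)

# __call__ returns a SQLAlchemy query restricted to the matching primary keys
matching = Post.search_query("whoosh full text").all()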
Example #6
def search(searchwords, search_fields, index_file):
    ix = index.open_dir(index_file)  
    # facet = sorting.FieldFacet("comment_num", reverse=True)
    searcher = ix.searcher()  

    qp = MultifieldParser(search_fields, schema=ix.schema)

    results = []
    kws = []
    if './index/farm_products_index' in index_file:
        for kw in cut_for_search(searchwords):
            q = qp.parse(kw)
            res = list(searcher.search(q, limit=50))
            if len(res):
                results.append(res)
                kws.append(kw)

    elif './index/job_index' in index_file:
        jieba.load_userdict('./data/cut_words.txt') 
        t = np.array(list(cut_for_search(searchwords)))
        t_p = map(lambda x: "'"+str(x)+"'",t)
        job, city = get_search_words(','.join(t_p))
        s = ' '.join(job)+' ' if job else ''
        s += ' '.join(city) if city else ''
        # cuted_s = ' '.join(job)
        q = qp.parse(s)
        r = searcher.search(q, terms=True, limit=50)
        res = list(r)
        if len(res):
            results.append(res)
            kws.append('test')
        
    return results, kws
Example #8
def find(criteria, reindex=False):
    """
    Search for Azure CLI commands
    :param str criteria: Query text to search for.
    :param bool reindex: Clear the current index and reindex the command modules.
    :return:
    :rtype: None
    """
    if reindex:
        _create_index()

    ix = _get_index()
    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=schema
    )

    if 'OR' in criteria or 'AND' in criteria:
        # looks more advanced, let's trust them to make a great query
        q = qp.parse(" ".join(criteria))
    else:
        # let's help out with some OR's to provide a less restrictive search
        q = qp.parse(" OR ".join(criteria))

    with ix.searcher() as searcher:
        results = searcher.search(q)
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)
Example #10
class Searcher(object):
    """
  Assigned to a Model class as ``search_query``, which enables text-querying.
  """

    def __init__(self, model_class, primary, index):
        self.model_class = model_class
        self.primary = primary
        self.index = index
        self.searcher = index.searcher()
        fields = set(index.schema._fields.keys()) - set([self.primary])
        self.parser = MultifieldParser(list(fields), index.schema)

    def __call__(self, query, limit=None):
        """API similar to SQLAlchemy's queries.
    """
        session = self.model_class.query.session

        results = self.index.searcher().search(self.parser.parse(query), limit=limit)
        keys = [x[self.primary] for x in results]
        if not keys:
            # Dummy request...
            return session.query(self.model_class).filter("uid = -1")
        else:
            primary_column = getattr(self.model_class, self.primary)
            return session.query(self.model_class).filter(primary_column.in_(keys))

    def search(self, query, limit=None):
        """New API: returns both whoosh records and SA models."""
        # TODO: highly suboptimal

        session = self.model_class.query.session
        hits = self.index.searcher().search(self.parser.parse(query), limit=limit)
        for hit in hits:
            yield (hit, session.query(self.model_class).get(hit[self.primary]))
Example #11
def search_results(ix, search_query, fields):
    qpo = MultifieldParser(fields, schema=ix.schema, group=qparser.OrGroup)
    qpa = MultifieldParser(fields, schema=ix.schema)
    qo = qpo.parse(search_query)
    qa = qpa.parse(search_query)
    data = []
    data_index = 0

    with ix.searcher() as s:
        resultsa = s.search(qa)
        resultso = s.search(qo)
        for hit in resultsa:
            data.append(dict(**hit))
            context = str()
            for field in fields:
                if(len(hit.highlights(field)) > 0 and hit.highlights(field) not in context):
                    context += re.sub(r"(\(.*[^\)])",r'\1)', hit.highlights(field))
            data[data_index]["context"] = context
            data_index += 1

        for hit in resultso:
            found = False
            for hita in resultsa:
                if hit["id"] == hita["id"]:
                    found = True
            if not found:
                data.append(dict(**hit))
                context = str()
                for field in fields:
                    if(len(hit.highlights(field)) > 0 and hit.highlights(field) not in context):
                        context += re.sub(r"(\(.*[^\)])",r'\1)', hit.highlights(field))
                data[data_index]["context"] = context
                data_index += 1
    return data
Example #12
def make_search_service(search_text):
  charmap = charset_table_to_dict(default_charset)
  custom_analyzers = StemmingAnalyzer()

  index_path = join(pathlib.Path(__file__).parent.parent.absolute(), 'indexdir')
  myindex = open_dir(index_path)
  qp = MultifieldParser(["title", "textdata"], schema=myindex.schema, group=AndGroup, fieldboosts={'title': 3.0, 'textdata': 0.8})
  qstring = search_text
  q = qp.parse(qstring)

  results_list = []

  myWeighting= scoring.MultiWeighting(scoring.BM25F(textdata_B=0.5), textdata=scoring.Frequency(), title=scoring.BM25F(title_B=2.0))
  with myindex.searcher(weighting=myWeighting) as s:
    results = s.search(q, limit=30, terms=True)

    # "did you mean" and "results for" suggestions
    corrected = s.correct_query(q, qstring)
    did_you_mean = ''
    result_for = ''
    if corrected.query != q:
      if len(results) < 1:
        results = s.search(qp.parse(corrected.string), limit=30, terms=True)
        result_for = corrected.string
      else:
        did_you_mean = corrected.string


    #query expansion
    keywords = [keyword for keyword, score in results.key_terms("textdata", docs=3, numterms=5)]
    if keywords:  # expand only when key terms were extracted
      query_keyword = qp.parse(reduce(lambda a, b: a + ' ' + b, keywords))
      results_keyword = s.search(query_keyword, limit=30, terms=True)
      results.upgrade_and_extend(results_keyword)

    #sorting
    key_sort = lambda result: result.score
    results = sorted(results, key=key_sort, reverse=True)

    
    for ris in results:
      result = {}
      result['title'] = ris['title']
      result['url'] = ris['url']
      result['id'] = ris['ID']
      result['highlight'] = ris.highlights("textdata")
      results_list.append(result)


    # for computing precision and recall
    id_results = [ris['id'] for ris in results_list[:10]]

    return {
      'search_text': search_text,
      'results': results_list, 
      'did_you_mean': did_you_mean,
      'result_for': result_for,
      'results_ids': id_results
    }
Example #13
    def perform_search(self,
                       schema,
                       field: str,
                       query: str,
                       page: int = 1,
                       pagelen: int = 20):
        """
        Performs a query of the index from the given field and query string
        :param schema:
        :param field: String. Index field
        :param query: String.
        :param page: int. starting page of results to return results from
        :param pagelen: int. number of results to display per page
        :return: list. results
        """
        if field == '':

            # Get All Schema fields
            fields = self.get_fields(schema=schema)

            results_dict = {}

            with self.schemas[schema].searcher() as searcher:
                last_page = False
                while not last_page:
                    parser = MultifieldParser(fields,
                                              self.schemas[schema].schema)
                    search_query = parser.parse(query)
                    results = searcher.search_page(search_query, page, pagelen)

                    if results.total > 0:
                        for doc in range(results.pagelen):
                            results_dict[results.docnum(
                                doc)] = results.results.fields(doc)

                    last_page = results.is_last_page()
                    page += 1

            return results_dict

        else:
            results_dict = {}

            with self.schemas[schema].searcher() as searcher:
                last_page = False
                while not last_page:
                    parser = QueryParser(field, self.schemas[schema].schema)
                    search_query = parser.parse(query)
                    results = searcher.search_page(search_query, page, pagelen)

                    if results.total > 0:
                        for doc in range(results.pagelen):
                            results_dict[results.docnum(
                                doc)] = results.results.fields(doc)

                    last_page = results.is_last_page()
                    page += 1

            return results_dict
Example #14
def search(request):
    indexNewsObject = IndexNews()
    ix = indexNewsObject.ix
    if request.method == 'POST':
        inputQuery = request.POST['inputQuerySearchPage']
        request.session['inputQuery'] = inputQuery
        if inputQuery == '':
            context = {
                'message' : 'لطفا عبارت مورد نظر خود را وارد کنید'  # "Please enter your search term"
            }
            return render(request,'searchPage/searchPage.html',context=context)
        else:
            # queryParser = QueryParser(fieldname='content',schema=ix.schema,group=OrGroup)
            # queryParser = MultifieldParser(['title','content'],schema=ix.schema,group=OrGroup)
            queryParser = MultifieldParser(['title','content','summary'],schema=ix.schema)
            query = queryParser.parse(inputQuery)
            with ix.searcher(weighting=scoring.BM25F()) as searcher:
                results = searcher.search(query,terms=True,limit=None)
                
                #for customize html tag form highlight matched terms 
                htmlFormat = highlight.HtmlFormatter('b')
                results.formatter = htmlFormat
                results.fragmenter.maxchars = 300
                results.fragmenter.surround = 150
                paginator = Paginator(results,15)
                page = request.GET.get('page')
                resultWithPage = paginator.get_page(page)
                context = {
                    'results': resultWithPage,
                    'inputQuery': inputQuery
                }
                return render(request,'searchPage/searchPage.html',context=context)
    else:
        inputQuery = request.session['inputQuery']
        # queryParser = QueryParser(fieldname='content',schema=ix.schema,group=OrGroup)
        queryParser = MultifieldParser(['title','content','summary'],schema=ix.schema)
        query = queryParser.parse(inputQuery)
        with ix.searcher(weighting=scoring.BM25F()) as searcher:
            results = searcher.search(query,terms=True,limit=None)

            #for customize html tag form highlight matched terms 
            htmlFormat = highlight.HtmlFormatter('b')
            results.formatter = htmlFormat
            results.fragmenter.maxchars = 300
            results.fragmenter.surround = 150
            paginator = Paginator(results,15)
            page = request.GET.get('page')
            resultWithPage = paginator.get_page(page)
            context = {
                'results': resultWithPage,
                'inputQuery': inputQuery
            }
            return render(request,'searchPage/searchPage.html',context=context)
Example #15
    def search(self, query_list, fields=None):

        with self.ix.searcher() as searcher:

            query_list2 = []
            for qq in query_list:
                if qq=='AND' or qq=='OR':
                    query_list2.append(qq)
                else:
                    query_list2.append(qq.lower())
            query_string = " ".join(query_list2)

            query = None
            if ":" in query_string:
                # If the user DOES specify a field,
                # setting the fields determines what fields
                # are searched with the free terms (no field)
                fields = ['title', 'content','owner_name','owner_email','github_user']
                query = MultifieldParser(fields, schema=self.ix.schema)
                est = pytz.timezone('America/New_York')
                query.add_plugin(DateParserPlugin(free=True, basedate=est.localize(datetime.utcnow())))
                query.add_plugin(GtLtPlugin())
                try:
                    query = query.parse(query_string)
                except:
                    # Because the DateParser plugin is an idiot
                    query_string2 = re.sub(r':(\w+)',':\'\g<1>\'',query_string)
                    try:
                        query = query.parse(query_string2)
                    except:
                        print("parsing query %s failed"%(query_string))
                        print("parsing query %s also failed"%(query_string2))
                        query = query.parse('')

            else:
                # If the user does not specify a field,
                # these are the fields that are actually searched
                fields = ['url','title', 'content','owner_name','owner_email','github_user']
                query = MultifieldParser(fields, schema=self.ix.schema)
                est = pytz.timezone('America/New_York')
                query.add_plugin(DateParserPlugin(free=True, basedate=est.localize(datetime.utcnow())))
                query.add_plugin(GtLtPlugin())
                try:
                    query = query.parse(query_string)
                except:
                    print("parsing query %s failed"%(query_string))
                    query = query.parse('')
            parsed_query = "%s" % query
            print("query: %s" % parsed_query)
            results = searcher.search(query, terms=False, scored=True, groupedby="kind")
            search_result = self.create_search_result(results)

        return parsed_query, search_result
Example #17
 def search(self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit, tool_enable_ngram_search, tool_ngram_minsize, tool_ngram_maxsize):
     """
     Perform search on the in-memory index. Weight in the given boosts.
     """
     # Change field boosts for searcher
     searcher = self.index.searcher(
         weighting=BM25F(
             field_B={'name_B': float(tool_name_boost),
                      'section_B': float(tool_section_boost),
                      'description_B': float(tool_description_boost),
                      'labels_B': float(tool_label_boost),
                      'stub_B': float(tool_stub_boost),
                      'help_B': float(tool_help_boost)}
         )
     )
     # Set query to search name, description, section, help, and labels.
     parser = MultifieldParser(['name', 'description', 'section', 'help', 'labels', 'stub'], schema=self.schema)
     # Hyphens are wildcards in Whoosh causing bad things
     if q.find('-') != -1:
         q = (' ').join([token.text for token in self.rex(to_unicode(q))])
     # Perform tool search with ngrams if set to true in the config file
     if (tool_enable_ngram_search is True or tool_enable_ngram_search == "True"):
         hits_with_score = {}
         token_analyzer = StandardAnalyzer() | analysis.NgramFilter(minsize=int(tool_ngram_minsize), maxsize=int(tool_ngram_maxsize))
         ngrams = [token.text for token in token_analyzer(q)]
         for query in ngrams:
             # Get the tool list with respective scores for each qgram
             curr_hits = searcher.search(parser.parse('*' + query + '*'), limit=float(tool_search_limit))
             for i, curr_hit in enumerate(curr_hits):
                 is_present = False
                 for prev_hit in hits_with_score:
                     # Check if the tool appears again for the next qgram search
                     if curr_hit['id'] == prev_hit:
                         is_present = True
                         # Add the current score with the previous one if the
                         # tool appears again for the next qgram
                         hits_with_score[prev_hit] = curr_hits.score(i) + hits_with_score[prev_hit]
                 # Add the tool if not present to the collection with its score
                 if not is_present:
                     hits_with_score[curr_hit['id']] = curr_hits.score(i)
         # Sort the results based on aggregated BM25 score in decreasing order of scores
         hits_with_score = sorted(hits_with_score.items(), key=lambda x: x[1], reverse=True)
         # Return the tool ids
         return [item[0] for item in hits_with_score[0:int(tool_search_limit)]]
     else:
         # Perform the search
         hits = searcher.search(parser.parse('*' + q + '*'), limit=float(tool_search_limit))
         return [hit['id'] for hit in hits]
Example #18
def generic(idx, qs=None, q=None, limit=5, parser=None, page=1):
    if qs is q is None:
        raise ValueError('cannot have a null querystring and query')

    if parser is None:
        parser = MultifieldParser(
                ['title', 'keywords', 'summary', 'content', 'author'], idx.schema, group=OrGroup)

    # add better date parsing support
    parser.add_plugin(DateParserPlugin())
    parser.remove_plugin_class(WildcardPlugin)

    with idx.searcher() as search:
        # generate the Query object
        if qs:
            query = parser.parse(qs)
        else:
            query = q

        facet = MultiFacet()
        facet.add_score()
        facet.add_field('modified', reverse=True)
        facet.add_field('title')

        results = search.search_page(query, pagenum=page, sortedby=facet, pagelen=limit)
        res = clean_results(idx, results, query)

        # pagination attributes on `search_page` method
        res.page_number = results.pagenum   # current page number
        res.page_total = results.pagecount  # total pages in results
        res.offset = results.offset         # first result of current page
        res.pagelen = results.pagelen       # the number of max results per page

    return res
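A hypothetical call, assuming `idx` is an open Whoosh index containing the fields listed above:

# Sketch: fetch page 2 of the results, five hits per page
res = generic(idx, qs="full text search", limit=5, page=2)
print(res.page_number, "/", res.page_total, "offset:", res.offset)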
Example #19
def search_documents(filter):
    results = None

    # Check for existing index
    dir_path = os.path.join(DATA_DIR, 'index')

    if not os.path.exists(dir_path) or not Index.exists_in(dir_path):
        return None

    index = Index.open_dir(dir_path)

    if filter.startswith('tags:'):
        fields = ['tags']
        filter = filter[5:]
    else:
        fields = ['path', 'content']

    parser = MultifieldParser(fields, schema=index.schema)
    search_query = parser.parse(unicode(filter))

    # Try documents search
    try:
        searcher = index.searcher(closereader=False)

        return searcher.search(search_query,
            collapse=[sorting.FieldFacet('path'), sorting.FieldFacet('content')],
            collapse_order=sorting.FieldFacet('revision', reverse=True),
            sortedby=[sorting.FieldFacet('path'), sorting.FieldFacet('date', reverse=True)]
        )
    finally:
        searcher.close()

Example #20
def search(page):
    search = request.args['q']
    storage = FileStorage(conf.INDEX_DIR)
    index = storage.open_index(indexname=conf.INDEX_NAME)
    qp = MultifieldParser(['title', 'text', 'tags'], schema=index.schema)
    q = qp.parse(search)
    results = []
    with index.searcher() as searcher:
        results = searcher.search_page(q, page, pagelen=conf.PAGE_SIZE)
        # Get real posts
        post_ids = ",".join(["'%s'" % p['post_id'] for p in results])
        if post_ids:
            ghost = get_melmelboo_connection()
            with ghost.cursor() as ghost_cur:
                ghost_cur.execute("SELECT title, feature_image, html, slug "
                                  "FROM posts WHERE id IN (%s)" % post_ids)
                posts = [{
                    'type': "post",
                    'title': i[0],
                    'image': i[1],
                    'excerpt': excerpt(i[2]),
                    'url': "/blog/" + i[3]
                } for i in ghost_cur.fetchall()]
            ghost.close()
        else:
            posts = []
    return render_template("search.html", posts=posts, search=search)
Example #21
def search(keyword):
	logging.debug('searching for: %s ... ',keyword.split())
	"""Search deal from indexed file"""
	ix = open_dir(r'C:\crawlData\indexed')
	"""Open indexed file"""
	qp =  MultifieldParser(["title", "link"], schema=ix.schema)
	# qp.remove_plugin_class(PhrasePlugin)
	# qp.add_plugin(SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?"))	
	# qp.add_plugin(PhrasePlugin(expr='"(?P<text>.*?)"(~(?P<slop>[1-9][0-9]*))?'))
	q = qp.parse(keyword)
	with ix.searcher() as s:
		results = s.search(q, limit=10)
		items = '['	
		for hit in results:
			items = (items + '{"title":"' 
					+ string.replace(hit['title'],'\n',' ') 
                                        + '","link":"' + hit['link']
                                        + '","pSale":"' + string.replace(hit['pSale'],'\n',' ')
                                        + '","pRegular":"' + string.replace(hit['pRegular'],'\n',' ')
                                        + '","img":"' + hit['img'] + '"},')

			# print('matched term: ',hit.matched_terms())
		items = items[0:len(items)-1] + ']'
		# print(items)
	return items
Example #22
def parse(text, schema=SCHEMA):
    """
    parse(text[, schema=SCHEMA])
    
    Analisa e trata o texto em ``text`` de acordo com o ``schema``
    do índice de documentos.
     
    .. code-block:: python
    
        >>> from storyline.engine.query import parse
        >>> from storyline.engine.schema import get_schema
        >>>
        >>> SCHEMA = get_schema()
        >>> parse("Mestre", SCHEMA)
        Or([Term('title', u'mestr'), Term('content', u'mestr')])
    
    :param text: Consulta feita pelo usuário.
    :type text: str
    :param schema: Schema do índice de documentos.
    :type schema: Schema
    :returns: Query com termos e operadores.
    """
    
    try:
        from whoosh.qparser import MultifieldParser
    except ImportError:
        print "Ocorreu um erro na importação do módulo whoosh.qparser."
        
    qp = MultifieldParser(["title", "content"], schema, None)
    
    return qp.parse(text)
Example #24
def search_whoosh_index_headline(query, paths):
    if not paths:
        return []
    ix = get_whoosh_index()
    parser = MultifieldParser(['content', 'title', 'abstract'], ix.schema)
    q = parser.parse(query)

    allow_q = Or([Term('path', path) for path in paths])

    res = []

    with ix.searcher() as searcher:
        results = searcher.search(q,
                                  filter=allow_q,
                                  limit=len(paths),
                                  terms=True)
        for hit in results:
            res.append({
                # 'title': hit['title'],
                'short_url': hit['path'],
                'highlights': u' [...] '.join(
                    filter(None, [
                        hit.highlights("title", top=5),
                        hit.highlights("abstract", top=5),
                        hit.highlights("content", top=5)
                    ]))
            })

    return res
Example #25
def search(querystring, language_code):
    ix = LanguageIndex(settings.WHOOSH_INDEX_PATH, language_code, _get_schema()).load()
    # parser = QueryParser('content', ix.schema)
    parser = MultifieldParser(['title', 'keywords', 'content'], ix.schema)  # fieldboosts={'title':5, 'keywords':4, 'content':1})
    parser.remove_plugin_class(WildcardPlugin)  # remove unused feature for better performance
    query = parser.parse(querystring)
    # print(parser, query, querystring)

    result = {
        'results': [],
    }

    with ix.searcher() as searcher:
        results = searcher.search(query)
        # print(results)
        # import pdb; pdb.set_trace()

        # collect results
        for hit in results:
            my_hit = {}
            # my_hit['pos'] = hit.pos
            # my_hit['rank'] = hit.rank
            # my_hit['docnum'] = hit.docnum
            my_hit['score'] = hit.score
            my_hit['object'] = Article.objects.get(code=hit.fields()['code'])
            #.exclude(published=False).exclude(release_date__gte=datetime.today())
            # my_hit['object']['is_visible'] = True
            result['results'].append(my_hit)
            # print(hit.pos, hit.rank, hit.docnum, hit.score, hit)

    return result
Example #26
class Index:
	def __init__(self, path='~/Music/iTunes/iTunes Music Library.xml', folder='~/Library/Application Support/Share my tunes'):
		self.path = os.path.expanduser(path)
		self.schema = Schema(
			trackId = ID(stored=True),
			name=TEXT(stored=True),
			artist=TEXT(stored=True),
			album=TEXT(stored=True),
			genre=KEYWORD(stored=True),
			location=STORED,
			trackNumber=STORED,
			bitRate=ID(stored=True),
			artwork=KEYWORD(stored=True)
			)
		self.parser = MultifieldParser(["name", "album", "artist"], schema = self.schema)
		self.folder = "%s/index" % os.path.expanduser(folder)
		self.empty = not whoosh.index.exists_in(self.folder)
		self.ix = None
	def index(self):
		if self.empty:
			if not os.path.exists(self.folder):
				os.makedirs(self.folder)
			st = FileStorage(self.folder)
			ix = st.create_index(self.schema)
			w = ix.writer()
			w.add_document(name = u"beuha")
			pipe = file.ID3Filter()
			#[TODO] using itunes info for artwork?
			cpt = 0
			for track in pipe(ItunesParser(self.path)):
				if track['album'] != None : 
					album = track['album'].encode('ascii', 'ignore')
				else:
					album = ""
				#print track['artwork'], "[%s]" % album, track['name'].encode('ascii', 'ignore')
				if cpt % 20 == 0:
					print "\n%i " %cpt,
				print '#',
				#print track['album'], track['name']
				w.add_document(
					trackId = track['trackId'], name=track['name']
					,artist=track['artist'], album=track['album'],
					genre=track['genre'], location=track['location'],
					artwork=bool(track['artwork']),
					trackNumber=track['trackNumber'], bitRate=track['bitRate']
				)
				#if cpt % 100 == 1:
				#	w.commit()
				cpt += 1
			print "\n\n%i tracks indexed" % cpt
			w.commit()
			ix.optimize()
			ix.close()
		else :
			print "already indexed"
	def query(self, query):
		if self.ix == None:
			self.ix = FileStorage(self.folder).open_index()
		q = self.parser.parse(query)
		return self.ix.searcher().search(q, sortedby=("album", "name"), limit=None)
Example #27
def getdocs():
	params = dict(request.args.items())
	search_terms = params['NPS'].split(quails.DELIMITER)
	try:
		ix = index.open_dir("indexQ")
		
	except:
		return jsonify(failure="Index not found. Ensure that the index exists and try again.")

	qp = MultifieldParser(["title","body"], schema=ix.schema)

	queries = []
	for term in search_terms:
		queries.append(qp.parse(term))

	docs = OrderedDict()
	hit_list = []
	with ix.searcher() as searcher:
		
		for query in queries:
			
			results=searcher.search(query)	
	
			for result in results: 
				hit_list.append((str(query),result['title']))

	return jsonify(results=hit_list)
Example #28
class FTSSearcher(object):
    """用于检索
    """

    def __init__(self, storage=default_storage):
        self._fragmenter_maxchars = 70
        self._fragmenter_surround = 70
        self._formatter = MarkFormatter()
        schema = Schema(news_id=ID(unique=True, stored=True),
                        title=TEXT(field_boost=2.0, analyzer=analyzer),
                        content=TEXT(analyzer=analyzer))
        self._ix = storage.open_index(schema=schema)
        self._parser = MultifieldParser(["title", "content"], self._ix.schema)
        self._searcher = self._ix.searcher()

    def search(self, query_string, limit=10):
        """搜索文件
        """
        # refresh searcher
        query_string = util.str2unicode(query_string)

        query = self._parser.parse(query_string)
        search_results = self._searcher.search(query, limit=limit)

        # set the highlight attributes
        search_results.formatter = self._formatter
        search_results.fragmenter.maxchars = self._fragmenter_maxchars
        search_results.fragmenter.surround = self._fragmenter_surround

        return search_results

    def close(self):
        self._searcher.close()
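A hypothetical use of the class, assuming `default_storage` points at an existing index (note that only `news_id` is stored, so it is the only field retrievable from a hit):

# Sketch: search, print the stored ids, then release the searcher
fts = FTSSearcher()
for hit in fts.search(u"keyword"):
    print(hit["news_id"])
fts.close()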
Example #29
def keywords(request):
    query = request.GET.get('q', '')
    if not query:
        return render(request, 'search/keywords.html', {'page_name': 'search.keywords'})

    qtext = get_tokenized_query(query)
    print(qtext)

    idx_dir = os.path.join(settings.BASE_DIR, 'search/lagou_idx')
    ix = open_dir(idx_dir)
    searcher = ix.searcher()

    parser = MultifieldParser(["name", "com_name", 'city'], schema=ix.schema)
    q = parser.parse(qtext)

    plen = 100
    results = searcher.search(q, limit=plen)

    total = len(results)
    got = results.scored_length()
    numterms = 100
    if got < 10:
        numterms = 10
    elif got < 100:
        numterms = 50

    keywords = [(kw, score) for kw, score in results.key_terms("desc", docs=got, numterms=numterms)]

    return render(request, 'search/keywords.html',
                  {'page_name': 'search.keywords',
                   'query': query,
                   'total': total,
                   'got': got,
                   'keywords': keywords,
                  })
Example #30
def search_whoosh_index(query, offset=0, limit=10, *args, **kwargs):
    ix = get_whoosh_index()
    parser = MultifieldParser(
        ['content', 'authors', 'tags', 'title', 'abstract'], ix.schema)
    # user query
    q = parser.parse(query)

    if not query:
        q = Every()
        print 'arch'

    allow_q = And([Term(key, value) for key, value in kwargs.iteritems()])
    # parse remaining args
    res = []
    count = 0
    offset = int(offset)
    limit = int(limit)
    right = offset + limit
    # restrict_q = Or([Term("path", u'%s' % d.id) for d in qs])
    #print 'query', q, allow_q, kwargs
    with ix.searcher() as searcher:
        # From WHOOSH documentation:
        # > Currently, searching for page 100 with pagelen of 10 takes the same amount of time as using Searcher.search()
        #   to find the first 1000 results
        results = searcher.search(q, filter=allow_q, limit=right, terms=True)
        count = len(results)

        for hit in list(results)[offset:]:
            res.append({
                # 'title': hit['title'],
                'short_url': hit['path'],
                'highlights': hit.highlights("content", top=5)
            })
    # @todo filter by empty highlight strings
    return {'results': res, 'count': count}
Example #31
def search(q, limit=None):
#    q = str(q)
    ix = open_dir(DIRECTORY, NAME)
    with ix.searcher() as searcher:
        qp = MultifieldParser(fieldnames=['title',
                                          'author',
                                          'tags',
                                          'notes',
                                          'text',
                                          'source',
#                                          'cached',
                                          'year'],
                              fieldboosts={'title':  7,
                                           'year':   6,
                                           'author': 10,
                                           'tags':   4,
                                           'notes':  2,
                                           'text':   1},
                              schema=ix.schema)

        # Whoosh chokes on queries with stop words, so remove them.
        q = remove_stopwords(q)

        q = qp.parse(q)
        for hit in searcher.search(q, limit=limit):
            yield hit
Example #32
def search_commodity():
    from shop import app
    ix = open_dir(app.config.get("INDEX_DIR"))
    searcher = ix.searcher()
    mparser = MultifieldParser(["content", "title"], schema=ix.schema)

    query_raw = request.args.get('q', '')
    if query_raw:
        query = mparser.parse(unicode(query_raw.lower()))
        results = searcher.search(query)

        result_id = []
        for result in results:
            result_id.append(int(result['id']))

        result_id = list(set(result_id))
        wq = None
        for rid in result_id:
            if not wq:
                wq = Q(id=rid)
            else:
                wq |= Q(id=rid)
        if wq:
            coms = Commodity.select().where(wq)
        else:
            coms = []
    else:
        coms = Commodity.select()
    category = int(request.args.get('c', '0'))
    if category and category != 1:
        coms = [c for c in coms if c.is_category(category)]
    return render_template('core/com_list.html', commodities=coms)
Example #33
def build_keywords_query(keywords):
    """
    Build parsers for a query.

    :param MultiDict keywords: The search texts keyed by scope key. If empty,
        the query will match every documents.
    """
    queries = []
    if keywords:
        composer = current_app.config['KERKO_COMPOSER']
        text_plugins = [PhrasePlugin(), GroupPlugin(), OperatorsPlugin()]
        for key, value in keywords.items(multi=True):
            fields = [
                spec.key for spec in composer.fields.values()
                if key in spec.scopes
            ]
            if not fields:
                raise KeyError  # No known field for that scope key.
            parser = MultifieldParser(fields,
                                      schema=composer.schema,
                                      plugins=text_plugins)
            queries.append(parser.parse(value))
    else:
        queries.append(Every())
    return And(queries)
Example #34
def search(index_name, text, scope=None, limit=20):
    index_dir = get_index_path(index_name)
    ix = open_dir(index_dir)

    results = None
    out = []
    with ix.searcher() as searcher:
        parser = MultifieldParser(["title", "content"], ix.schema)
        parser.remove_plugin_class(FieldsPlugin)
        parser.remove_plugin_class(WildcardPlugin)
        query = parser.parse(text)

        filter_scoped = None
        if scope:
            filter_scoped = Prefix("path", scope)
        results = searcher.search(query, limit=limit, filter=filter_scoped)

        for r in results:
            title_highlights = r.highlights("title")
            content_highlights = r.highlights("content")
            out.append(
                frappe._dict(
                    title=r["title"],
                    path=r["path"],
                    title_highlights=title_highlights,
                    content_highlights=content_highlights,
                ))

    return out
Example #35
def sample(entry):
    try:
        qp = MultifieldParser(["brewery", "beer"], schema=ix.schema)
        q = qp.parse(entry)
        #q = qp.parse(u'sculpin')
        with ix.searcher() as s:
            results = s.search(q)
            #This is to limit search hits to only hits with beer, brewery, and id
            results = [
                results[r] for r in range(len(results)) if len(results[r]) >= 3
            ]
            liner = []
            ids = []
            for x in range(len(results)):
                be = results[x]['beer']
                br = results[x]['brewery']
                bi = results[x]['beer_id']
                new_line = str(be + ', ' + br)
                ids.append(bi)
                liner.append(new_line)

            lines = []
            for d in range(len(results)):
                ent = {'label': liner[d], 'value': ids[d]}
                lines.append(ent)
            #return '{}'.format(lines)
            return lines
    except Exception as rep:
        return '{}'.format(rep)
Example #36
    def page(self, page, limit):
        with self.engine.index.searcher() as searcher:
            parser = MultifieldParser(
                self.engine.search_fields,
                schema = self.engine.index.schema,
            )
            parser.add_plugin(GtLtPlugin())
            parser.add_plugin(PhrasePlugin())
            parser.add_plugin(FieldsPlugin())
            #parser.remove_plugin_class(WildcardPlugin)
            #parser.add_plugin(WildcardPlugin())
            parser.add_plugin(PrefixPlugin())

            whoosh_query = parser.parse(self.query.toString(self.engine))
            #print "============" + str(whoosh_query)
            results = searcher.search_page(whoosh_query, page, limit, sortedby = self.order)
            self.rows = results.total

            _results = []

            doc_class = self.engine.database.document

            for result in results:
                doc = doc_class(data = {field: result.get(field, None) for field in self.engine.stored_fields}, restore = True)
                _results.append(doc)

        return _results
Example #37
    def search(self, text, scope=None, limit=20):
        """Search from the current index

		Args:
			text (str): String to search for
			scope (str, optional): Scope to limit the search. Defaults to None.
			limit (int, optional): Limit number of search results. Defaults to 20.

		Returns:
			[List(_dict)]: Search results
		"""
        ix = self.get_index()

        results = None
        out = []

        with ix.searcher() as searcher:
            parser = MultifieldParser(["title", "content"], ix.schema)
            parser.remove_plugin_class(FieldsPlugin)
            parser.remove_plugin_class(WildcardPlugin)
            query = parser.parse(text)

            filter_scoped = None
            if scope:
                filter_scoped = Prefix(self.id, scope)
            results = searcher.search(query, limit=limit, filter=filter_scoped)

            for r in results:
                out.append(self.parse_result(r))

        return out
Example #38
def MRR(queries, ground, score, ix):

    MRR_table = {}

    fields = ix.schema.names()  # opening the schema fields
    fields.remove('id')  # we are not interested in the 'id' field for the MRR evaluation

    RR_sum = 0

    for query in queries:
        RR = 0
        qp = MultifieldParser(fields, ix.schema)
        parsed_query = qp.parse(queries[query])

        searcher = ix.searcher(weighting=score)
        results = searcher.search(parsed_query, limit=None)

        for doc in results:
            if int(doc['id']) in ground[query]:
                # we just stop the loop at the first relevant result
                K = doc.rank + 1  # and we evaluate the single RR
                RR = 1 / K
                break

        RR_sum += RR  # summing the RR for the single query

    MRR = RR_sum / len(ground)  # evaluating the average over the available queries
    return MRR
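For reference, the quantity this function computes is the standard Mean Reciprocal Rank, which in LaTeX reads:

\mathrm{MRR} = \frac{1}{|Q|} \sum_{i=1}^{|Q|} \frac{1}{\mathrm{rank}_i}

where rank_i is the rank of the first relevant document retrieved for query i, and the reciprocal rank is taken as 0 when no relevant document is retrieved.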
Example #39
    def search(self,
               query_str,
               limit=30,
               html=True,
               description=True,
               comments=False,
               search_comments=True,
               highlight=True):
        index = self._get_index()
        searcher = index.searcher()
        fields = ["summary", "description"]
        if search_comments:
            fields.append("comments_str")
        qp = MultifieldParser(fields, schema=index.schema)
        query = qp.parse(query_str)

        results = searcher.search(query, limit=limit)
        results.formatter = AnsiColorFormatter()
        results.fragmenter = WholeFragmenter()
        self.report(results)

        for hit in results:
            ticket = Ticket.get_by_id(hit["key"])
            text = Issue(ticket.data).to_string(with_description=description,
                                                with_comments=comments)
            if not html:
                text = self._html_to_text.handle(text)
            text = text.strip()
            if highlight and description:
                highlighted = hit.highlights("description", text=text)
                if highlighted:
                    text = highlighted
            self.report(text)
            if description or comments:
                self.report("-" * 80)
Example #40
 def search(self, q, tool_name_boost, tool_section_boost,
            tool_description_boost, tool_label_boost, tool_stub_boost,
            tool_help_boost, tool_search_limit):
     """
     Perform search on the in-memory index. Weight in the given boosts.
     """
     # Change field boosts for searcher
     searcher = self.index.searcher(weighting=BM25F(
         field_B={
             'name_B': float(tool_name_boost),
             'section_B': float(tool_section_boost),
             'description_B': float(tool_description_boost),
             'labels_B': float(tool_label_boost),
             'stub_B': float(tool_stub_boost),
             'help_B': float(tool_help_boost)
         }))
     # Set query to search name, description, section, help, and labels.
     parser = MultifieldParser(
         ['name', 'description', 'section', 'help', 'labels', 'stub'],
         schema=self.schema)
     # Hyphens are wildcards in Whoosh causing bad things
     if q.find('-') != -1:
         q = (' ').join([token.text for token in self.rex(to_unicode(q))])
     # Perform the search
     hits = searcher.search(parser.parse('*' + q + '*'),
                            limit=float(tool_search_limit))
     return [hit['id'] for hit in hits]
Example #41
def job_details(request, pk='10'):
    results = []
    ix = open_dir(settings.WHOOSH_INDEX)
    parser = MultifieldParser([
        "jobtitle", "company", "city", "state", "country", "source", "date",
        "JD", "url", "latitude", "longitude", "relative_time"
    ], ix.schema)
    try:
        query = parser.parse("job_id:" + pk)
        print(query)

    except:
        # don't show the user weird errors only because we don't
        # understand the query.
        # parser.parse("") would return None
        query = None
    if query is not None:
        searcher = ix.searcher()
        results = searcher.search(query)
    print(len(results))
    for result in results:
        print(result)
    searchQuery = request.session["searchQuery"]
    return render(request, 'job_details.html', {
        'query': pk,
        'results': results,
        'searchQuery': searchQuery
    })
Example #42
	def search(self, query):
		""" general search function for a query string """
		
		hit_docs = []
		index_dir = "D:/bjstinfo_index"		# hard-coded path; this should come from a variable or a config file
		if not os.path.exists(index_dir):
			print "Error: indexer doesn't exist!"
			sys.exit(1)
		ix = index.open_dir(index_dir)
		
		# For keywords query, we search multi-fields of documents as:
		# Title, Keywords, Abstract. give the query-time fieldsboost:
		# {"Title": 1.2, "Keywords": 1.1, "Abstract": 1.0}
		
		query_fields = ['Title', 'Keywords', 'Abstract']
		field_boosts = {'Title':1.2, 'Keywords':1.1, 'Abstract':1.0}
		qp = MultifieldParser(query_fields, schema=ix.schema, fieldboosts=field_boosts)
		q = qp.parse(query)
		with ix.searcher() as s:
			results = s.search(q, limit=50, terms=True)
#			my_cf = ContextFragmenter(maxchars=100, surround=30)	#custome fragmenter.
#			results.fragmenter = my_cf
#			my_score = StandarDeviationScorer(my_cf)	#custome scorer.
#			results.scorer = my_score
#			results.formatter = HtmlFormatter()
			for hit in results:
#				print hit.fields()
				hit_docs.append(hit.fields())
				
				# why can't the highlight function be made to work here?
#				print hit.highlights('Abstract', top=20)
		
		return hit_docs
Example #43
def indexquery(name, www):
    if name is None:
        return []
    #print("Name: %s" % name)
    ix = index.open_dir("/var/www/restnames/index")
    qp = MultifieldParser(
        ["commonname", "database", "tags", "name", "name_part", "country",
         "project", "url"],
        schema=ix.schema,
        termclass=FuzzyTerm)
    qp.add_plugin(qparser.FuzzyTermPlugin())
    q = qp.parse(name)
    #q = Every()
    tempvar = []
    with ix.searcher() as searcher:
        results = searcher.search(q, limit=None)
        for hit in results:
            tempvar.append({
                'name': hit["name"],
                'commonname': hit["commonname"],
                'url': hit["url"]
            })
    if not www:
        return tempvar
    else:
        response = Response(
            render_template("searchresults.html", resultlist=tempvar))
        response.headers['content-type'] = 'text/html'
        return response
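A note on the two fuzzy mechanisms combined above: termclass=FuzzyTerm makes every parsed term fuzzy by default, while FuzzyTermPlugin adds explicit tilde syntax for per-term control. An illustrative query against the same parser:

# With FuzzyTermPlugin enabled, the query syntax is:
#   "linnaeus~"     edit distance 1 (the default)
#   "linnaeus~2"    edit distance 2
#   "linnaeus~2/3"  edit distance 2, first 3 characters must match
q = qp.parse("linnaeus~2/3")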
Beispiel #44
0
    def search(self, queryEntered, pageNum):
        id = list()
        Name = list()
        Genre = list()
        Yearofrelease = list()
        Description = list()
        Rating = list()
        ImdbUrl = list()
        Votes = list()

        with self.indexer.searcher() as searcher:
            # Fields to be parsed: Name, Description, Genre, Yearofrelease,
            # Rating, ImdbUrl, and Votes.
            parser = MultifieldParser([
                'Name', 'Description', 'Genre', 'Yearofrelease', 'Rating',
                'ImdbUrl', 'Votes'
            ],
                                      schema=self.indexer.schema,
                                      termclass=Variations)
            query = parser.parse(queryEntered)
            results = searcher.search_page(query, pagenum=pageNum)
            for i, x in enumerate(results):
                id.append(i)
                Name.append(x['Name'])
                Description.append(x['Description'])
                Genre.append(x['Genre'])
                Yearofrelease.append(x['Yearofrelease'])
                Rating.append(x['Rating'])
                ImdbUrl.append(x['ImdbUrl'])
                Votes.append(x['Votes'])

        return id, Name, Description, Genre, Yearofrelease, Rating, ImdbUrl, Votes
Beispiel #45
0
 def live_search(self, query):
     """live search on ngram field"""
     with self.ix.\
             searcher(weighting=scoring.BM25F(title_B=2)) as searcher:
         qp = MultifieldParser(self.live_search_field + self.search_field,
                               schema=self.ix.schema)
         q = qp.parse(query)
         results = searcher.search(q, limit=25).copy()
         res = {'estimated_length': results.estimated_length(),
                'scored_length': results.scored_length(),
                'runtime': results.runtime,
                'list': []}
         for i, r in enumerate(results):
             if 'id' in r and 'space' in r:
                 url = url_for('document.view', space=r['space'],
                               doc_id=r['id'])
             else:
                 url = None
             res['list'].append({'id': r.get('id', ''),
                                 'space': r.get('space', ''),
                                 'title': r.get('title', ''),
                                 'rank': r.rank,
                                 'url': url,
                                 'score': results.score(i)})
     return res
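The docstring's "ngram field" is what makes this usable as type-ahead search: n-gram fields index fragments of each word, so partial input matches without wildcards. A minimal sketch of such a schema (field names here are assumptions, not taken from the snippet):

from whoosh.fields import ID, TEXT, NGRAMWORDS, Schema

# NGRAMWORDS indexes 2-4 character grams of each word, so a user typing
# 'sear' can already match documents containing 'search'.
schema = Schema(
    id=ID(stored=True),
    title=TEXT(stored=True),
    title_ngram=NGRAMWORDS(minsize=2, maxsize=4, stored=False),
)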
Beispiel #46
0
    def __call__(self, query, limit=None, fields=None, or_=False):
        if fields is None:
            fields = self._all_fields

        group = OrGroup if or_ else AndGroup
        parser = MultifieldParser(fields, self._index.schema, group=group)
        return self._index.searcher().search(parser.parse(query), limit=limit)
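A short usage sketch of the or_ switch, assuming engine is an instance of this class: AndGroup requires every term to match, while OrGroup ranks documents that match any term.

# All terms must appear in the searched fields:
hits_all = engine('whoosh parser', limit=10)

# Any term may match; documents matching more terms rank higher:
hits_any = engine('whoosh parser', limit=10, or_=True)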
Beispiel #47
0
    def search(self, term):
        qp = MultifieldParser(['name', 'tags'], schema=SCHEMA)
        q = qp.parse(term)

        searcher = self.ix.searcher()
        results = searcher.search(q, limit=None)
        return results
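One caveat with returning results directly: the searcher here is never closed, and Whoosh results stop working once their searcher is closed. A sketch of the same method that copies the stored fields out first:

    def search(self, term):
        qp = MultifieldParser(['name', 'tags'], schema=SCHEMA)
        q = qp.parse(term)

        # Materialize the stored fields while the searcher is still open.
        with self.ix.searcher() as searcher:
            return [hit.fields() for hit in searcher.search(q, limit=None)]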
Beispiel #48
0
def getdocs():
    params = dict(request.args.items())
    search_terms = params['NPS'].split(quails.DELIMITER)
    try:
        ix = index.open_dir("indexQ")
    except Exception:
        return jsonify(
            failure="Index not found. Ensure that the index exists and try again.")

    qp = MultifieldParser(["title", "body"], schema=ix.schema)

    queries = []
    for term in search_terms:
        queries.append(qp.parse(term))

    docs = OrderedDict()
    hit_list = []
    with ix.searcher() as searcher:

        for query in queries:

            results = searcher.search(query)

            for result in results:
                hit_list.append((str(query), result['title']))

    return jsonify(results=hit_list)
Beispiel #49
0
def build_keywords_query(keywords):
    """
    Build parsers for a query.

    :param MultiDict keywords: The search texts keyed by scope key. If empty,
        the query will match every document.
    """
    queries = []
    if keywords:
        composer = current_app.config['KERKO_COMPOSER']
        text_plugins = [
            plugins.PhrasePlugin(),
            plugins.GroupPlugin(),
            plugins.OperatorsPlugin(
                And=r"(?<=\s)" + re.escape(gettext("AND")) + r"(?=\s)",
                Or=r"(?<=\s)" + re.escape(gettext("OR")) + r"(?=\s)",
                Not=r"(^|(?<=(\s|[()])))" + re.escape(gettext("NOT")) + r"(?=\s)",
                AndNot=None,
                AndMaybe=None,
                Require=None
            ),
            plugins.BoostPlugin(),
        ]
        for key, value in keywords.items(multi=True):
            fields = [spec.key for spec in composer.fields.values() if key in spec.scopes]
            if not fields:
                raise KeyError(key)  # No known field for that scope key.
            parser = MultifieldParser(
                fields, schema=composer.schema, plugins=text_plugins
            )
            queries.append(parser.parse(value))
    else:
        queries.append(Every())
    return And(queries)
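A hedged usage sketch, assuming a werkzeug MultiDict and a Kerko composer with a 'text' scope configured (both are assumptions about the surrounding setup):

from werkzeug.datastructures import MultiDict

# One phrase search in the 'text' scope; AND/OR/NOT are matched in the
# active locale thanks to the gettext-based OperatorsPlugin above.
q = build_keywords_query(MultiDict([('text', '"open access" AND repository')]))

# An empty MultiDict yields And([Every()]), which matches every document.
q_all = build_keywords_query(MultiDict())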
Beispiel #50
0
    def search(self, term):
        if not self.index:
            self.load_index()

        parser = MultifieldParser(("body", "title", "tags"), schema=self.schema)
        query = parser.parse(term)
        results = self.searcher.search(query, limit=100)  # , sortedby="date", reverse=True)
        return results
Beispiel #51
0
    def search(self, query, *args, **kwargs):
        parser = MultifieldParser(fieldnames=('content', 'title', 'headings', 'url'),
                                  schema=self.ix.schema,
                                  fieldboosts={'content': 1, 'title': 2, 'headings': 3, 'url': 1})
        qry = parser.parse(query)
        searcher = self.ix.searcher()
        return searcher.search_page(qry, *args, **kwargs)
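The extra *args/**kwargs are forwarded straight to Whoosh's search_page; a small usage sketch (the instance name docs is an assumption):

# Page 2 of the results, 20 hits per page; page numbers are 1-based.
page = docs.search('whoosh parser', 2, pagelen=20)
for hit in page:
    print(hit['title'])
print(page.pagecount, page.total)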
Beispiel #52
0
    def search(self, query_string, index, parser=None, **kwargs):
        index = base._resolve_index(index)
        if parser is None:
            parser = MultifieldParser(fieldnames=index.get_searchable_fieldnames(),
                                      schema=index.get_schema())

        query = parser.parse(query_string)
        return self._search(query, index, **kwargs)
Beispiel #53
0
    def search( self, trans, search_term, page, page_size, boosts ):
        """
        Perform the search on the given search_term

        :param search_term: unicode encoded string with the search term(s)

        :returns results: dictionary containing number of hits, hits themselves and matched terms for each
        """
        tool_index_dir = os.path.join( trans.app.config.whoosh_index_dir, 'tools' )
        index_exists = whoosh.index.exists_in( tool_index_dir )
        if index_exists:
            index = whoosh.index.open_dir( tool_index_dir )
            try:
                # Some literature about BM25F:
                # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
                # http://en.wikipedia.org/wiki/Okapi_BM25
                # Basically, the higher the number, the bigger the weight.
                # BM25F takes per-field values as <fieldname>_B keyword arguments.
                tool_weighting = scoring.BM25F( name_B=boosts.tool_name_boost,
                                                description_B=boosts.tool_description_boost,
                                                help_B=boosts.tool_help_boost,
                                                repo_owner_username_B=boosts.tool_repo_owner_username_boost )
                searcher = index.searcher( weighting=tool_weighting )

                parser = MultifieldParser( [
                    'name',
                    'description',
                    'help',
                    'repo_owner_username' ], schema=tool_schema )

                user_query = parser.parse( '*' + search_term + '*' )

                try:
                    hits = searcher.search_page( user_query, page, pagelen=page_size, terms=True )
                except ValueError:
                    raise ObjectNotFound( 'The requested page does not exist.' )

                log.debug( 'searching tools for: #' + str( search_term ) )
                log.debug( 'total hits: ' + str( len( hits ) ) )
                log.debug( 'scored hits: ' + str( hits.scored_length() ) )
                results = {}
                results[ 'total_results'] = str( len( hits ) )
                results[ 'page'] = str( page )
                results[ 'page_size'] = str( page_size )
                results[ 'hits' ] = []
                for hit in hits:
                    hit_dict = {}
                    hit_dict[ 'id' ] = hit.get( 'id' )
                    hit_dict[ 'repo_owner_username' ] = hit.get( 'repo_owner_username' )
                    hit_dict[ 'repo_name' ] = hit.get( 'repo_name' )
                    hit_dict[ 'name' ] = hit.get( 'name' )
                    hit_dict[ 'description' ] = hit.get( 'description' )
                    results[ 'hits' ].append( {'tool': hit_dict, 'matched_terms': hit.matched_terms(), 'score': hit.score } )
                return results
            finally:
                searcher.close()
        else:
            raise exceptions.InternalServerError( 'The search index file is missing.' )
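A side note on the '*' + search_term + '*' pattern used here and in the earlier tool-search snippet: a leading wildcard forces Whoosh to scan the whole term index, which gets slow on large indexes. A hedged sketch of a cheaper prefix-only variant:

# A trailing star normalizes to a Prefix query, which can seek directly
# in the term index; a leading star ('*term*') cannot.
user_query = parser.parse( search_term + '*' )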
Beispiel #54
0
 def search(self, search_key):
     ix = self.getIndex()

     parser = MultifieldParser(["book", "chapter", "verse", "verse_text"], schema=ix.schema)
     query = parser.parse(search_key)
     # Format the results while the searcher is still open, then close it.
     with ix.searcher() as searcher:
         result = searcher.search(query, limit=1000)
         return self.formatSearchResult(result)
Beispiel #55
0
def search(ix, query_string, sortedby=None, limit=10):
    mp = MultifieldParser(["title", "summary"], schema=ix.schema)
    
    s = ix.searcher()
    keywords = split_keywords(query_string)
    user_q = mp.parse(' OR '.join(keywords))
    # TODO: add query filter
    results = s.search(user_q, sortedby=sortedby, limit=limit)
    return results
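For the TODO above and the sortedby parameter, a hedged sketch using Whoosh's facet and filter APIs (the date and source field names are assumptions, not part of this snippet's schema):

from whoosh import sorting
from whoosh.query import Term

# Sort newest-first on a 'date' field and restrict hits to one source.
facet = sorting.FieldFacet('date', reverse=True)
results = s.search(user_q, sortedby=facet, filter=Term('source', 'rss'), limit=10)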
Beispiel #56
0
 def search( self, query, return_attribute='id' ):
     # Change field boosts for the searcher to weight title and description
     # more heavily than help. BM25F takes per-field values as
     # <fieldname>_B keyword arguments.
     searcher = self.index.searcher( weighting=BM25F( title_B=3, description_B=2, help_B=1 ) )
     # Set query to search title, description, and help.
     parser = MultifieldParser( [ 'title', 'description', 'help' ], schema=schema )
     results = searcher.search( parser.parse( query ) )
     return [ result[ return_attribute ] for result in results ]
Beispiel #57
0
def init():
    # Setting my schema ...
    schema_email = Schema(
        path=TEXT(stored=True),
        sender_email=TEXT(stored=True),
        recipient_emails=TEXT,
        date=DATETIME,
        subject=TEXT(stored=True),
        body=TEXT,
    )
    schema_book = Schema(email=TEXT(stored=True), name=TEXT(stored=True))
    schemas = {"index_emails": schema_email, "index_book": schema_book}

    if not os.path.exists(index_path):
        os.mkdir(index_path)

    indexes = {}
    for ixname, schema in schemas.items():
        """
        Esta parte es mejorable, ya que sólo indexa si no existe indice. 
        No tiene en cuenta si los archivos indexados se han modificado o si 
        se han eliminado como se explica aquí:
            @url http://pythonhosted.org/Whoosh/indexing.html#incremental-indexing
        """
        exists = index.exists_in(index_path, indexname=ixname)
        if not exists:
            ix = index.create_in(index_path, schema, indexname=ixname)

            # Indexing ...
            writer = ix.writer()
            if ixname == "index_emails":
                files = read_dir()
                index_emails(files, writer)
            elif ixname == "index_book":
                index_book(writer)
        else:
            ix = index.open_dir(index_path, indexname=ixname)
        indexes[ixname] = ix

    # Main routine
    while True:
        ix = indexes.get("index_emails")
        with ix.searcher() as searcher:
            input_user = input("Enter a word from the subject or body (e.g. contrato): ")
            mparser = MultifieldParser(["subject", "body"], schema=ix.schema)
            myquery = mparser.parse(input_user)

            results = searcher.search(myquery)
            print("==================================================")
            for result in results:
                # read_file(result.get("path"))

                print("Sender: " + findNameBySender(indexes, result.get("sender_email")))
                print("Subject: " + result.get("subject"))
                print("==================================================")
Beispiel #58
0
def search_my_archive(query_str):
    my_index = open_dir(conf.PATH_INDEX_ARCHIVE)
    with my_index.searcher() as searcher:
        mparser = MultifieldParser(['content', 'retweet'], schema=my_index.schema)
        query = mparser.parse(query_str)
        results = searcher.search(query)
        result_list = [entry['feed_id'] for entry in results]
        with open(conf.PATH_ARCHIVE_JSON, 'r') as f:
            feeds = json.load(f)
            return [feed for feed in feeds if str(feed['id']) in result_list]
Beispiel #59
0
def answer_query(query):
    with main_index.searcher() as searcher:
        parser = MultifieldParser(['title', 'summary'], main_index.schema, fieldboosts={'title': 5.0, 'summary': 0.2})
        parser.add_plugin(FuzzyTermPlugin())
        # '~' enables fuzzy matching with edit distance 1; '/1' requires the
        # first character to match.
        parsed_query = parser.parse(str(query) + '~/1')

        results = searcher.search(parsed_query, limit=100)
        tags = [r['tag'] for r in results]
    return tags
Beispiel #60
0
 def term_search(self, query):
     terms = []
     if query.get('term'):
         parser = MultifieldParser(self.term_fields, schema=self.index.schema)
         terms.append(parser.parse(str(query.pop('term')[0])))
     # Copy the keys first: we pop from the dict while iterating over it.
     for key in list(query.keys()):
         terms.append(Or([Term(key, str(t)) for t in query.pop(key)]))
     with self.searcher() as searcher:
         for entry in searcher.search(And(terms), limit=None):
             yield entry.fields()