Example #1
from whoosh.qparser import MultifieldParser
from whoosh.highlight import ContextFragmenter, UppercaseFormatter


# _create_index, _get_index, _print_hit and the module-level `schema` come from the surrounding module
def find(criteria, reindex=False):
    """
    Search for Azure CLI commands
    :param criteria: List of query terms to search for.
    :param bool reindex: Clear the current index and reindex the command modules.
    :rtype: None
    """
    if reindex:
        _create_index()

    ix = _get_index()
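    # MultifieldParser matches the query against several schema fields at once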
    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=schema
    )

    if 'OR' in criteria or 'AND' in criteria:
        # looks more advanced, let's trust them to make a great query
        q = qp.parse(" ".join(criteria))
    else:
        # let's help out with some OR's to provide a less restrictive search
        q = qp.parse(" OR ".join(criteria))

    with ix.searcher() as searcher:
        results = searcher.search(q)
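        # ContextFragmenter limits each highlight snippet to ~300 characters;
        # UppercaseFormatter renders the matched terms in UPPER CASE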
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)
Example #2
def find(cmd, criteria, reindex=False):
    from whoosh.qparser import MultifieldParser
    if reindex:
        _create_index(cmd.cli_ctx)

    try:
        ix = _get_index(cmd.cli_ctx)
    except ValueError:
        # got a pickle error because the index was written by a different python version
        # recreate the index and proceed
        _create_index(cmd.cli_ctx)
        ix = _get_index(cmd.cli_ctx)

    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=_get_schema())

    if 'OR' in criteria or 'AND' in criteria:
        # looks more advanced, let's trust them to make a great query
        q = qp.parse(" ".join(criteria))
    else:
        # let's help out with some OR's to provide a less restrictive search
        expanded_query = " OR ".join(criteria) + " OR '{}'".format(criteria)
        q = qp.parse(expanded_query)

    with ix.searcher() as searcher:
        from whoosh.highlight import UppercaseFormatter, ContextFragmenter
        results = searcher.search(q)
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)
Example #3
def query_whoosh(whoosh_dir,
                 queries,
                 weighting=scoring.BM25F(),
                 num_results=50):
    res_sets = []
    ix = index.open_dir(whoosh_dir)

    # The weighting passed in is the scoring model used to rank documents;
    # this makes it easy to examine the effect of scoring on queries for key terms

    # Highlight search terms in results by making them UPPER CASE
    formatter = UppercaseFormatter()

    # Run queries and print results
    for q in queries:  # "new york", "empire state building", "oculus",
        cur = []
        with ix.searcher(weighting=weighting) as searcher:
            query = QueryParser("body", ix.schema).parse(q)
            results = searcher.search(query, limit=num_results)
            results.formatter = formatter
            print_header("Query:   {}   returned {} results for {}".format(
                q, len(results), str(weighting)))
            # if print_results:
            for i, result in enumerate(results):
                cur.append(result['url'].replace('index.txt', ''))
                print_result(i, result)
                print()
        res_sets.append(set(cur))
    return res_sets
Example #4
 def search(self, term):
     results = []
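     # search the original term plus up to five spelling suggestions from the corrector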
     suggestions = [term] + (self.corrector.suggest(term, limit=5))
     for t in suggestions:
         query = self.parser.parse(t)
         query_res = self.searcher.search(query, limit=100)
         query_res.fragmenter.maxchars = 300
         query_res.fragmenter.surround = 100
         query_res.formatter = UppercaseFormatter()
         results.append((t, query_res))
     return results
Example #5
def search_docstrings(ix, query_string, verbose=0):
    with ix.searcher() as searcher:
        query = QueryParser('doc', ix.schema).parse(query_string)
        results = searcher.search(query)
        if verbose > 0:
            results.formatter = UppercaseFormatter()
        for r in results:
            print('Result %d (%g): %s' % (r.rank + 1, r.score, r['name']))
            if verbose == 2:
                print(r['doc'], '\n')
            elif verbose == 1:
                print(r.highlights('doc'), '\n')
Example #6
def testHighlights(autor):
    """
    Find the highlighted fragments that contain the search terms
    """
    ix = whoosh.index.open_dir("ficheros/index")
    parser = QueryParser("descripcion", ix.schema)
    myquery = parser.parse(autor)
    highs = []
    with ix.searcher() as searcher:
        results = searcher.search(myquery)
        results.formatter = UppercaseFormatter()
        for hit in results:
            if hit["descripcion"] is not None:
                resalto = hit.highlights("descripcion")
                highs.append(resalto)
            else:
                print("No description available for this painting")
    return highs
Example #7
def query_whoosh(whoosh_dir, num_results=5):
    ix = index.open_dir(whoosh_dir)

    # Examine effect of scoring on queries for key terms (and key terms themselves)

    # Highlight search term in results by making them UPPER CASE
    formatter = UppercaseFormatter()

    # Weighting used for ranking documents
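    # BM25F is Whoosh's default ranking function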
    weighting = scoring.BM25F()
    
    # Run queries and print results
    for q in ["new york", "empire state building", "oculus"]:
        with ix.searcher(weighting=weighting) as searcher:
            query = QueryParser("body", ix.schema).parse(q)
            results = searcher.search(query, limit=num_results)
            results.formatter = formatter
            print_header("Query:   {}   returned {} results".format(q, len(results)))
            for result in results:
                print_result(result)
                print()
Example #8
def highlights(palabra, numResultados=None):
    """
    Find the highlighted fragments that contain the search terms
    """
    ix = whoosh.index.open_dir(indexPath)
    parser = QueryParser("descripcion", ix.schema)
    myquery = parser.parse(palabra)
    highs = []
    with ix.searcher() as searcher:
        results = searcher.search(myquery, limit=numResultados)
        results.formatter = UppercaseFormatter()
        for hit in results:
            print(hit)
            if hit["descripcion"] is not None:
                resalto = hit.highlights("descripcion")
                cuadro = hitToDict(hit)
                cuadro["highlight"] = resalto
                highs.append(cuadro)
                #print resalto
            else:
                print("No description available for this painting")
    return highs
Example #9
    def __init__(self, index_path, language):
        from whoosh import index as whoosh_index
        from whoosh.fields import Schema, TEXT, ID
        from whoosh import qparser
        from whoosh.highlight import UppercaseFormatter
        from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
        from whoosh.lang import has_stemmer, has_stopwords
        import os
        import sys

        if not has_stemmer(language) or not has_stopwords(language):
            # TODO Display a warning?
            analyzer = SimpleAnalyzer()
        else:
            analyzer = LanguageAnalyzer(language)

        self.schema = Schema(path=ID(unique=True, stored=True),
                             body=TEXT(analyzer=analyzer))
        self.formatter = UppercaseFormatter()

        self.index_path = index_path

        if not os.path.exists(index_path):
            try:
                os.mkdir(index_path)
            except OSError as e:
                sys.exit("Error creating Whoosh index: %s" % e)

        if whoosh_index.exists_in(index_path):
            try:
                self.search_index = whoosh_index.open_dir(index_path)
            except whoosh_index.IndexError as e:
                sys.exit("Error opening whoosh index: {0}".format(e))
        else:
            self.search_index = whoosh_index.create_in(index_path, self.schema)

        self.query_parser = qparser.MultifieldParser(["body", "path"],
                                                     schema=self.schema)
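        # FuzzyTermPlugin adds support for fuzzy query syntax such as "term~"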
        self.query_parser.add_plugin(qparser.FuzzyTermPlugin())
Example #10
 def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None):
     from haystack import site
     results = []
     
     # It's important to grab the hits first before slicing. Otherwise, this
     # can cause pagination failures.
     hits = len(raw_page)
     
     facets = {}
     spelling_suggestion = None
     indexed_models = site.get_indexed_models()
     
     for doc_offset, raw_result in enumerate(raw_page):
         score = raw_page.score(doc_offset) or 0
         app_label, model_name = raw_result['django_ct'].split('.')
         additional_fields = {}
         model = get_model(app_label, model_name)
         
         if model and model in indexed_models:
             for key, value in raw_result.items():
                 index = site.get_index(model)
                 string_key = str(key)
                 
                 if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                     # Special-cased due to the nature of KEYWORD fields.
                     if isinstance(index.fields[string_key], MultiValueField):
                         if value is None or len(value) == 0:
                             additional_fields[string_key] = []
                         else:
                             additional_fields[string_key] = value.split(',')
                     else:
                         additional_fields[string_key] = index.fields[string_key].convert(value)
                 else:
                     additional_fields[string_key] = self._to_python(value)
             
             del additional_fields['django_ct']
             del additional_fields['django_id']
             
             if highlight:
                 from whoosh import analysis
                 from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                 sa = analysis.StemmingAnalyzer()
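                 # strip wildcard characters from the query terms before highlighting them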
                 terms = [term.replace('*', '') for term in query_string.split()]
                 
                 additional_fields['highlighted'] = {
                     self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                 }
             
             result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
             results.append(result)
         else:
             hits -= 1
     
     if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
         if spelling_query:
             spelling_suggestion = self.create_spelling_suggestion(spelling_query)
         else:
             spelling_suggestion = self.create_spelling_suggestion(query_string)
     
     return {
         'results': results,
         'hits': hits,
         'facets': facets,
         'spelling_suggestion': spelling_suggestion,
     }
Example #11
    def _process_results(self,
                         raw_page,
                         highlight=False,
                         query_string='',
                         spelling_query=None,
                         result_class=None):
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(
                                    ',')
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if highlight:
                    from whoosh import analysis
                    from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                    sa = analysis.StemmingAnalyzer()
                    terms = [
                        term.replace('*', '') for term in query_string.split()
                    ]

                    additional_fields['highlighted'] = {
                        self.content_field_name: [
                            highlight(
                                additional_fields.get(self.content_field_name),
                                terms, sa, ContextFragmenter(terms),
                                UppercaseFormatter())
                        ],
                    }

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID], score,
                                      **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
Example #12
writer.commit()

# Allow the user to search
from whoosh.qparser import QueryParser
from whoosh.highlight import UppercaseFormatter

with ix.searcher() as searcher:
    print('Enter search phrase:', end=' ')
    querystring = input()

    parser = QueryParser('content', ix.schema)
    query = parser.parse(querystring)
    results = searcher.search(query)

    results.formatter = UppercaseFormatter()
    def prCyan(skk): print("\033[96m{}\033[00m".format(skk))
 
    for hit in results:
        with open(hit['path']) as f:
            contents = f.read()

        # pass the file text to highlights(); this is required when the field is not stored in the index
        fragment = hit.highlights('content', text=contents)
        print('\n' + fragment)
        prCyan(hit['path'])
Example #13
    def searchdblp(self, userquery):
        documentNumber = 1
        finalresults = {
            "searchTerm": userquery,
            "searchquerydocuments": [],
            "relatedquerydocuments": []
        }

        similarwords = getSimilarWords(
            userquery.replace('"', '').replace(" ", ""))
        queries = []
        queries.append(userquery)
        for similarword in similarwords:
            queries.append(similarword[0])

        with self.ix.searcher() as searcher:
            queryparser = QueryParser("content", schema=self.ix.schema)
            for queryString in queries:
                parsedquery = queryparser.parse(queryString)

                if queryString == queries[0]:
                    corrected = searcher.correct_query(parsedquery,
                                                       queryString)
                    print(corrected.string)
                    if corrected.query != parsedquery:
                        print("Did you mean:", corrected.string)

                results = searcher.search(parsedquery)
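                # use "~" between highlight fragments so they can be split apart below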
                results.formatter = UppercaseFormatter(between="~")

                for hit in results:
                    filename = hit["path"] + ".json"
                    filepath = os.path.join(self.dataDirPath, filename)
                    with open(filepath, "r", encoding="utf-8") as f:
                        jsonfile = json.load(f)
                        filecontents = jsonfile['abstract']
                        pagerank = jsonfile['pagerank']
                        category = jsonfile['category']
                        subresult = {
                            "title": hit['title'],
                            "path": "http://127.0.0.1:5000/file/" + hit["path"],
                            "highlights": ' ... '.join(
                                x.replace("\n", " ")
                                for x in hit.highlights("content", filecontents).split("~")
                            ),
                            "pagerank": pagerank,
                            "category": category
                        }
                        documentNumber += 1
                        if queryString == userquery:
                            finalresults["searchquerydocuments"].append(
                                subresult)
                        else:
                            finalresults["relatedquerydocuments"].append(
                                subresult)
        return json.dumps(finalresults, indent=4)