def CreateQueryParser():
    # Build a module-level parser over the Title and ParaText fields.
    # OrGroup.factory(0.9) ORs terms together while rewarding documents
    # that match more of them; relies on a module-level index object ix.
    global qp
    og = qparser.OrGroup.factory(0.9)
    qp = qparser.MultifieldParser(["Title", "ParaText"],
                                  schema=ix.schema,
                                  group=og)
    qp.add_plugin(FuzzyTermPlugin())

def searcher(index_path, query):
    ix = open_dir(index_path)
    searcher = ix.searcher()
    parser = QueryParser("content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    my_query = parser.parse(query)
    results = searcher.search(my_query, limit=None)
    for result in results:
        print(result['content'])
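FuzzyTermPlugin extends the query syntax with a trailing tilde: term~ allows one edit, term~2 allows two, and term~2/3 additionally requires an exact three-character prefix. A minimal call of the function above, assuming an index directory named indexdir:

searcher('indexdir', u'whoosh~2 OR serch~')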
Example #3
    def __init__(self, articles_path):
        """Attempt to initialize a folder with Markdown articles. If a git
        repo, create a search index and populate.

        Markdown Extension References
        * http://facelessuser.github.io/pymdown-extensions
        * https://pythonhosted.org/Markdown/extensions
        """
        self.article_repo = Repo(articles_path)
        self.articles_path = articles_path
        self.markdown_extensions = [
            'markdown.extensions.abbr',
            'markdown.extensions.attr_list',
            'markdown.extensions.def_list',
            'markdown.extensions.fenced_code',
            'markdown.extensions.footnotes',
            'markdown.extensions.tables',
            'markdown.extensions.smart_strong',
            'markdown.extensions.admonition',
            'markdown.extensions.codehilite',
            'markdown.extensions.headerid',
            'markdown.extensions.sane_lists',
            'markdown.extensions.smarty',
            'markdown.extensions.toc',
            'markdown.extensions.wikilinks',
            'pymdownx.betterem',
            'pymdownx.caret',
            'pymdownx.githubemoji',
            'pymdownx.headeranchor',
            'pymdownx.magiclink',
            'pymdownx.mark',
            'pymdownx.smartsymbols',
            'pymdownx.tasklist',
            'pymdownx.tilde',
            'pymdownx.critic',
        ]
        self.markdown_extensions_config = {
            'markdown.extensions.codehilite': {
                'css_class': 'code-highlight'
            }
        }
        self.__search_schema = Schema(
            title=ID(stored=True, unique=True),
            path=ID(stored=True),
            content=TEXT,
        )
        self.__search_parser = MultifieldParser(
            ['title', 'content'],
            schema=self.__search_schema,
        )
        self.__search_parser.add_plugin(FuzzyTermPlugin())
        self.__search_index = self.create_search_index()
        self.populate_search_index()
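A search method built on the parser and index created above might look like the following sketch (search_articles is an illustrative name, not from the original class):

    def search_articles(self, query_string):
        """Parse the query with fuzzy support and return stored fields."""
        with self.__search_index.searcher() as searcher:
            query = self.__search_parser.parse(query_string)
            return [dict(hit) for hit in searcher.search(query, limit=20)]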
Example #4
  def __init__(self, url, headers, rows):
    self.url = url
    self.headers = headers
    self.rows = rows

    self.schema = Schema(
      name=TEXT(stored=False),
      alternative_names=TEXT(stored=False),
      id=ID(stored=True)
    )
    self.index = RamStorage().create_index(self.schema)

    for c in [NAME_HEADER, ALT_NAMES_HEADER, TYPE_HEADER]:
      assert c in self.headers, 'Required "{}" column not found in {}'.format(c, url)

    name_idx = self.headers.index(NAME_HEADER)
    alt_names_idx = self.headers.index(ALT_NAMES_HEADER)

    writer = self.index.writer()
    for idx, row in enumerate(self.rows):
      name = row[name_idx]
      alt_names = row[alt_names_idx]
      writer.add_document(
        name=str(name),
        alternative_names=str(alt_names),
        id=str(idx)
      )
    writer.commit()

    parser = QueryParser("name", self.index.schema)
    self.exact_name_query_parser = parser

    parser = QueryParser("name", self.index.schema)
    parser.add_plugin(FuzzyTermPlugin())
    self.name_query_parser = parser

    parser = QueryParser("alternative_names", self.index.schema)
    parser.add_plugin(FuzzyTermPlugin())
    self.alt_names_query_parser = parser
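These three parsers might then back a lookup method along the following lines (a sketch; find_ids and the ~1 edit distance are illustrative, not part of the original class):

  def find_ids(self, text):
    # Try an exact name match first, then fall back to fuzzy matching
    # on names and alternative names (~1 allows one edit per word).
    with self.index.searcher() as s:
      q = self.exact_name_query_parser.parse(str(text))
      hits = s.search(q, limit=10)
      if not hits:
        fuzzy = ' '.join(w + '~1' for w in str(text).split())
        q = (self.name_query_parser.parse(fuzzy)
             | self.alt_names_query_parser.parse(fuzzy))
        hits = s.search(q, limit=10)
      return [hit['id'] for hit in hits]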
Example #5
def search():
    print(request.args)
    search = request.args.get('search')
    author = request.args.get('author')
    category = request.args.get('category')
    page = int(request.args.get('page', 1))
    print(search)

    if search is None and author is None and category is None:
        myquery = Every()
    else:
        if search is None:
            if author is not None:
                myquery = Term('author', author)
                if category is not None:
                    myquery = myquery & Term('category', category)
            else:
                myquery = Term('category', category)
        else:
            myquery = MultifieldParser(["title", "post_content"],
                                       ix.schema,
                                       plugins=[FuzzyTermPlugin()]).parse(search)

            if author is not None:
                myquery = myquery & Term('author', author)

            if category is not None:
                myquery = myquery & Term('category', category)

    with ix.searcher() as searcher:
        results = searcher.search_page(myquery,
                                       page,
                                       pagelen=25,
                                       sortedby="date",
                                       reverse=True)
        print(results.is_last_page())
        results_json = json.dumps(
            {
                "results": [dict(i) for i in results],
                "page": page,
                "total_results": results.total
            },
            default=str)

    resp = Response(response=results_json,
                    status=200,
                    mimetype="application/json")

    return resp
Example #6
def inicia():
    pth = os.path.abspath(os.path.dirname(os.path.realpath(__file__)) + "/indiceJuego")
    if not os.path.exists(pth):
        os.mkdir(pth)
        esquemaJuego = Schema(titulo=KEYWORD(stored=True), descripcion=TEXT,
                              categorias=KEYWORD(stored=True), plataformas=KEYWORD(stored=True),
                              precio=NUMERIC(stored=True))
        indiceJuego = create_in(pth, esquemaJuego)
    else:
        indiceJuego = open_dir(pth)
 
    parser = MultifieldParser(["titulo"], schema=indiceJuego.schema)
    parser.add_plugin(FuzzyTermPlugin())
 
    return indiceJuego, parser
Example #7
def search_in_index(search_kw, index):
    parser = MultifieldParser(["content", "title"], index.schema)
    parser.add_plugin(FuzzyTermPlugin())
    searcher = index.searcher()
    to_parse = ' '.join([i + '~0' for i in search_kw.split(' ')])
    myquery = parser.parse(to_parse)
    r = searcher.search(myquery)
    results = []
    for res in r:
        results.append(res['path'])
    corrector = searcher.corrector("content")
    suggestions = []
    for kw in search_kw.split(' '):
        suggestions.append(corrector.suggest(kw))
    searcher.close()
    return results, suggestions
Example #8
def match(query_str, idx, limit=40):
    ret_results = []

    query_words = words_get(query_str)
    if len(query_words) == 0:
        return ret_results

    with idx.searcher() as searcher:
        rome_facet = sorting.FieldFacet('rome')

        # Strict search, with forced correction
        parser = QueryParser('label', idx.schema)
        query = parser.parse(query_str)
        cor = searcher.correct_query(query, query_str)
        results = searcher.search(cor.query, limit=20, collapse=rome_facet)

        # Word-joker search
        parser = QueryParser('label', idx.schema)
        query = parser.parse(f'{query_str}*')
        results_partial = searcher.search(query, limit=20, collapse=rome_facet)
        results.upgrade_and_extend(results_partial)

        # Fuzzy search
        parser = QueryParser('label', idx.schema, termclass=CustomFuzzyTerm)
        parser.add_plugin(FuzzyTermPlugin())

        # Drop short words (three characters or fewer) before fuzzy matching
        shortword = re.compile(r'\W*\b\w{1,3}\b')
        query_prep = shortword.sub('', query_str)
        query = parser.parse(query_prep)
        results_fuzzy = searcher.search(query,
                                        limit=limit,
                                        collapse=rome_facet)

        results.upgrade_and_extend(results_fuzzy)
        for res in results:
            ret_results.append({
                'id': res['rome'],
                'label': res['label'],
                'value': res['label'],
                'occupation': res['slug'],
                'source': res['source'],
                'score': res.score
            })

    return sorted(ret_results, key=lambda e: e['score'], reverse=True)
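CustomFuzzyTerm is defined elsewhere in that project; a typical definition simply widens the default edit distance, along these lines (an assumed reconstruction, not the original class):

from whoosh.query import FuzzyTerm

class CustomFuzzyTerm(FuzzyTerm):
    # Assumed: allow up to two edits instead of FuzzyTerm's default one.
    def __init__(self, fieldname, text, boost=1.0, maxdist=2,
                 prefixlength=1, constantscore=True):
        super().__init__(fieldname, text, boost, maxdist,
                         prefixlength, constantscore)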
Example #9
    def get_schema(self):
        schema_dir = os.path.join(cache_dir, self.id_)
        os.makedirs(schema_dir, exist_ok=True)

        if exists_in(schema_dir) and open_dir(schema_dir).doc_count() != 0:
            self.ix = open_dir(schema_dir)
            print('Existing index cache found. Loaded {} tree nodes. Hooray!'.
                  format(self.ix.doc_count()))

        else:
            print('No valid cache found. Building indexes...')
            now = time.time()
            self.__build_whoosh_index(schema_dir)
            print('Finished in {:.2f} seconds'.format(time.time() - now))

        self.parser = MultifieldParser(self.ix.schema.names(),
                                       schema=self.ix.schema)
        self.parser.add_plugin(FuzzyTermPlugin())
Example #10
def test_fuzzy_prefix():
    from whoosh import scoring

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT(spelling=True))

    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        # Match -> first
        w.add_document(title=u("First"),
                       content=u("This is the first document we've added!"))
        # No match
        w.add_document(
            title=u("Second"),
            content=u("The second one is even more interesting! filst"))
        # Match -> first
        w.add_document(title=u("Third"),
                       content=u("The world first line we've added!"))
        # Match -> zeroth
        w.add_document(
            title=u("Fourth"),
            content=u("The second one is alaways comes after zeroth!"))
        # Match -> fire is within 2 edits (transpose + delete) of first
        w.add_document(title=u("Fifth"), content=u("The fire is beautiful"))

    from whoosh.qparser import QueryParser, FuzzyTermPlugin
    parser = QueryParser("content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    q = parser.parse("first~2/3 OR zeroth", debug=False)

    assert isinstance(q, query.Or)
    ft = q[0]
    assert isinstance(ft, query.FuzzyTerm)
    assert ft.maxdist == 2
    assert ft.prefixlength == 3

    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        results = searcher.search(q)
        assert len(results) == 4
        assert (" ".join(sorted(
            hit["title"] for hit in results)) == "Fifth First Fourth Third")
Example #11
    def __init__(self, location):
        """
        create a new redis store, the location given will be used to generate keys

        this keys will be combined to get/set instance config

        Args:
            location (Location)
        """
        super().__init__(location, Serializer())
        config = self.config_env.get_store_config("whoosh")
        self.base_index_path = config["path"]

        self.schema = self.get_schema()
        self.index = self.get_index(self.schema)

        self.default_plugins = [
            FuzzyTermPlugin(), GtLtPlugin(),
            PhrasePlugin()
        ]
        self.default_pagenum = 1
        self.default_pagelen = 20
Example #12
def search_in_index(search_kw, index):
    '''
    search_kw: the raw text the user types into the search bar
    index: the opened index (the ix object from the preceding code)

    The function returns a dictionary with two keys:
        - results: a list of dictionaries, one per search hit, with the
        best match first. Each dictionary has two keys: 'title' with the
        document title, and 'path' with the path (to the text document,
        for now)
        - suggestions: a dictionary proposing a possible correction for
        each word the user entered. How to merge the suggestions for the
        different words into complete corrected queries remains to be
        decided
    '''
    # Use a MultifieldParser to search both the title and the content
    parser = MultifieldParser(["content", "title"], index.schema)
    # Add the fuzzy-matching plugin so searches can go beyond exact words
    parser.add_plugin(FuzzyTermPlugin())
    searcher = index.searcher()
    # Rewrite the user query into the syntax the fuzzy plugin understands
    to_parse = ' '.join([i + '~1' for i in search_kw.split(' ')])
    myquery = parser.parse(to_parse)
    # Collect the results now so the searcher can be closed afterwards
    r = searcher.search(myquery)
    results = []
    for res in r:
        results.append({'title': res['title'], 'path': res['path']})
    # Set up the corrector and record its proposals for each typed word
    corrector = searcher.corrector("content")
    suggestions = {}
    for kw in search_kw.split(' '):
        suggestions[kw] = corrector.suggest(kw)
    # Close the searcher
    searcher.close()
    return {'results': results, 'suggestions': suggestions}
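One way to collapse the per-word suggestions into a single "did you mean" string (did_you_mean is an illustrative helper, not part of the original):

def did_you_mean(search_kw, suggestions):
    # Substitute each word with its top suggestion when one exists.
    words = [suggestions[w][0] if suggestions.get(w) else w
             for w in search_kw.split(' ')]
    return ' '.join(words)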
Example #13
    id = Column(Integer, primary_key=True)
    title = Column(Text)
    body = Column(UnicodeText)
    created = Column(DateTime, default=datetime.datetime.utcnow)

    def __repr__(self):
        return '{0}(title={1})'.format(self.__class__.__name__,
                                       self.title)


Base.metadata.create_all(engine)

index_subscriber = IndexSubscriber(session=session, whoosh_base_path='index')
index_subscriber.subscribe(Post)

p1 = Post(title='love barcelona', body='it is the best city in the world even before madrid!')
p2 = Post(title='love madrid', body='it is the second best city in the world after barcelona!')
session.add_all([p1, p2])
session.commit()

# normal search, this does not keep whoosh score
Post.whoosh.search('barcelona').all()
Post.whoosh.search('madrid').all()

# ordered result based on whoosh score
results = Post.whoosh.search_all_ordered('madrid')
results = Post.whoosh.search_all_ordered('barcelona')

from whoosh.qparser import FuzzyTermPlugin
Post.whoosh.search_all_ordered('baarcelonaa~2', plugin=FuzzyTermPlugin())
Example #14
def main():
    """ The main loop for the program """
    g = Gui()
    ix = index.open_dir("indexdir")

    while True:
        event, values = g.window.read()
        g.window["_output_"]('')

        # close windows
        if event is None:
            break

        if event == '_SEARCH_' and values['TERM'] is not None:

            # The 'fieldboosts' parameter controls how much weight matches in each field carry
            qp = MultifieldParser(
                ["procTitle", "topics", "categories", "procContent"],
                termclass=Variations,
                schema=ix.schema,
                fieldboosts={
                    "procTitle": 1.5,
                    "categories": 1.3
                })
            qp.add_plugin(FuzzyTermPlugin())

            terms = str(values['TERM'])
            terms = terms.replace("title", "procTitle").replace("topic", "topics") \
                 .replace("category", "categories").replace("content", "procContent")

            # If the option is enabled, expand the submitted query with synonyms, taking care
            # to pass boolean tokens through unchanged and to translate field specifiers
            # correctly when terms are restricted to particular fields.
            if values['syn_search']:
                with open("utils/wn_s.pl", "r") as f:
                    thesaurus = Thesaurus.from_file(f)
                termsWithSynonyms = []
                for term in terms.split(" "):
                    field = None
                    if ":" in term:
                        field = term.split(":")[0]
                        term = term.split(":")[1]
                    if term not in booleanTokens:
                        termSynonyms = thesaurus.synonyms(term)
                        if field is not None:
                            termSynonyms = [
                                f"{field}:{t}" for t in termSynonyms
                            ]
                            termSynonyms.append(f"{field}:{term}")
                        else:
                            termSynonyms.append(term)
                        termsWithSynonyms.append(" OR ".join(termSynonyms))
                    else:
                        termsWithSynonyms.append(term)
                terms = ' '.join(termsWithSynonyms)

            print("- Searching for >>> " + str(terms))

            # Stem the query terms and append a tilde (fuzzy search) to those the stemmer actually changed
            words = terms.split(' ')
            stemmedWords = list()
            for word in words:
                stemmed = stem(word)
                if word != stemmed:
                    stemmedWords.append(stemmed + '~')
                else:
                    stemmedWords.append(stemmed)

            q = qp.parse(' '.join(stemmedWords))

            with ix.searcher() as searcher:
                if not values['syn_search']:
                    correction = searcher.correct_query(q=q,
                                                        qstring=terms,
                                                        maxdist=2)
                    if terms != correction.string:
                        print("- Did you mean >>> " + correction.string)
                results = searcher.search(q, terms=True)

                if not values['syn_search'] and results.is_empty():
                    print(
                        "- No relevant result has been found for query, trying corrected query"
                    )
                    results = searcher.search(qp.parse(correction.string))

                numb = 1
                if not results.is_empty():
                    for elem in results:
                        # print(elem)
                        print(
                            f"Result n.{numb} >>> Title: {str(elem['docTitle'])}\n\tScore: {str(elem.score)}\n"
                            f"\tLink to the page: {str(elem['pageUrl'])}\n")
                        numb += 1
                else:
                    print("- No relevant result has been found")
Example #15
def _criar_parser():
    parser = QueryParser('nome', schema)
    parser.add_plugin(FuzzyTermPlugin())
    return parser
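A call would look something like this (the term and edit distance are illustrative):

query = _criar_parser().parse(u'analgesico~2')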
Example #16
    def test_plugins_search(self):
        results = self.Post.whoosh.search_all_ordered('baarcelonaa~2',
                                                      plugin=FuzzyTermPlugin())
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0].id, 1)
        self.assertEqual(results[1].id, 2)
Example #17
# coding=utf-8
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser import FuzzyTermPlugin

idx_dir = 'lagou_idx'
ix = open_dir(idx_dir)
searcher = ix.searcher()

parser = MultifieldParser(["name", "desc"], schema=ix.schema)
parser.add_plugin(FuzzyTermPlugin())

# A query mixing a fuzzy term (Pythn~2) with a field-specific term.
k = u'搜索 OR Pythn~2 city:上海'
q = parser.parse(k)

results = searcher.search_page(q, 1, pagelen=5)

print(u'{0} results found for keyword {1}, {2} returned: '.format(
    len(results), k, results.scored_length()))
for hit in results[:50]:
    print(hit['id'])
    print(hit['name'])
    # print(hit['city'])
    print(hit['com_name'])
    print('************')