Beispiel #1
0
    def delete_by_query(self, q, searcher=None):
        """Delete every document matching the given query object.

        Note that when no searcher is supplied this method opens and
        closes its own Searcher. If you are calling this method
        repeatedly (for example, deleting changed documents before
        reindexing them), you should open your own Searcher object and
        pass it in with the 'searcher' keyword argument for efficiency.

        :param q: a query object; its ``docs()`` method yields the
            matching document numbers.
        :param searcher: an optional already-open Searcher to reuse.
        :*returns*: the number of documents deleted.
        """

        if searcher is None:
            # Deferred import — presumably avoids a circular import at
            # module load time (TODO confirm).
            from whoosh.searching import Searcher
            s = Searcher(self)
        else:
            s = searcher

        count = 0
        try:
            for docnum in q.docs(s):
                self.delete_document(docnum)
                count += 1
            return count
        finally:
            # Only close the searcher this method opened itself.
            if searcher is None:
                s.close()
        # BUG FIX: removed the unreachable trailing `return count` — the
        # try block always returns (or raises), so it was dead code.
Beispiel #2
0
    def test_vector_merge(self):
        """Verify term-vector contents for a vectored field across two
        separately committed segments (exercising segment merging)."""
        a = analysis.StandardAnalyzer()
        schema = fields.Schema(
            title=fields.TEXT,
            content=fields.TEXT(vector=formats.Frequency(analyzer=a)))
        ix = self.make_index("testindex", schema, "vector_merge")
        searcher = None
        try:
            writer = ix.writer()
            writer.add_document(
                title=u"one",
                content=u"This is the story of the black hole story")
            writer.commit()

            # Second, separate commit so the index has a second segment.
            writer = ix.writer()
            writer.add_document(title=u"two",
                                content=u"You can read along in your book")
            writer.commit()

            reader = ix.reader()
            searcher = Searcher(reader)

            docnum = searcher.document_number(title=u"one")
            vec = list(reader.vector(docnum, "content").items_as("frequency"))
            self.assertEqual(vec, [(u'black', 1), (u'hole', 1), (u'story', 2)])

            docnum = searcher.document_number(title=u"two")
            vec = list(searcher.vector_as("frequency", docnum, "content"))
            self.assertEqual(vec, [(u'along', 1), (u'book', 1), (u'read', 1),
                                   (u'your', 1)])
        finally:
            # BUG FIX: the finally block was a bare `pass`, leaking the
            # open searcher/reader and the index (problematic e.g. for
            # file locks on Windows). Closing the searcher also closes
            # the reader it owns.
            if searcher is not None:
                searcher.close()
            ix.close()
Beispiel #3
0
 def delete_by_query(self, q, searcher=None):
     """Delete every document matching the given query object.

     Note that when no searcher is supplied this method opens and
     closes its own Searcher. If you are calling this method repeatedly
     (for example, deleting changed documents before reindexing them),
     you should open your own Searcher object and pass it in with the
     'searcher' keyword argument for efficiency.

     :param q: a query object; its ``docs()`` method yields the
         matching document numbers.
     :param searcher: an optional already-open Searcher to reuse.
     :*returns*: the number of documents deleted.
     """

     if searcher is None:
         # Deferred import — presumably avoids a circular import at
         # module load time (TODO confirm).
         from whoosh.searching import Searcher
         s = Searcher(self)
     else:
         s = searcher

     count = 0
     try:
         for docnum in q.docs(s):
             self.delete_document(docnum)
             count += 1
         return count
     finally:
         # Only close the searcher this method opened itself.
         if searcher is None:
             s.close()
     # BUG FIX: removed the unreachable trailing `return count` — the
     # try block always returns (or raises), so it was dead code.
 def test_vector_merge(self):
     """Verify term-vector contents for a vectored field across two
     separately committed segments (exercising segment merging)."""
     a = analysis.StandardAnalyzer()
     schema = fields.Schema(
         title=fields.TEXT,
         content=fields.TEXT(vector=formats.Frequency(analyzer=a)))
     ix = self.make_index("testindex", schema, "vector_merge")
     searcher = None
     try:
         writer = ix.writer()
         writer.add_document(title=u"one",
                             content=u"This is the story of the black hole story")
         writer.commit()

         # Second, separate commit so the index has a second segment.
         writer = ix.writer()
         writer.add_document(title=u"two",
                             content=u"You can read along in your book")
         writer.commit()

         reader = ix.reader()
         searcher = Searcher(reader)

         docnum = searcher.document_number(title=u"one")
         vec = list(reader.vector(docnum, "content").items_as("frequency"))
         self.assertEqual(vec, [(u'black', 1), (u'hole', 1), (u'story', 2)])

         docnum = searcher.document_number(title=u"two")
         vec = list(searcher.vector_as("frequency", docnum, "content"))
         self.assertEqual(vec, [(u'along', 1), (u'book', 1), (u'read', 1), (u'your', 1)])
     finally:
         # BUG FIX: the finally block was a bare `pass`, leaking the
         # open searcher/reader and the index. Closing the searcher
         # also closes the reader it owns.
         if searcher is not None:
             searcher.close()
         ix.close()
Beispiel #5
0
def test2():
    """Run every loaded query against the on-disk index and print the
    top-10 results for each."""
    queries = load_queries()

    ix = index.open_dir(index_dir)
    searcher = Searcher(ix)
    # Alternative: searcher = ix.searcher(weighting=scoring.BM25F())
    for q in queries:
        print("Processing query number", q['id'])
        print(searcher.search(q['text'], limit=10))
def test2():
    """Search the index with each stored query and print the results."""
    all_queries = load_queries()

    ix = index.open_dir(index_dir)
    mysearcher = Searcher(ix)
    # A weighted searcher could be used instead:
    # mysearcher = ix.searcher(weighting=scoring.BM25F())
    for current in all_queries:
        print("Processing query number", current['id'])
        hits = mysearcher.search(current['text'], limit=10)
        print(hits)
Beispiel #7
0
def test_memory_codec():
    """Exercise the in-memory codec end to end: searching, stored
    fields, sortable column values, term vectors, and the spelling
    word graph."""
    from whoosh.codec import memory
    from whoosh.searching import Searcher

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT(vector=True),
                           b=fields.STORED,
                           c=fields.NUMERIC(stored=True, sortable=True),
                           d=fields.TEXT(analyzer=ana, spelling=True))

    codec = memory.MemoryCodec()
    with codec.writer(schema) as w:
        w.add_document(a=u("alfa bravo charlie"), b="hello", c=100,
                       d=u("quelling whining echoing"))
        w.add_document(a=u("bravo charlie delta"), b=1000, c=200,
                       d=u("rolling timing yelling"))
        w.add_document(a=u("charlie delta echo"), b=5.5, c=300,
                       d=u("using opening pulling"))
        w.add_document(a=u("delta echo foxtrot"), b=True, c=-100,
                       d=u("aching selling dipping"))
        w.add_document(a=u("echo foxtrot india"), b=None, c=-200,
                       d=u("filling going hopping"))

    reader = codec.reader(schema)
    s = Searcher(reader)

    assert ("a", "delta") in reader
    q = query.Term("a", "delta")
    r = s.search(q)
    assert len(r) == 3
    assert [hit["b"] for hit in r] == [1000, 5.5, True]

    assert (" ".join(s.field_terms("a"))
            == "alfa bravo charlie delta echo foxtrot india")

    cfield = schema["c"]
    c_sortables = cfield.sortable_terms(reader, "c")
    c_values = [cfield.from_bytes(t) for t in c_sortables]
    # BUG FIX: was `assert c_values, [-200, ...]` — the list was parsed
    # as the assert *message*, so only truthiness was checked. Compare
    # against the expected values (sortable terms come back in sorted
    # numeric order).
    assert c_values == [-200, -100, 100, 200, 300]

    assert reader.has_column("c")
    c_values = list(reader.column_reader("c"))
    # Column values come back in document (insertion) order.
    assert c_values == [100, 200, 300, -100, -200]

    assert s.has_vector(2, "a")
    v = s.vector(2, "a")
    assert " ".join(v.all_ids()) == "charlie delta echo"

    assert reader.has_word_graph("d")
    gr = reader.word_graph("d")
    assert (" ".join(gr.flatten_strings()) ==
            "aching dipping echoing filling going hopping opening "
            "pulling quelling rolling selling timing using whining "
            "yelling")
def get_index_docnum_of_article_id(article_id: str,
                                   ix_reader: MultiReader,
                                   article_id_fieldname='articleID'):
    """Resolve *article_id* to its index document number.

    Logs a warning and returns -1 when the article is absent; logs a
    warning and returns the first hit's docnum when duplicates exist.
    """
    parser = QueryParser(article_id_fieldname, ix_reader.schema)
    hits = Searcher(ix_reader).search(parser.parse(article_id))
    if not hits:
        LOGGER.warning(
            'Article ID {} was not found in the index'.format(article_id))
        return -1
    if len(hits) > 1:
        LOGGER.warning(
            'Article ID {} has multiple instances in the index'.format(
                article_id))
    return hits[0].docnum
Beispiel #9
0
    def __init__(self, index, page_ranker):
        """Build the query parser and searcher for the wiki index.

        A plain QueryParser plus the 'MultifieldPlugin' is used instead
        of a MultifieldParser, because the plugin lets the 'search'
        method adjust the per-field boosts afterwards (which a directly
        constructed MultifieldParser would not allow).

        - MULTIFIELD: a query matches both the title and the text. The
          parser applies each field's own analyzer from the index
          schema, so with a stemmed text field and an unstemmed title,
          'fortified' parses to '(text:fortifi OR title:fortified)'.
        - GROUP: by default tokens are joined with 'AND'; 'OrGroup'
          joins them with OR, and its FACTORY form scores documents
          higher when a term occurs with higher frequency.
        """
        self.index = index
        self.page_ranker = page_ranker
        self.expand = Expander(disambiguate_fn='noun_sense')

        # The plugin is kept on the instance so boosts can be tweaked
        # per search call.
        self.multifield_plugin = qparser.MultifieldPlugin(['text', 'title'])
        self.parser = qparser.QueryParser(None, index.schema)
        self.parser.add_plugin(self.multifield_plugin)

        # NOTE: this instance attribute shadows the class-level
        # `weighting` dict, which therefore must be accessed as
        # WikiSearcher.weighting.
        self.weighting = 'BM25F'
        self.searcher = WhooshSearcher(
            reader=self.index.reader(),
            weighting=WikiSearcher.weighting[self.weighting])
Beispiel #10
0
    def searcher(self, **kwargs):
        """Open and return a searching.Searcher over this index.

        All keyword arguments are forwarded to the Searcher
        constructor.

        :*returns*: searching.Searcher
        """
        # Deferred import — presumably avoids a circular import at
        # module load time (TODO confirm).
        from whoosh.searching import Searcher
        new_searcher = Searcher(self, **kwargs)
        return new_searcher
Beispiel #11
0
    def searcher(self, **kwargs):
        """Open a reader and return a Searcher wrapping it.

        Keyword arguments are passed through to the Searcher
        constructor.

        :rtype: :class:`whoosh.searching.Searcher`
        """
        # Deferred import — presumably avoids a circular import at
        # module load time (TODO confirm).
        from whoosh.searching import Searcher
        reader = self.reader()
        return Searcher(reader, fromindex=self, **kwargs)
Beispiel #12
0
 def _parse_and_search(
         self,
         searcher: Searcher,
         content: str,
         value: str,
         limit: Optional[int] = None,
         terms: bool = False,
         group: Type[Union[AndGroup, OrGroup]] = OrGroup) -> Results:
     """Parse *value* against the *content* field and run the search.

     Field-prefix and wildcard syntax are disabled so the query text is
     interpreted literally apart from the grouping operator.
     """
     qp = QueryParser(content, self._ix.schema, group=group)
     for plugin_cls in (FieldsPlugin, WildcardPlugin):
         qp.remove_plugin_class(plugin_cls)
     parsed = qp.parse(value)
     return searcher.search(parsed, terms=terms, limit=limit)
Beispiel #13
0
 def test_all(self):
     """Smoke-test every weighting class against a small random index."""
     domain = [u"alfa", u"bravo", u"charlie", u"delta", u"echo", u"foxtrot"]
     schema = Schema(text=TEXT)
     storage = RamStorage()
     ix = storage.create_index(schema)
     w = ix.writer()
     # 100 documents of 10-20 words drawn at random from `domain`
     # (xrange: this snippet targets Python 2).
     for _ in xrange(100):
         w.add_document(text=u" ".join(choice(domain) for i in xrange(randint(10, 20))))
     w.commit()
     
     # provide initializer arguments for any weighting classes that require them
     init_args = {}
     
     reader = ix.reader()
     for wclass in self._weighting_classes():
         if wclass in init_args:
             weighting = wclass(*init_args[wclass])
         else:
             weighting = wclass()
         searcher = Searcher(reader, weighting)
         
         # Run a query for each word; NOTE(review): the snippet appears
         # truncated here — the original presumably asserts on `r`.
         for word in domain:
             r = searcher.search(query.Term("text", word))
Beispiel #14
0
def test_memory_codec():
    """Exercise the in-memory codec: searching, stored fields, sortable
    column values, and term vectors."""
    from whoosh.codec import memory
    from whoosh.searching import Searcher

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT(vector=True),
                           b=fields.STORED,
                           c=fields.NUMERIC(stored=True, sortable=True),
                           d=fields.TEXT(analyzer=ana, spelling=True))

    codec = memory.MemoryCodec()
    with codec.writer(schema) as w:
        w.add_document(a=u("alfa bravo charlie"),
                       b="hello",
                       c=100,
                       d=u("quelling whining echoing"))
        w.add_document(a=u("bravo charlie delta"),
                       b=1000,
                       c=200,
                       d=u("rolling timing yelling"))
        w.add_document(a=u("charlie delta echo"),
                       b=5.5,
                       c=300,
                       d=u("using opening pulling"))
        w.add_document(a=u("delta echo foxtrot"),
                       b=True,
                       c=-100,
                       d=u("aching selling dipping"))
        w.add_document(a=u("echo foxtrot india"),
                       b=None,
                       c=-200,
                       d=u("filling going hopping"))

    reader = codec.reader(schema)
    s = Searcher(reader)

    assert ("a", "delta") in reader
    q = query.Term("a", "delta")
    r = s.search(q)
    assert len(r) == 3
    assert [hit["b"] for hit in r] == [1000, 5.5, True]

    assert (" ".join(
        s.field_terms("a")) == "alfa bravo charlie delta echo foxtrot india")

    cfield = schema["c"]
    c_sortables = cfield.sortable_terms(reader, "c")
    c_values = [cfield.from_bytes(t) for t in c_sortables]
    # BUG FIX: was `assert c_values, [-200, ...]` — the list was parsed
    # as the assert *message*, so only truthiness was checked. Compare
    # against the expected values (sortable terms come back in sorted
    # numeric order).
    assert c_values == [-200, -100, 100, 200, 300]

    assert reader.has_column("c")
    c_values = list(reader.column_reader("c"))
    # Column values come back in document (insertion) order.
    assert c_values == [100, 200, 300, -100, -200]

    assert s.has_vector(2, "a")
    v = s.vector(2, "a")
    assert " ".join(v.all_ids()) == "charlie delta echo"
Beispiel #15
0
    def test_missing_field_scoring(self):
        """A document that omits a field must still update field lengths
        correctly and leave multi-field search working."""
        schema = fields.Schema(name=fields.TEXT(stored=True),
                               hobbies=fields.TEXT(stored=True))
        idx = RamStorage().create_index(schema)

        w = idx.writer()
        w.add_document(name=u'Frank', hobbies=u'baseball, basketball')
        w.commit()
        # After the first document: hobbies has 2 tokens, name has 1.
        self.assertEqual(idx.segments[0].field_length(0), 2)  # hobbies
        self.assertEqual(idx.segments[0].field_length(1), 1)  # name

        w = idx.writer()
        w.add_document(name=u'Jonny')  # no hobbies field at all
        w.commit()
        self.assertEqual(len(idx.segments), 1)
        self.assertEqual(idx.segments[0].field_length(0), 2)  # hobbies
        self.assertEqual(idx.segments[0].field_length(1), 2)  # name

        searcher = Searcher(idx.reader())
        mparser = qparser.MultifieldParser(['name', 'hobbies'],
                                           schema=schema)
        hits = searcher.search(mparser.parse(u"baseball"))
        self.assertEqual(len(hits), 1)
 def test_missing_field_scoring(self):
     """Field lengths and scoring must stay consistent when a later
     document omits one of the schema's fields."""
     schema = fields.Schema(name=fields.TEXT(stored=True),
                            hobbies=fields.TEXT(stored=True))
     storage = RamStorage()
     idx = storage.create_index(schema)

     writer = idx.writer()
     writer.add_document(name=u'Frank', hobbies=u'baseball, basketball')
     writer.commit()
     # hobbies field: 2 tokens; name field: 1 token.
     self.assertEqual(idx.segments[0].field_length(0), 2)  # hobbies
     self.assertEqual(idx.segments[0].field_length(1), 1)  # name

     writer = idx.writer()
     # Second document deliberately has no hobbies value.
     writer.add_document(name=u'Jonny')
     writer.commit()
     self.assertEqual(len(idx.segments), 1)
     self.assertEqual(idx.segments[0].field_length(0), 2)  # hobbies
     self.assertEqual(idx.segments[0].field_length(1), 2)  # name

     searcher = Searcher(idx.reader())
     parser = qparser.MultifieldParser(['name', 'hobbies'], schema=schema)
     self.assertEqual(len(searcher.search(parser.parse(u"baseball"))), 1)
Beispiel #17
0
 def __init__(self,
              file_index: index.FileIndex,
              index_docnums: list = None,
              name: str = 'DB',
              ix_reader: IndexReader = None,
              content_field='body'):
     """Wrap a Whoosh index and precompute df/tf statistics for
     *content_field*.

     When *ix_reader* is supplied it is reused and not considered
     private to this object; otherwise a private reader is opened from
     *file_index*. When *index_docnums* is omitted, all document ids in
     the database are used.
     """
     self.name = name
     self.ix = file_index
     if ix_reader is None:
         self._reader = file_index.reader()
         self._private_reader = True
     else:
         self._reader = ix_reader
         self._private_reader = False
     self._searcher = Searcher(self._reader)
     self._docnums = (set(index_docnums) if index_docnums is not None
                      else self._get_all_db_ids())
     self._dfs, self._tfs, self._total_terms = self._build(content_field)
     self._tfidfs = defaultdict(float)
Beispiel #18
0
def match_search_query(searcher: Searcher, q: Query) -> bool:
    """Return True when query *q* matches at least one document."""
    results = searcher.search(q)
    return bool(results)
Beispiel #19
0
class WikiSearcher:
    """Search facade over a Whoosh wiki index with optional query
    expansion and PageRank-combined scoring."""

    # Class-level lookup tables. NOTE: __init__ assigns an *instance*
    # attribute also named `weighting` (the current scheme's name), so
    # this dict must always be accessed as WikiSearcher.weighting.
    weighting = {
        'TF_IDF': scoring.TF_IDF,
        'BM25F': scoring.BM25F,
        'FREQUENCY': scoring.Frequency,
    }

    group = {
        'OR': qparser.OrGroup,
        'AND': qparser.AndGroup,
    }

    base_url = 'https://en.wikipedia.org/wiki/'

    def __init__(self, index, page_ranker):
        """Build the query parser and searcher for the wiki index.

        A plain QueryParser plus the 'MultifieldPlugin' is used instead
        of a MultifieldParser, because the plugin lets the 'search'
        method adjust the per-field boosts afterwards.

        - MULTIFIELD: a query matches both the title and the text. The
          parser applies each field's own analyzer from the index
          schema, so with a stemmed text field and an unstemmed title,
          'fortified' parses to '(text:fortifi OR title:fortified)'.
        - GROUP: by default tokens are joined with 'AND'; 'OrGroup'
          joins them with OR, and its FACTORY form scores documents
          higher when a term occurs with higher frequency.

        :param index: the Whoosh index to search
        :param page_ranker: provides precomputed PageRank values
        """
        self.index = index

        self.page_ranker = page_ranker

        self.expand = Expander(disambiguate_fn='noun_sense')

        self.parser = qparser.QueryParser(None, index.schema)
        self.multifield_plugin = qparser.MultifieldPlugin(['text', 'title'])
        self.parser.add_plugin(self.multifield_plugin)

        self.weighting = 'BM25F'
        self.searcher = WhooshSearcher(
            reader=self.index.reader(),
            weighting=WikiSearcher.weighting[self.weighting])

    def search(self,
               text,
               limit=10,
               exp=True,
               page_rank=True,
               text_boost=1.0,
               title_boost=1.0,
               weighting='BM25F',
               group='AND'):
        """Run a search with the given parameters.

        The parser boosts/grouping and the weighting scheme are
        configured first, then the text is (optionally) expanded,
        parsed, and searched. When *page_rank* is true the final score
        of each document combines the retrieval score with its PageRank
        value and the results are re-sorted accordingly.

        :param text: free text to be parsed into a query
        :param limit: maximum number of documents returned
        :param exp: whether to apply query expansion
        :param page_rank: whether to fold PageRank into the score
        :param text_boost: boost for the text field
        :param title_boost: boost for the title field
        :param weighting: name of the weighting scheme (see `weighting`)
        :param group: how query tokens are combined ('AND' or 'OR')
        :returns: dict with timing, expansion info and result documents
        """
        self.multifield_plugin.boosts = {
            'text': text_boost,
            'title': title_boost
        }
        # BUG FIX: the fallback used to be the *string* 'AND', which is
        # not a valid grouping class; default to qparser.AndGroup.
        self.parser.group = WikiSearcher.group.get(group, qparser.AndGroup)

        text, list_token_expanded = self.expand(text) if exp else (text, None)
        query = self.parser.parse(text)

        if weighting != self.weighting:
            print('Imposto il searcher con il weighting : ' + weighting +
                  ' Può impiegare tempo ...')
            # BUG FIX: the fallback used to be the *string* 'BM25F'
            # instead of the scoring class scoring.BM25F.
            self.searcher = WhooshSearcher(
                reader=self.index.reader(),
                weighting=WikiSearcher.weighting.get(weighting,
                                                     scoring.BM25F))
            self.weighting = weighting
            print('Weighting impostato correttamente')

        res = {}
        results = self.searcher.search(query, limit=limit)

        res['time_second'] = results.runtime
        res['expanded'] = list_token_expanded if exp else []
        res['n_res'] = results.estimated_length()

        final_score_fn, values_page_rank = self.__combinedScore(
            page_rank, results)
        if page_rank:
            results = sorted(results, key=final_score_fn, reverse=True)

        res['docs'] = [{
            'link':
            WikiSearcher.base_url + result['title'].replace(" ", "_"),
            'title':
            result['title'],
            'highlight':
            result.highlights("text", top=2),
            'final_score':
            final_score_fn(result),
            'score':
            result.score,
            'page_rank':
            values_page_rank.get(result['id_page'], -1)
        } for result in results]

        return res

    def __combinedScore(self, page_rank, results):
        """Return (final_score_fn, pagerank_values).

        Fetches the precomputed PageRank values via the page_ranker's
        getRank and builds a closure that multiplies a hit's retrieval
        score by its PageRank value (defaulting to 1 when missing).

        :param page_rank: whether PageRank is used for the current query
        :param results: the search results
        """
        values_page_rank = {}
        if page_rank:
            values_page_rank = self.page_ranker.getRank(
                [res['id_page'] for res in results], 5)

        def final_score_fn(result):
            if page_rank:
                return result.score * values_page_rank.get(
                    result['id_page'], 1)
            return result.score

        return final_score_fn, values_page_rank

    def getFieldInfo(self, field):
        """Return debug info about *field*.

        :param field: the field to inspect
        :returns: dict with the field's total length
        """
        return {'length': self.searcher.field_length(field)}

    def getGeneralInfo(self):
        """Return general debug info about the index.

        :returns: dict with the index document count
        """
        return {'doc_count': self.searcher.doc_count()}
Beispiel #20
0
    def searcher(self, **kwargs):
        """Return a Searcher over a fresh reader for this index,
        forwarding all keyword arguments to the Searcher constructor."""
        # Deferred import — presumably avoids a circular import at
        # module load time (TODO confirm).
        from whoosh.searching import Searcher

        reader = self.reader()
        return Searcher(reader, **kwargs)
Beispiel #21
0
    def search(self,
               text,
               limit=10,
               exp=True,
               page_rank=True,
               text_boost=1.0,
               title_boost=1.0,
               weighting='BM25F',
               group='AND'):
        """Run a search with the given parameters.

        The parser boosts/grouping and the weighting scheme are
        configured first, then the text is (optionally) expanded,
        parsed, and searched. When *page_rank* is true the final score
        of each document combines the retrieval score with its PageRank
        value and the results are re-sorted accordingly.

        :param text: free text to be parsed into a query
        :param limit: maximum number of documents returned
        :param exp: whether to apply query expansion
        :param page_rank: whether to fold PageRank into the score
        :param text_boost: boost for the text field
        :param title_boost: boost for the title field
        :param weighting: name of the weighting scheme
        :param group: how query tokens are combined ('AND' or 'OR')
        :returns: dict with timing, expansion info and result documents
        """
        self.multifield_plugin.boosts = {
            'text': text_boost,
            'title': title_boost
        }
        # BUG FIX: the fallback used to be the *string* 'AND', which is
        # not a valid grouping class; fall back to the AND group class.
        self.parser.group = WikiSearcher.group.get(group,
                                                   WikiSearcher.group['AND'])

        text, list_token_expanded = self.expand(text) if exp else (text, None)
        query = self.parser.parse(text)

        if weighting != self.weighting:
            print('Imposto il searcher con il weighting : ' + weighting +
                  ' Può impiegare tempo ...')
            # BUG FIX: the fallback used to be the *string* 'BM25F'
            # instead of the BM25F scoring class.
            self.searcher = WhooshSearcher(
                reader=self.index.reader(),
                weighting=WikiSearcher.weighting.get(
                    weighting, WikiSearcher.weighting['BM25F']))
            self.weighting = weighting
            print('Weighting impostato correttamente')

        res = {}
        results = self.searcher.search(query, limit=limit)

        res['time_second'] = results.runtime
        res['expanded'] = list_token_expanded if exp else []
        res['n_res'] = results.estimated_length()

        final_score_fn, values_page_rank = self.__combinedScore(
            page_rank, results)
        if page_rank:
            results = sorted(results, key=final_score_fn, reverse=True)

        res['docs'] = [{
            'link':
            WikiSearcher.base_url + result['title'].replace(" ", "_"),
            'title':
            result['title'],
            'highlight':
            result.highlights("text", top=2),
            'final_score':
            final_score_fn(result),
            'score':
            result.score,
            'page_rank':
            values_page_rank.get(result['id_page'], -1)
        } for result in results]

        return res