Example #1
def filterPlaylists(playlist_pickle, index_dir, filterpickle):
    
    with open(playlist_pickle, 'rb') as f:  # pickle files must be opened in binary mode
        playlists = pickle.load(f)

    N = len(playlists)
    filtered_playlists = []

    index = whoosh.index.open_dir(index_dir)

    with index.searcher() as searcher:
        searcher.set_caching_policy(save=False)
        qa = whoosh.qparser.SimpleParser('artist', index.schema)
        qt = whoosh.qparser.SimpleParser('title', index.schema)
        
        for x in [qa, qt]:
            x.remove_plugin_class(whoosh.qparser.PlusMinusPlugin)

        for (i, P) in enumerate(playlists):
            filtered_playlists.append(filterThisPlaylist(P, searcher, qa, qt))
            if i % 10 == 0:
                print('%5d/%5d' % (i, N))

    with open(filterpickle, 'wb') as f:
        pickle.dump(filtered_playlists, f)

    print('%d playlists' % len(filtered_playlists))
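
`filterThisPlaylist` is not shown here; a minimal sketch of what such a helper might look like (hypothetical, assuming each playlist entry is an (artist, title) pair and the index stores a 'track_id' field):

def filterThisPlaylist(playlist, searcher, qa, qt):
    # Hypothetical helper: keep only the tracks that resolve in the index.
    filtered = []
    for (artist, title) in playlist:
        q = whoosh.query.And([qa.parse(artist), qt.parse(title)])
        hits = searcher.search(q, limit=1)
        if len(hits) > 0:
            filtered.append(hits[0]['track_id'])
    return filtered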
Example #2
 def __init__(self, model_class, primary, index):
     self.model_class = model_class
     self.primary = primary
     self.index = index
     self.searcher = index.searcher()
     fields = set(index.schema._fields.keys()) - set([self.primary])
     self.parser = MultifieldParser(list(fields), index.schema)
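
A sketch of how this wrapper is typically used (hypothetical, assuming the class is called Searcher and the primary key field is 'id'):

wrapper = Searcher(MyModel, 'id', index)
q = wrapper.parser.parse(u'hello world')
for hit in wrapper.searcher.search(q, limit=10):
    print(hit[wrapper.primary])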
Example #3
def search_tracks(index, title=None, artist=None, num_results=None):
    '''Search an MSD track index'''

    
    if artist:
        q_artist = whoosh.qparser.QueryParser('artist_name', index.schema).parse(artist)

    if title:
        q_title  = whoosh.qparser.QueryParser('title', index.schema).parse(title)

    # Merge the queries
    if title and artist:
        q = whoosh.query.And([q_artist, q_title])
    elif title:
        q = q_title
    elif artist:
        q = q_artist
    else:
        raise ValueError('Invalid query')

    with index.searcher() as search:
        return [dict(item) for item in search.search(q, limit=num_results)]

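A minimal usage sketch, assuming an MSD index directory with 'artist_name' and 'title' fields:

import whoosh.index
index = whoosh.index.open_dir('msd_index')  # hypothetical directory
for hit in search_tracks(index, title=u'under pressure', artist=u'queen', num_results=5):
    print(hit)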
Example #4
def search_snippet():
    query = request.args.get('query', None)
    page = int(request.args.get('page', 1))

    if query:
        qp = MultifieldParser(["title", "content", "tag"], schema=index.schema)
        q = qp.parse(query)
    else:
        q = Every()

    response = {"results": [], "total": 0}
    with index.searcher() as searcher:
        results = searcher.search_page(q,
                                       page,
                                       pagelen=config.SEARCH_PAGINATION,
                                       sortedby="title")
        for snippet in results:
            response["results"].append({
                'id': snippet['id'],
                'title': snippet['title']
            })
        response["total_snippets"] = len(results)
        response["total_pages"] = (len(results) -
                                   1) / config.SEARCH_PAGINATION + 1
        response["current_page"] = page
        return json.dumps(response)

Example #5
    def search(cls,
               search_string,
               values_of='',
               group=whoosh.qparser.OrGroup,
               match_substrings=True,
               limit=None):
        """Searches the fields for given search_string.
        Returns the found records if 'values_of' is left empty,
        else the values of the given columns.

        :param search_string: The string to search for.
        :param values_of: If given, the method will not return the whole
                          records, but only values of given column.
                          Defaults to returning whole records.
        :param group: The whoosh group to use for searching.
                      Defaults to :class:`whoosh.qparser.OrGroup` which
                      searches for all words in all columns.
        :param match_substrings: ``True`` if you want to match substrings,
                                 ``False`` otherwise.
        :param limit: The number of the top records to be returned.
                      Defaults to ``None`` and returns all records.
        """
        index = Whooshee.get_or_create_index(_get_app(cls), cls)
        prepped_string = cls.prep_search_string(search_string,
                                                match_substrings)
        with index.searcher() as searcher:
            parser = whoosh.qparser.MultifieldParser(cls.schema.names(),
                                                     index.schema,
                                                     group=group)
            query = parser.parse(prepped_string)
            results = searcher.search(query, limit=limit)
            if values_of:
                return [x[values_of] for x in results]
            return results
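
Note that the returned `Results` object is created inside the `with index.searcher()` block, so the underlying reader is closed by the time the caller iterates it. A sketch of one workaround (an assumption about intent, not the library's own fix) is to materialize the hits before the searcher closes:

with index.searcher() as searcher:
    results = searcher.search(query, limit=limit)
    if values_of:
        return [x[values_of] for x in results]
    return [dict(x) for x in results]  # copy stored fields while the reader is open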
Example #6
 def get_tags_for_filepath(self, filepath):
     index = self.open_index()
     with index.searcher() as searcher:
         result = searcher.document(filepath=filepath)
     if result:
         return result['tags']
     return None
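
`searcher.document()` returns a plain dict of stored fields (or None), so unlike a `Results` object it stays valid after the `with` block closes the searcher. A usage sketch (hypothetical path, `store` being an instance of this class):

tags = store.get_tags_for_filepath(u'/notes/todo.txt')
if tags is not None:
    print(tags)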
Example #7
 def result_iter():
     with fasteners.InterProcessLock(index_lock_path):
         with index.searcher() as searcher:
             yield from searcher.search(
                 query,
                 limit=None if args.limit < 1 else args.limit,
                 sortedby='timestamp')
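
The searcher and the inter-process lock stay open across the `yield from`, so results must be consumed before the generator is discarded. A usage sketch (assuming a 'timestamp' stored field, per the sort key above):

for hit in result_iter():
    print(hit['timestamp'])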
Example #8
    def kmeans(self, query, index):
        # creating document collection
        # the collection is like ((doc1: (term1: 4),(term2: 5)),(doc2(...)),...)
        doc_collection = dict()
        with index.searcher() as s:
            # Returns a generator of the document numbers for all documents
            for i in s.document_numbers():
                doc_item = list()
                v = s.vector(i, "content")
                for term_item in v.items_as("frequency"):
                    doc_item.append(term_item)
                doc_collection[i] = doc_item

            #for i in doc_collection.items():
            #print i

        # Creating document collection only with the query terms' frequency
        doc_query_collection = dict()

        # Parsing the query
        # Returns a set of all terms in this query tree.
        for q_item in query.all_terms(phrases=True):
            #first_elts = [x[1] for x in doc_collection.items()]
            for doc in doc_collection:
                print(doc)
            #for doc in doc_collection.items():
            ## get the value of the key as q_item,
            ## if it does not exist, return 0
            #if q_item in doc[0]:
            #print q_item
##tf = doc.values()[1]
##doc_query_collection[doc] = tf

        for i in doc_query_collection.items():
            print(i)
Example #11
def __parse_execute(artifact, schema, index, include_filepath):
    """ Execute the search for the given commit

    :param artifact: the (file name, artifact) tuple to search for
    :param schema: the search schema to use
    :param index: the search index to use
    :param include_filepath: indicator of whether to take the 'file name' part of the artifact into account
    :return: a match list of tuples (file name, artifact, message ID)
    """

    log.devinfo("Searching for artifact ({}, {})...".format(
        artifact[0], artifact[1]))

    result = []

    with index.searcher() as searcher:
        # initialize query parser
        query_parser = QueryParser("content", schema=schema)

        # construct query
        if include_filepath:
            my_query = query_parser.parse(artifact[0] + " AND " + artifact[1])
        else:
            my_query = query_parser.parse(artifact[1])

        # search!
        query_result = searcher.search(my_query, terms=True)

        # construct result from query answer
        for r in query_result:
            result_tuple = (artifact[0], artifact[1], r["messageID"])
            result.append(result_tuple)

    return result
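
A usage sketch (hypothetical artifact and field values, matching the docstring above):

matches = __parse_execute(('src/main.c', 'init_board'), schema, index, include_filepath=True)
for (file_name, artifact_name, message_id) in matches:
    print(file_name, artifact_name, message_id)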
Example #12
    def search(self, trans, search_term, page, page_size, boosts):
        """
        Perform the search on the given search_term

        :param search_term: unicode encoded string with the search term(s)

        :returns results: dictionary containing number of hits, hits themselves and matched terms for each
        """
        tool_index_dir = os.path.join(trans.app.config.whoosh_index_dir, 'tools')
        index_exists = whoosh.index.exists_in(tool_index_dir)
        if index_exists:
            index = whoosh.index.open_dir(tool_index_dir)
            try:
                # Some literature about BM25F:
                # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
                # http://en.wikipedia.org/wiki/Okapi_BM25
                # Basically, the higher the number, the bigger the weight.
                tool_weighting = scoring.BM25F(field_B={
                                               'name_B' : boosts.tool_name_boost,
                                               'description_B' : boosts.tool_description_boost,
                                               'help_B' : boosts.tool_help_boost,
                                               'repo_owner_username_B' : boosts.tool_repo_owner_username_boost})
                searcher = index.searcher(weighting=tool_weighting)

                parser = MultifieldParser([
                    'name',
                    'description',
                    'help',
                    'repo_owner_username'], schema=schema)

                user_query = parser.parse('*' + search_term + '*')

                try:
                    hits = searcher.search_page(user_query, page, pagelen=page_size, terms=True)
                except ValueError:
                    raise ObjectNotFound('The requested page does not exist.')

                log.debug('searching tools for: #' + str(search_term))
                log.debug('total hits: ' + str(len(hits)))
                log.debug('scored hits: ' + str(hits.scored_length()))
                results = {}
                results['total_results'] = str(len(hits))
                results['page'] = str(page)
                results['page_size'] = str(page_size)
                results['hits'] = []
                for hit in hits:
                    hit_dict = {}
                    hit_dict['id'] = hit.get('id')
                    hit_dict['repo_owner_username'] = hit.get('repo_owner_username')
                    hit_dict['repo_name'] = hit.get('repo_name')
                    hit_dict['name'] = hit.get('name')
                    hit_dict['description'] = hit.get('description')
                    matched_terms = {k: unicodify(v) for k, v in hit.matched_terms()}
                    results['hits'].append({'tool': hit_dict, 'matched_terms': matched_terms, 'score': hit.score})
                return results
            finally:
                searcher.close()
        else:
            raise exceptions.InternalServerError('The search index file is missing.')
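
In the Whoosh releases I know of, `BM25F.__init__` takes per-field B values as keyword arguments of the form `<fieldname>_B` (e.g. `name_B=0.9`) rather than a `field_B` dict, so the dict form above may be silently ignored; this is an assumption about the version in use. A sketch of the keyword form:

from whoosh import scoring
# Hypothetical per-field boosts passed directly as <fieldname>_B keywords:
tool_weighting = scoring.BM25F(B=0.75, K1=1.2, name_B=0.9, help_B=0.4)
searcher = index.searcher(weighting=tool_weighting)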
Example #13
    def get_searcher(self):
        """ Returns a reference to the index searcher, creating if necssary. """
        if self.index_searcher:
            return self.index_searcher

        index = self.get_index()
        self.index_searcher = index.searcher()
        return self.index_searcher
Example #14
 def __init__(self, mapping, primary, index, session=None):
     self.mapping = mapping
     self.primary = primary
     self.index = index
     self.session = session
     self.searcher = index.searcher()
     fields = set(index.schema._fields.keys()) - set([self.primary])
     self.parser = MultifieldParser(list(fields), index.schema)
Example #15
def search_db(storage, schema):
    index = storage.open_index(schema=schema)
    searcher = index.searcher()
    parser = QueryParser(content_field_name, schema=schema)
    parsed_query = parser.parse('2020')
    raw_results = searcher.search(parsed_query)
    for hit in raw_results:
        print(hit.highlights(content_field_name))
    searcher.close()
Example #16
 def search_documents(self, query, field="tags"):
     index = self.open_index()
     qp = QueryParser(field, schema=index.schema)
     q = qp.parse(query)  # str is already unicode in Python 3
     with index.searcher() as searcher:
         results = searcher.search(q, limit=None)
         new_results = [hit.fields() for hit in results]
     return new_results
Example #19
    def search( self, trans, search_term, page, page_size, boosts ):
        """
        Perform the search on the given search_term

        :param search_term: unicode encoded string with the search term(s)

        :returns results: dictionary containing number of hits, hits themselves and matched terms for each
        """
        tool_index_dir = os.path.join( trans.app.config.whoosh_index_dir, 'tools' )
        index_exists = whoosh.index.exists_in( tool_index_dir )
        if index_exists:
            index = whoosh.index.open_dir( tool_index_dir )
            try:
                # Some literature about BM25F:
                # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
                # http://en.wikipedia.org/wiki/Okapi_BM25
                # Basically, the higher the number, the bigger the weight.
                tool_weighting = scoring.BM25F( field_B={
                                                'name_B' : boosts.tool_name_boost,
                                                'description_B' : boosts.tool_description_boost,
                                                'help_B' : boosts.tool_help_boost,
                                                'repo_owner_username_B' : boosts.tool_repo_owner_username_boost } )
                searcher = index.searcher( weighting=tool_weighting )

                parser = MultifieldParser( [
                    'name',
                    'description',
                    'help',
                    'repo_owner_username' ], schema=tool_schema )

                user_query = parser.parse( '*' + search_term + '*' )

                try:
                    hits = searcher.search_page( user_query, page, pagelen=page_size, terms=True )
                except ValueError:
                    raise ObjectNotFound( 'The requested page does not exist.' )

                log.debug( 'searching tools for: #' + str( search_term ) )
                log.debug( 'total hits: ' + str( len( hits ) ) )
                log.debug( 'scored hits: ' + str( hits.scored_length() ) )
                results = {}
                results[ 'total_results'] = str( len( hits ) )
                results[ 'page'] = str( page )
                results[ 'page_size'] = str( page_size )
                results[ 'hits' ] = []
                for hit in hits:
                    hit_dict = {}
                    hit_dict[ 'id' ] = hit.get( 'id' )
                    hit_dict[ 'repo_owner_username' ] = hit.get( 'repo_owner_username' )
                    hit_dict[ 'repo_name' ] = hit.get( 'repo_name' )
                    hit_dict[ 'name' ] = hit.get( 'name' )
                    hit_dict[ 'description' ] = hit.get( 'description' )
                    results[ 'hits' ].append( {'tool': hit_dict, 'matched_terms': hit.matched_terms(), 'score': hit.score } )
                return results
            finally:
                searcher.close()
        else:
            raise exceptions.InternalServerError( 'The search index file is missing.' )
Example #20
 def search_page(query,
                 indexname=None,
                 page=1,
                 pagelen=20,
                 fields=None,
                 schema=None):
     index = Index.get_index(indexname=indexname, schema=schema)
     q = Index.build_query(index, query, indexname=indexname, fields=fields)
     return index.searcher().search_page(q, page, pagelen=pagelen)
Example #21
    def query(self, query_dict=None, group_by=None):
        """ Perform a query against an index. 

            query_dict: {
                '<field name>' : ['<value>', '<value>', ...],
                '<field name>' : '<value>',
                ....
            }
            For example:
            {
                'platforms': ['Microsoft Windows NT', 'Microsoft Windows 2000'], 
                'products': 'mozilla', 
                'contributors': 'Jonathan Baker'
            }
        """
        # avoid mutable default arguments
        query_dict = query_dict or {}
        group_by = group_by or []

        # update the index
        self.update()

        # construct query by looping through schema fields and adding terms
        query_fields = []
        for field in self.get_fieldnames():
            if field in query_dict and query_dict[field]:
                # get field values as list
                values = query_dict[field]
                if isinstance(values, str):
                    values = [values]
                
                # get a whoosh.query.Term for each value
                field_values = []
                for value in values:
                    field_values.append(whoosh.query.Term(field, self.whoosh_escape(value)))

                # OR field values together and add to query_fields list
                query_fields.append(whoosh.query.Or(field_values))

        if query_fields:
            # create query by ANDing query_fields together
            query = query_fields[0] if len(query_fields) == 1 else whoosh.query.And(query_fields)
            #this.message('debug','parsed whoosh query:\n\t{0}'.format(repr(query)))
        else:
            query = whoosh.query.Every()

        # assemble query args
        query_kwargs = { 'scored': False, 'sortedby': False, 'terms': False }
        if group_by:
            query_kwargs['groupedby'] = group_by
            query_kwargs['maptype'] = whoosh.sorting.Count

        # run query against index
        index = self.get_index()
        with index.searcher() as index_searcher:
            results = index_searcher.search(query, **query_kwargs)
            if group_by:
                return results.groups().copy()
            else:
                return [result.fields() for result in results]
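
A usage sketch mirroring the docstring example (hypothetical instance `idx`):

hits = idx.query({'products': 'mozilla', 'contributors': 'Jonathan Baker'})
counts_by_platform = idx.query({'products': 'mozilla'}, group_by=['platforms'])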
Example #22
def query(query_string): # pragma: no cover
    """Searches the index and yields matching item_ids.

    """
    with index.searcher() as searcher:
        q = parser.parse(query_string)
        results = searcher.search(q)

        for hit in results:
            yield hit.fields()
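
`index` and `parser` are module-level globals here; a sketch of the setup this generator assumes (hypothetical directory and field names):

import whoosh.index
import whoosh.qparser

index = whoosh.index.open_dir('indexdir')
parser = whoosh.qparser.QueryParser('content', index.schema)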
Example #23
def search_artists(index, artist=None, num_results=None):
    '''Search an MSD artist index'''

    q = whoosh.qparser.QueryParser('artist_name', index.schema).parse(artist)

    with index.searcher() as search:
        return [dict(item) for item in search.search(q, limit=num_results)]

Example #24
def search_artists(index, name, num_results=None):

    if isinstance(name, bytes):
        name = name.decode('utf-8', errors='ignore')

    q = whoosh.qparser.QueryParser('artist_name', index.schema).parse(name)

    with index.searcher() as search:
        return [(item.score, dict(item)) for item in search.search(q, limit=num_results)]

Example #25
def key_terms(storage, schema):
    index = storage.open_index(schema=schema)
    ixreader = index.reader()
    searcher = index.searcher()
    docnums = []
    KEY_LEN = 500
    DOC_LEN = 1000
    for id in range(DOC_LEN):
        docnums.append(id)
    #for id in ixreader.all_doc_ids():
    #    print id,
    terms = {}
    i = 0
    for term, score in searcher.key_terms(docnums, content_field_name,
                                          KEY_LEN):
        terms[term] = i
        i += 1
    print('key_terms finished')

    ar = np.zeros((len(docnums), KEY_LEN))
    for i in range(DOC_LEN):
        term_weights = ixreader.vector_as("weight", i, content_field_name)
        all_weight = 0
        n = 0
        for term, weight in term_weights:
            if term in terms:
                ar[i][terms[term]] = weight
                all_weight += weight
                n += 1
        if all_weight:
            for j in range(KEY_LEN):
                # normalize by the accumulated weight of the matched key terms;
                # the original divided by the loop variable `weight`, which only
                # holds the last term's weight and looks like a bug
                ar[i][j] = ar[i][j] / all_weight

    u, s, v = lin.svd(ar, full_matrices=False)
    data = u[:, 0:100]
    print('svd finished')

    k = KMeans(init='k-means++', n_init=10)
    k.fit(data)
    #centroids = k.cluster_centers_
    labels = k.labels_
    print('kmeans finished')

    #af = AffinityPropagation(affinity="euclidean").fit(data)
    #cluster_centers_indices = af.cluster_centers_indices_
    #labels = af.labels_

    doc_arr = np.array(range(DOC_LEN))
    for i in range(np.max(labels) + 1):  # include the highest cluster label
        print('group:', (i + 1))
        for doc_num in doc_arr[labels == i]:
            print(ixreader.stored_fields(doc_num).get('id'),
                  ixreader.stored_fields(doc_num).get('title').split('|')[0] + '/',
                  end=' ')
        print('\n')
Example #26
def search_index_no_page(index, query, index_name, limits=None, filter=None):
    result = []
    try:
        search_field = {"call": ["name"]}
        searcher = index.searcher()
        mparser = MultifieldParser(search_field[index_name],
                                   schema=index.schema)
        q = mparser.parse(query)
        result = searcher.search(q, filter=filter, limit=limits)
    except Exception as e:
        LOG.exception(e)
        result = False
    return result
Example #27
 def test_tamilprefix(self):
     f = open(
         '/home/nanditha/projects/tamilthedal/trunk/src/encyclopedia/utilities/pyunitwildtext'
     )
     cont = f.readline()
     text = cont.split(':')
     index = open_dir(settings.INDEX_PATH)
     wildtext = text[0] + u'*'
     qp = query.Wildcard("content", wildtext)
     srch = index.searcher()
     res = srch.search(qp)
     self.assertNotEqual(len(res), 0)
     print(len(res), 'results')
Example #29
    def search_for_class(self, query, cls, limit=50, filter=None):
        index = self.indexes[cls.__name__]
        searcher = index.searcher()
        fields = set(index.schema._fields.keys()) - set(["uid"])
        parser = MultifieldParser(list(fields), index.schema)

        facets = sorting.Facets()
        facets.add_field("language")
        facets.add_field("mime_type")
        facets.add_field("creator")
        facets.add_field("owner")

        results = searcher.search(parser.parse(query), groupedby=facets, limit=limit, filter=filter)
        return results
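
The grouped results can then be read back per facet with `Results.groups()`; a usage sketch (hypothetical query and class, `engine` being an instance of this class):

results = engine.search_for_class(u'annual report', Document)
for language, docnums in results.groups('language').items():
    print(language, len(docnums))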
Example #30
 def query(self, query_string: str) -> List[papis.document.Document]:
     self.logger.debug('Query string %s' % query_string)
     index = self.get_index()
     qp = whoosh.qparser.MultifieldParser(['title', 'author', 'tags'],
                                          schema=self.get_schema())
     qp.add_plugin(whoosh.qparser.FuzzyTermPlugin())
     query = qp.parse(query_string)
     with index.searcher() as searcher:
         results = searcher.search(query, limit=None)
         self.logger.debug(results)
         documents = [
             papis.document.from_folder(r.get(Database.get_id_key()))
             for r in results]
     return documents
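
`FuzzyTermPlugin` enables Whoosh's trailing-tilde syntax inside query strings; a usage sketch (hypothetical terms, `db` being an instance of this class):

docs = db.query('title:whale~')      # allow one edit on 'whale'
docs = db.query('author:melvile~2')  # allow up to two edits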
Example #31
def key_all():
    index = storage.open_index(schema=schema)
    searcher = index.searcher()
    reader = searcher.reader()
    filename = 'idf.txt'
    accepted_chars = re.compile(u"[\u4E00-\u9FA5]+", re.UNICODE)
    file = codecs.open(filename, "w", "utf-8")
    file.write('%s\n' % reader.doc_count_all() )
    for term in reader.field_terms('content'):
    #for term in reader.most_frequent_terms('content', 100):
        if not accepted_chars.match(term):
            continue
        term_info = reader.term_info('content', term)
        file.write('%s %d %d\n' % (term, term_info.doc_frequency(), term_info.max_weight()) )
    file.close()
Example #33
def search(query, page=1, per_page=20):
    with index.searcher() as s:
        qp = qparser.MultifieldParser(['title', 'content'], index.schema)
        q = qp.parse(query)  # str is already unicode in Python 3
        try:
            result_page = s.search_page(q, page, pagelen=per_page)
        except ValueError:
            if page == 1:
                return SearchResultPage(None, page)
            return None
        results = result_page.results
        results.highlighter.fragmenter.maxchars = 512
        results.highlighter.fragmenter.surround = 40
        results.highlighter.formatter = highlight.HtmlFormatter('em',
            classname='search-match', termclass='search-term',
            between=u'<span class=ellipsis> … </span>')
        return SearchResultPage(result_page, page)
Example #34
def filterPlaylists(playlist_dir, index_dir, filterpickle):
    
    filtered_playlists = []

    index = whoosh.index.open_dir(index_dir)

    with index.searcher() as searcher:
        qa = whoosh.qparser.QueryParser('artist', index.schema)
        qt = whoosh.qparser.QueryParser('title', index.schema)
        for (i, P) in enumerate(playlistGenerator(playlist_dir)):
            filtered_playlists.append(filterThisPlaylist(P, searcher, qa, qt))
            print('%6d' % i)

    with open(filterpickle, 'wb') as f:
        pickle.dump(filtered_playlists, f)

    print('%d playlists' % len(filtered_playlists))
Example #35
def get_results(_agency, _year, _content_type, user_query):
    global index
    with index.searcher() as searcher:
        agency_in = _agency
        pub_year_in = _year
        content_type_in = _content_type

        user_filter = build_filters(agency_in=agency_in,
                                    content_type_in=content_type_in,
                                    pub_year_in=pub_year_in)

        query = qp.parse(user_query)
        results = searcher.search(query, filter=user_filter)
        new_list = []
        for r in results:
            new_list.append(dict(r))

        return new_list
Example #37
def search_snippet():
    query = request.args.get('query', None)
    page = int(request.args.get('page', 1))

    if query:
        qp = MultifieldParser(["title", "content", "tag"], schema=index.schema)
        q = qp.parse(query)
    else:
        q = Every()

    response = {"results":[], "total": 0}
    with index.searcher() as searcher:
        results = searcher.search_page(q, page, pagelen=config.SEARCH_PAGINATION, sortedby="title")
        for snippet in results:
            response["results"].append({'id': snippet['id'], 'title': snippet['title']})
        response["total"] = len(results)
        return json.dumps(response)

Example #38
def search_db():
    index = storage.open_index(schema=schema)
    searcher = index.searcher()
    parser = QueryParser(index_fieldname, schema=schema)
    parsed_query = parser.parse('%s:%s' % ('id', qq_id))
    raw_results = searcher.search(parsed_query)

    _, names = get_content('qq7')

    corpus_filename = 'name_qq7'
    terms = {}
    corpus_file = codecs.open(corpus_filename, "r", "utf-8")
    for line in corpus_file:
        tokens = line.split(" ")
        term = tokens[0].strip()
        frequency = int(tokens[1].strip())
        terms[term] = frequency

    n_terms = {}
    corpus_filename = 'dict.txt'
    corpus_file = codecs.open(corpus_filename, "r", "utf-8")
    for line in corpus_file:
        tokens = line.split(" ")
        term = tokens[0].strip()
        if len(tokens) >= 3 and tokens[2].find('n') >= 0:
            n_terms[term] = tokens[2]
        else:
            n_terms[term] = ''
    keys = []
    for keyword, score in raw_results.key_terms(index_fieldname,
                                                docs=1000,
                                                numterms=240):
        if keyword in names:
            continue
        if keyword in terms:
            if keyword not in n_terms:
                keys.append(keyword)
            elif 'n' in n_terms[keyword]:
                keys.append(keyword)
        #keys.append(keyword)
    print(', '.join(keys))
Example #40
    def search(query,
               indexname=None,
               fields=None,
               limit=10,
               discriminators=None,
               exclude_discriminators=None,
               schema=None):
        index = Index.get_index(indexname=indexname, schema=schema)

        if exclude_discriminators and discriminators:
            q = And([
                Index.build_query(index,
                                  query,
                                  indexname=indexname,
                                  fields=fields),
                Not(
                    Or([
                        Term("discriminator", discriminator)
                        for discriminator in discriminators
                    ]))
            ])
        elif discriminators:
            q = And([
                Index.build_query(index,
                                  query,
                                  indexname=indexname,
                                  fields=fields),
                Or([
                    Term("discriminator", discriminator)
                    for discriminator in discriminators
                ])
            ])
        else:
            q = Index.build_query(index,
                                  query,
                                  indexname=indexname,
                                  fields=fields)

        return index.searcher().search(q, limit=limit)
Example #41
def search(qstring, index_folder="index_fullname"):
    index = whoosh.index.open_dir(os.path.join(dirname, index_folder))
    schema = index.schema
    qp = MultifieldParser(["fullname"], schema=schema)
    q = qp.parse(qstring)  # str is already unicode in Python 3
    with index.searcher() as searcher:
        searchResult = searcher.search(q, limit=20)
        # result = {r["fullname"] for r in searchResult}
        ids = [{
            "rank": index_rank,
            "id": r["id"]
        } for index_rank, r in enumerate(searchResult)]
        # ids = {r for r in searchResult}
        corrector = searcher.corrector("fullname")
        suggestions = []
        if len(ids) == 0:
            suggestions = corrector.suggest(qstring, limit=6)
        # suggestionResults = {s["fullname"] for suggest in suggestions for s in searcher.search(qp.parse(unicode(suggest)), limit=5)}
        # result = result.union(suggestionResults)
        # ids_suggestion = [s["id"] for suggest in suggestions for s in searcher.search(qp.parse(unicode(suggest)), limit=5)]
        # ids_suggestion = {s for suggest in suggestions for s in searcher.search(qp.parse(unicode(suggest)), limit=5)}
        # ids = ids+ids_suggestion

    return {"ids": list(ids), "suggestions": suggestions}
Example #42
    def search( self, trans, search_term, page, page_size, boosts ):
        """
        Perform the search on the given search_term

        :param search_term: unicode encoded string with the search term(s)
        :param boosts: namedtuple containing custom boosts for searchfields, see api/repositories.py

        :returns results: dictionary containing number of hits, hits themselves and matched terms for each
        """
        whoosh_index_dir = trans.app.config.whoosh_index_dir
        index_exists = whoosh.index.exists_in( whoosh_index_dir )
        if index_exists:
            index = whoosh.index.open_dir( whoosh_index_dir )
            try:
                # Some literature about BM25F:
                # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
                # http://en.wikipedia.org/wiki/Okapi_BM25
                # Basically, the higher the number, the bigger the weight.
                repo_weighting = RepoWeighting( field_B={ 'name_B' : boosts.repo_name_boost,
                                                          'description_B' : boosts.repo_description_boost,
                                                          'long_description_B' : boosts.repo_long_description_boost,
                                                          'homepage_url_B' : boosts.repo_homepage_url_boost,
                                                          'remote_repository_url_B' : boosts.repo_remote_repository_url_boost,
                                                          'repo_owner_username' : boosts.repo_owner_username_boost } )

                searcher = index.searcher( weighting=repo_weighting )

                parser = MultifieldParser( [
                    'name',
                    'description',
                    'long_description',
                    'homepage_url',
                    'remote_repository_url',
                    'repo_owner_username' ], schema=schema )

                user_query = parser.parse( '*' + search_term + '*' )

                try:
                    hits = searcher.search_page( user_query, page, pagelen=page_size, terms=True )
                except ValueError:
                    raise ObjectNotFound( 'The requested page does not exist.' )

                log.debug( 'searching for: #' + str( search_term ) )
                log.debug( 'total hits: ' + str( len( hits ) ) )
                log.debug( 'scored hits: ' + str( hits.scored_length() ) )
                results = {}
                results[ 'total_results'] = str( len( hits ) )
                results[ 'page'] = str( page )
                results[ 'page_size'] = str( page_size )
                results[ 'hits' ] = []
                for hit in hits:
                    hit_dict = {}
                    hit_dict[ 'id' ] = trans.security.encode_id( hit.get( 'id' ) )
                    hit_dict[ 'repo_owner_username' ] = hit.get( 'repo_owner_username' )
                    hit_dict[ 'name' ] = hit.get( 'name' )
                    hit_dict[ 'long_description' ] = hit.get( 'long_description' )
                    hit_dict[ 'remote_repository_url' ] = hit.get( 'remote_repository_url' )
                    hit_dict[ 'homepage_url' ] = hit.get( 'homepage_url' )
                    hit_dict[ 'description' ] = hit.get( 'description' )
                    hit_dict[ 'last_updated' ] = hit.get( 'last_updated' )
                    hit_dict[ 'full_last_updated' ] = hit.get( 'full_last_updated' )
                    hit_dict[ 'approved' ] = hit.get( 'approved' )
                    hit_dict[ 'times_downloaded' ] = hit.get( 'times_downloaded' )
                    results[ 'hits' ].append( {'repository': hit_dict, 'matched_terms': hit.matched_terms(), 'score': hit.score } )
                return results
            finally:
                searcher.close()
        else:
            raise exceptions.InternalServerError( 'The search index file is missing.' )
Example #43
        searcher - whoosh searcher (create with index.searcher() then close it yourself)
        schema - whoosh schema (index.schema)
        artist - Artist name
        title - Song name
        threshold - Score threshold for a match
    Output:
        best_match - best match for the search, or None if no match
    '''
    arparser = whoosh.qparser.QueryParser('artist', schema)
    tiparser = whoosh.qparser.QueryParser('title', schema)
    q = whoosh.query.And([arparser.parse(artist), tiparser.parse(title)])  # assumes str (unicode) inputs
    results = searcher.search(q)
    result = None
    
    if len(results) > 0:
        r = results[0]
        if r.score > threshold:
            result = [r['track_id'], r['artist'], r['title']]
    
    return result


if __name__ == '__main__':
    import os
    if not os.path.exists('Whoosh Indices/cal10k_index/'):
        create_index('Whoosh Indices/cal10k_index/', get_cal10k_list('File Lists/EchoNestTrackIDs.tab') )
    index = get_whoosh_index('Whoosh Indices/cal10k_index/')
    print(search(index.searcher(), index.schema, 'queen', 'under pressure'))

Example #44
#!/usr/bin/env python

import whoosh, whoosh.index, whoosh.qparser
import sys
import pprint

if __name__ == '__main__':
    
    index = whoosh.index.open_dir(sys.argv[1])

    with index.searcher() as search:
        q = whoosh.qparser.MultifieldParser(['title', 'artist'], index.schema).parse(' '.join(sys.argv[2:]))
        results = search.search(q, terms=True)
        for r in results:
            pprint.pprint(r)
            pprint.pprint(r.matched_terms())
            print('---')
Example #45
# qry_flaclist_index.py
#
# Query the index built by mk_flaclist_index.py
# 2014-03-14 Dan Ellis [email protected]

# Querying
import whoosh, whoosh.index, whoosh.qparser
indexdir = 'WCDindexdir'
index = whoosh.index.open_dir(indexdir)
search = index.searcher()

arparser = whoosh.qparser.QueryParser('artist', index.schema)
alparser = whoosh.qparser.QueryParser('album', index.schema)
tiparser = whoosh.qparser.QueryParser('title', index.schema)

# One example query
#artist = u'Darrell Scott'
#album = u'Transatlantic Sessions - Series 3: Volume One'
#title = u'Shattered Cross'
#
#qry = whoosh.query.And([arparser.parse(artist), alparser.parse(album), tiparser.parse(title)])
#results = search.search(qry)
#
#if len(results) == 0:
#    # drop the album
#    qry = whoosh.query.And([arparser.parse(artist), tiparser.parse(title)])
#    results = search.search(qry)
#
#import pprint
#for r in results:
#    pprint.pprint(r)
Example #46
    def search(self, trans, search_term, page, **kwd):
        """
        Perform the search on the given search_term

        :param search_term: unicode encoded string with the search term(s)

        :returns results: dictionary containing number of hits, hits themselves and matched terms for each
        """
        if search_ready:
            toolshed_whoosh_index_dir = trans.app.config.toolshed_whoosh_index_dir
            index_exists = whoosh.index.exists_in(toolshed_whoosh_index_dir)
            if index_exists:
                index = whoosh.index.open_dir(toolshed_whoosh_index_dir)
                try:
                    # Some literature about BM25F:
                    # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
                    # http://en.wikipedia.org/wiki/Okapi_BM25
                    # Basically, the higher the number, the bigger the weight.
                    repo_weighting = RepoWeighting(
                        field_B={
                            'name_B': 0.9,
                            'description_B': 0.6,
                            'long_description_B': 0.5,
                            'homepage_url_B': 0.3,
                            'remote_repository_url_B': 0.2,
                            'repo_owner_username': 0.3
                        })

                    searcher = index.searcher(weighting=repo_weighting)

                    parser = MultifieldParser([
                        'name', 'description', 'long_description',
                        'homepage_url', 'remote_repository_url',
                        'repo_owner_username'
                    ],
                                              schema=schema)

                    user_query = parser.parse('*' + search_term + '*')
                    hits = searcher.search_page(user_query,
                                                page,
                                                pagelen=10,
                                                terms=True)

                    log.debug('searching for: #' + str(search_term))
                    log.debug('total hits: ' + str(len(hits)))
                    log.debug('scored hits: ' + str(hits.scored_length()))
                    results = {}
                    results['total_results'] = str(len(hits))
                    results['hits'] = []
                    for hit in hits:
                        hit_dict = {}
                        hit_dict['id'] = trans.security.encode_id(
                            hit.get('id'))
                        hit_dict['repo_owner_username'] = hit.get(
                            'repo_owner_username')
                        hit_dict['name'] = hit.get('name')
                        hit_dict['long_description'] = hit.get(
                            'long_description')
                        hit_dict['remote_repository_url'] = hit.get(
                            'remote_repository_url')
                        hit_dict['homepage_url'] = hit.get('homepage_url')
                        hit_dict['description'] = hit.get('description')
                        hit_dict['last_updated'] = hit.get('last_updated')
                        hit_dict['full_last_updated'] = hit.get(
                            'full_last_updated')
                        hit_dict['approved'] = hit.get('approved')
                        hit_dict['times_downloaded'] = hit.get(
                            'times_downloaded')
                        results['hits'].append({
                            'repository':
                            hit_dict,
                            'matched_terms':
                            hit.matched_terms(),
                            'score':
                            hit.score
                        })
                    return results
                finally:
                    searcher.close()
            else:
                raise exceptions.InternalServerError(
                    'The search index file is missing.')
        else:
            raise exceptions.InternalServerError(
                'Could not initialize search.')
Example #47
 def __init__(self, primary, index):
     self.primary_key_name = primary
     self._index = index
     self.searcher = index.searcher()
     self._all_fields = list(set(index.schema._fields.keys()) -
                             set([self.primary_key_name]))
Example #49
from whoosh import columns, fields, index, sorting
from whoosh.qparser import QueryParser

# ix = index.open_dir("./")
# facet = sorting.FieldFacet("id", reverse=True)
# searcher = ix.searcher()
#
# searchwords = "新西兰"
# qp = QueryParser("gtitle", schema=ix.schema)
# q = qp.parse(searchwords)
# results = searcher.search(q, sortedby=facet)
# for each in results:
#     print(each)

from whoosh.qparser import QueryParser
from whoosh.index import open_dir
from whoosh.sorting import FieldFacet

new_list = []
index = open_dir("./index/", indexname='goods')  # 读取建立好的索引
with index.searcher() as searcher:
    parser = QueryParser("gtitle", index.schema)  # 要搜索的项目,比如“phone_name
    myquery = parser.parse("鸭蛋")
    facet = FieldFacet("id", reverse=True)  # 按序排列搜索结果
    results = searcher.search(
        myquery, limit=None, sortedby=facet)  # limit为搜索结果的限制,默认为10,详见博客开头的官方文档
    for result1 in results:
        print(dict(result1))
        new_list.append(dict(result1))
Example #50
def itersearchindex(index_or_dirname, query, limit, pagenum, pagelen,
                    indexname, docnum_field, score_field, fieldboosts,
                    search_kwargs):
    import whoosh.index
    import whoosh.query
    import whoosh.qparser

    if not search_kwargs:
        search_kwargs = dict()

    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.open_dir(dirname,
                                      indexname=indexname,
                                      readonly=True)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r' %
                            index_or_dirname)

    try:

        # figure out header
        hdr = tuple()
        if docnum_field is not None:
            hdr += (docnum_field, )
        if score_field is not None:
            hdr += (score_field, )
        stored_names = tuple(index.schema.stored_names())
        hdr += stored_names
        yield hdr

        # parse the query
        if isinstance(query, string_types):
            # search all fields by default
            parser = whoosh.qparser.MultifieldParser(index.schema.names(),
                                                     index.schema,
                                                     fieldboosts=fieldboosts)
            query = parser.parse(query)
        elif isinstance(query, whoosh.query.Query):
            pass
        else:
            raise ArgumentError(
                'expected string or whoosh.query.Query, found %r' % query)

        # make a function to turn docs into tuples
        astuple = operator.itemgetter(*index.schema.stored_names())

        with index.searcher() as searcher:
            if limit is not None:
                results = searcher.search(query, limit=limit, **search_kwargs)
            else:
                results = searcher.search_page(query,
                                               pagenum,
                                               pagelen=pagelen,
                                               **search_kwargs)

            if docnum_field is None and score_field is None:

                for doc in results:
                    yield astuple(doc)

            else:

                for (docnum, score), doc in zip(results.items(), results):
                    row = tuple()
                    if docnum_field is not None:
                        row += (docnum, )
                    if score_field is not None:
                        row += (score, )
                    row += astuple(doc)
                    yield row


    finally:
        if needs_closing:
            # close the index if we're the ones who opened it
            index.close()


# TODO guess schema
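
A usage sketch for the iterator above (hypothetical index directory and query string):

rows = itersearchindex('indexdir', u'kind:text', limit=10,
                       pagenum=None, pagelen=None, indexname=None,
                       docnum_field=None, score_field=None,
                       fieldboosts=None, search_kwargs=None)
for row in rows:
    print(row)  # the first row is the header tuple, then one tuple per hit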
Esempio n. 53
0
    def search(self,
               q,
               index='default',
               fields=None,
               Models=(),
               object_types=(),
               prefix=True,
               facet_by_type=None,
               **search_args):
        """Interface to search indexes.

        :param q: unparsed search string.
        :param index: name of index to use for search.
        :param fields: optional mapping of field names to boost factors.
        :param Models: list of Model classes to limit search on.
        :param object_types: same as `Models`, but directly the model string.
        :param prefix: enable or disable search by prefix
        :param facet_by_type: if set, returns a dict of object_type: results with a
             max of `limit` matches for each type.
        :param search_args: any valid parameter for
            :meth:`whoosh.searching.Search.search`. This includes `limit`,
            `groupedby` and `sortedby`
        """
        index = self.app_state.indexes[index]
        if not fields:
            fields = self.default_search_fields

        valid_fields = {
            f
            for f in index.schema.names(check_names=fields)
            if prefix or not f.endswith('_prefix')
        }

        for invalid in set(fields) - valid_fields:
            del fields[invalid]

        parser = DisMaxParser(fields, index.schema)
        query = parser.parse(q)

        filters = search_args.setdefault('filter', None)
        filters = [filters] if filters is not None else []
        del search_args['filter']

        if not hasattr(g, 'is_manager') or not g.is_manager:
            # security access filter
            user = current_user
            roles = {indexable_role(user)}
            if not user.is_anonymous:
                roles.add(indexable_role(Anonymous))
                roles.add(indexable_role(Authenticated))
                roles |= {indexable_role(r) for r in security.get_roles(user)}

            filter_q = wq.Or(
                [wq.Term('allowed_roles_and_users', role) for role in roles], )
            filters.append(filter_q)

        object_types = set(object_types)
        for m in Models:
            object_type = m.entity_type
            if not object_type:
                continue
            object_types.add(object_type)

        if object_types:
            object_types &= self.app_state.indexed_fqcn
        else:
            # ensure we don't show content types previously indexed but not yet
            # cleaned from index
            object_types = self.app_state.indexed_fqcn

        # limit object_type
        filter_q = wq.Or([wq.Term('object_type', t) for t in object_types])
        filters.append(filter_q)

        for func in self.app_state.search_filter_funcs:
            filter_q = func()
            if filter_q is not None:
                filters.append(filter_q)

        if filters:
            filter_q = wq.And(filters) if len(filters) > 1 else filters[0]
            # search_args['filter'] = filter_q
            query = filter_q & query

        if facet_by_type:
            if not object_types:
                object_types = [t[0] for t in self.searchable_object_types()]

            # limit number of documents to score, per object type
            collapse_limit = 5
            search_args['groupedby'] = 'object_type'
            search_args['collapse'] = 'object_type'
            search_args['collapse_limit'] = collapse_limit
            search_args['limit'] = (search_args['collapse_limit'] * max(
                len(object_types),
                1,
            ))

        with index.searcher(closereader=False) as searcher:
            # 'closereader' is needed, else results cannot be used outside the
            # 'with' statement
            results = searcher.search(query, **search_args)

            if facet_by_type:
                positions = {
                    doc_id: pos
                    for pos, doc_id in enumerate(i[1] for i in results.top_n)
                }
                sr = results
                results = {}
                for typename, doc_ids in sr.groups('object_type').items():
                    results[typename] = [
                        sr[positions[oid]] for oid in doc_ids[:collapse_limit]
                    ]

            return results
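
A hedged usage sketch for the method above; the service handle `svc` and the `Document` model are assumptions for illustration, not names from the original:

# Plain search, restricted to one model class.
hits = svc.search(u'annual report', Models=(Document,), limit=20)

# Faceted search: returns {object_type: [hits]}, at most 5 hits per type.
by_type = svc.search(u'annual report', facet_by_type=True)
for object_type, results in by_type.items():
    print(object_type, len(results))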
Esempio n. 54
0
    def search(self, trans, search_term, page, page_size, boosts):
        """
        Perform the search on the given search_term

        :param search_term: unicode encoded string with the search term(s)
        :param boosts: namedtuple containing custom boosts for searchfields, see api/repositories.py
        :param page_size: integer defining a length of one page
        :param page: integer with the number of page requested

        :returns results: dictionary containing hits themselves and the hits summary
        """
        log.debug('raw search query: #' + str(search_term))
        lower_search_term = search_term.lower()
        allow_query, search_term_without_filters = self._parse_reserved_filters(
            lower_search_term)
        log.debug('term without filters: #' + str(search_term_without_filters))

        whoosh_index_dir = trans.app.config.whoosh_index_dir
        index_exists = whoosh.index.exists_in(whoosh_index_dir)
        if index_exists:
            index = whoosh.index.open_dir(whoosh_index_dir)
            try:
                # Some literature about BM25F:
                # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
                # http://en.wikipedia.org/wiki/Okapi_BM25
                # Basically, the higher the number, the bigger the weight.
                repo_weighting = RepoWeighting(
                    field_B={
                        'name_B': boosts.repo_name_boost,
                        'description_B': boosts.repo_description_boost,
                        'long_description_B':
                        boosts.repo_long_description_boost,
                        'homepage_url_B': boosts.repo_homepage_url_boost,
                        'remote_repository_url_B':
                        boosts.repo_remote_repository_url_boost,
                        'repo_owner_username_B':
                        boosts.repo_owner_username_boost,
                        'categories_B': boosts.categories_boost
                    })
                searcher = index.searcher(weighting=repo_weighting)
                parser = MultifieldParser([
                    'name', 'description', 'long_description', 'homepage_url',
                    'remote_repository_url', 'repo_owner_username',
                    'categories'
                ],
                                          schema=schema)

                # If user query has just filters prevent wildcard search.
                if len(search_term_without_filters) < 1:
                    user_query = Every('name')
                    sortedby = 'name'
                else:
                    user_query = parser.parse('*' +
                                              search_term_without_filters +
                                              '*')
                    sortedby = ''
                try:
                    hits = searcher.search_page(user_query,
                                                page,
                                                pagelen=page_size,
                                                filter=allow_query,
                                                terms=True,
                                                sortedby=sortedby)
                    log.debug('total hits: ' + str(len(hits)))
                    log.debug('scored hits: ' + str(hits.scored_length()))
                except ValueError:
                    raise ObjectNotFound('The requested page does not exist.')
                results = {}
                results['total_results'] = str(len(hits))
                results['page'] = str(page)
                results['page_size'] = str(page_size)
                results['hits'] = []
                for hit in hits:
                    log.debug('matched terms: ' + str(hit.matched_terms()))
                    hit_dict = {}
                    hit_dict['id'] = trans.security.encode_id(hit.get('id'))
                    hit_dict['repo_owner_username'] = hit.get(
                        'repo_owner_username')
                    hit_dict['name'] = hit.get('name')
                    hit_dict['long_description'] = hit.get('long_description')
                    hit_dict['remote_repository_url'] = hit.get(
                        'remote_repository_url')
                    hit_dict['homepage_url'] = hit.get('homepage_url')
                    hit_dict['description'] = hit.get('description')
                    hit_dict['last_updated'] = hit.get('last_updated')
                    hit_dict['full_last_updated'] = hit.get(
                        'full_last_updated')
                    hit_dict['repo_lineage'] = hit.get('repo_lineage')
                    hit_dict['categories'] = hit.get('categories')
                    hit_dict['approved'] = hit.get('approved')
                    hit_dict['times_downloaded'] = hit.get('times_downloaded')
                    results['hits'].append({
                        'repository': hit_dict,
                        'score': hit.score
                    })
                return results
            finally:
                searcher.close()
        else:
            raise exceptions.InternalServerError(
                'The search index file is missing.')
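
The `boosts` argument above must expose the attributes read when building `RepoWeighting`; a minimal sketch of such a container (the weight values are illustrative, not from the original):

from collections import namedtuple

# Attribute names match those accessed in the search method above.
Boosts = namedtuple('Boosts', [
    'repo_name_boost', 'repo_description_boost',
    'repo_long_description_boost', 'repo_homepage_url_boost',
    'repo_remote_repository_url_boost', 'repo_owner_username_boost',
    'categories_boost',
])
boosts = Boosts(0.9, 0.6, 0.5, 0.3, 0.2, 0.3, 0.5)  # sample weights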
Esempio n. 55
0
def get_snippet_by_id(snippet_id):
    # Use the searcher as a context manager so it is closed after the lookup.
    with index.searcher() as searcher:
        return searcher.document(id=snippet_id)
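
For reference, `Searcher.document(**kw)` returns the stored fields of the first matching document as a dict, or `None` when no document matches, so callers can guard against missing ids; a small sketch (the id value and the stored 'title' field are assumptions):

snippet = get_snippet_by_id('abc123')
if snippet is None:
    print('snippet not found')
else:
    print(snippet['title'])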
Esempio n. 57
0
# qry_flaclist_index.py
#
# Query the index built by mk_flaclist_index.py
# 2014-03-14 Dan Ellis [email protected]

# Querying
import whoosh, whoosh.index, whoosh.qparser, whoosh.query
indexdir = 'WCDindexdir'
index = whoosh.index.open_dir(indexdir)
search = index.searcher()

arparser = whoosh.qparser.QueryParser('artist', index.schema)
alparser = whoosh.qparser.QueryParser('album', index.schema)
tiparser = whoosh.qparser.QueryParser('title', index.schema)

# One example query
#artist = u'Darrell Scott'
#album = u'Transatlantic Sessions - Series 3: Volume One'
#title = u'Shattered Cross'
#
#qry = whoosh.query.And([arparser.parse(artist), alparser.parse(album), tiparser.parse(title)])
#results = search.search(qry)
#
#if len(results) == 0:
#    # drop the album
#    qry = whoosh.query.And([arparser.parse(artist), tiparser.parse(title)])
#    results = search.search(qry)
#
#import pprint
#for r in results:
#    pprint.pprint(r)
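
The commented block above shows a fallback pattern: query on artist, album and title first, then retry without the album when nothing matches. A compact sketch of the same idea, reusing the parsers and searcher defined above:

def lookup_track(artist, album, title):
    # Full three-field query first.
    qry = whoosh.query.And([arparser.parse(artist),
                            alparser.parse(album),
                            tiparser.parse(title)])
    results = search.search(qry)
    if len(results) == 0:
        # Fall back to artist + title only.
        qry = whoosh.query.And([arparser.parse(artist),
                                tiparser.parse(title)])
        results = search.search(qry)
    return [dict(r) for r in results]

print(lookup_track(u'Darrell Scott',
                   u'Transatlantic Sessions - Series 3: Volume One',
                   u'Shattered Cross'))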