def find_objects(collection, keywords, docs=False):
    """Search *collection* for *keywords*.

    Returns the match objects themselves when *docs* is true, otherwise
    each match's stored document data.
    """
    colpath = _get_path_to_collection(collection)
    if not _dir_exists(colpath):
        _create_path(colpath)
    # Open the collection read-only and prepare the search machinery.
    db = _open_collection(colpath, READ)
    enq = xapian.Enquire(db)
    # Query parser with Dutch stemming, bound to this database.
    parser = xapian.QueryParser()
    parser.set_database(db)
    parser.set_stemmer(xapian.Stem("dutch"))
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = parser.parse_query(keywords)
    log.debug("Searching collection %r with query: %s", collection, str(query))
    enq.set_query(query)
    matches = enq.get_mset(0, 10000)
    log.debug("Found approx %d results", matches.get_matches_estimated())
    # Hand back either the raw match objects or just their stored data.
    if docs:
        return list(matches)
    return [m.document.get_data() for m in matches]
def index(datapath, dbpath):
    """Index CSV records from *datapath* into a Xapian database at *dbpath*.

    BUG FIX: TITLE was fetched (and printed) but never indexed, even though
    the companion search code parses a "TITLE" prefix ('T').  It is now
    indexed both with the 'T' prefix and unprefixed for general search.
    The leftover debug prints of title/body were removed.
    """
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
    termgenerator = xapian.TermGenerator()
    termgenerator.set_stemmer(xapian.Stem("en"))
    for fields in parse_csv_file(datapath):
        title = fields.get('TITLE', u'')
        body = fields.get('BODY', u'')
        textClass = fields.get('CLASS', u'')
        identifier = fields.get('ID', u'')
        doc = xapian.Document()
        termgenerator.set_document(doc)
        # Prefixed terms for field-scoped queries.
        termgenerator.index_text(title, 1, 'T')
        termgenerator.index_text(textClass, 1, 'C')
        termgenerator.index_text(body, 1, 'B')
        termgenerator.index_text(identifier, 1, 'I')
        # Unprefixed terms for general search; the termpos bump stops
        # phrases matching across field boundaries.
        termgenerator.index_text(title)
        termgenerator.increase_termpos()
        termgenerator.index_text(textClass)
        termgenerator.increase_termpos()
        termgenerator.index_text(body)
        termgenerator.increase_termpos()
        termgenerator.index_text(identifier)
        # Store the whole record for display at search time.
        doc.set_data(json.dumps(fields, ensure_ascii=False, encoding="utf-8"))
        # A unique ID term makes re-indexing idempotent.
        idterm = u"Q" + identifier
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
def searchDoc(queryString):
    """Search the module-level `database` for *queryString*.

    Returns the stored data of at most 10 matching documents.

    BUG FIX: the old code fetched *every* document in the database
    (limit = doccount) and broke out of the loop only after appending an
    11th result (the `i > 9` check ran after the append).  Ask Xapian
    for at most 10 matches up front instead.
    """
    # Parse the query string with an English stemmer.
    queryParser = xapian.QueryParser()
    queryParser.set_stemmer(xapian.Stem('english'))
    queryParser.set_database(database)
    queryParser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = queryParser.parse_query(queryString)
    # Run the query and collect the top matches.
    enquire = xapian.Enquire(database)
    enquire.set_query(query)
    matches = enquire.get_mset(0, 10)
    return [match.document.get_data() for match in matches]
def __init__(self, path):
    """Set up the term generator and query parser for the index at *path*."""
    self.path = path
    # Indexing side: English stemmer with the project-wide strategy.
    self.term_generator = xapian.TermGenerator()
    self.term_generator.set_stemmer(xapian.Stem("en"))
    self.term_generator.set_stemming_strategy(STEMMING)
    # Query side: mirror the indexing configuration.
    self.query_parser = xapian.QueryParser()
    self.query_parser.set_stemmer(xapian.Stem("en"))
    self.query_parser.set_stemming_strategy(STEMMING)
    # Free-text prefixes (stemmed and unstemmed alike use add_prefix).
    for field, prefix in list(terms.STEMMED) + list(terms.PREFIXED_UNSTEMMED):
        self.query_parser.add_prefix(field, prefix)
    # Boolean (filter) prefixes.
    for field, prefix in terms.PREFIXED_UNSTEMMED_BOOLEAN:
        self.query_parser.add_boolean_prefix(field, prefix)
def create_index(self):
    """ Create a new index, and set up its field structure """
    log.warning("start create_index")
    # Open (or create) the on-disk database and attach an English stemmer.
    self.db = xapian.WritableDatabase(self.dbpath, xapian.DB_CREATE_OR_OPEN)
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))
    self.indexer = indexer
    log.warning("end create_index")
def _parseq(x_db, query, prefix=''):
    '''parse and return a QueryParser query'''
    # English-stemming parser bound to the given database.
    parser = _x.QueryParser()
    parser.set_database(x_db)
    parser.set_stemmer(_x.Stem("english"))
    parser.set_stemming_strategy(_x.QueryParser.STEM_SOME)
    # flags=0, with the caller-supplied default term prefix.
    return parser.parse_query(query, 0, prefix)
def __init__(self, language_code='en', stemmer=None):
    """Create a new highlighter for the specified language."""
    # An explicitly supplied stemmer wins; otherwise build one for the language.
    self.stem = stemmer if stemmer is not None else xapian.Stem(language_code)
def _parse_query(self, text):
    """Parse *text* into a xapian.Query, storing the parser and query on self."""
    database = self._index.index
    # English-stemming parser bound to the index's database.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("english"))
    parser.set_database(database)
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    self.parser = parser
    self.query = parser.parse_query(text, DEFAULT_SEARCH_FLAGS)
def index(datapath, dbpath):
    """Index US-state records from *datapath* into a Xapian db at *dbpath*.

    ROBUSTNESS FIX: every other field is read with .get(), but midlat and
    midlon used direct indexing and raised KeyError for records without
    coordinates; they now default to None and are simply skipped.
    """
    # Create or open the database we're going to be writing to.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
    # Set up a TermGenerator that we'll use in indexing.
    termgenerator = xapian.TermGenerator()
    termgenerator.set_stemmer(xapian.Stem("en"))
    for fields in parse_states(datapath):
        # 'fields' is a dictionary mapping from field name to value.
        name = fields.get('name', u'')
        description = fields.get('description', u'')
        motto = fields.get('motto', u'')
        admitted = fields.get('admitted', None)
        population = fields.get('population', None)
        order = fields.get('order', u'')
        doc = xapian.Document()
        termgenerator.set_document(doc)
        # Prefixed terms for field-scoped search.
        termgenerator.index_text(name, 1, 'S')
        termgenerator.index_text(description, 1, 'XD')
        termgenerator.index_text(motto, 1, 'XM')
        # Unprefixed terms for general search; increase_termpos() stops
        # phrases matching across field boundaries.
        termgenerator.index_text(name)
        termgenerator.increase_termpos()
        termgenerator.index_text(description)
        termgenerator.increase_termpos()
        termgenerator.index_text(motto)
        # Sortable document values.
        if admitted is not None:
            doc.add_value(1, xapian.sortable_serialise(int(admitted[:4])))
            doc.add_value(2, admitted)  # YYYYMMDD
        if population is not None:
            doc.add_value(3, xapian.sortable_serialise(int(population)))
        # Coordinates are optional; only add the value when both exist.
        midlat = fields.get('midlat')
        midlon = fields.get('midlon')
        if midlat and midlon:
            doc.add_value(4, "%f,%f" % (float(midlat), float(midlon)))
        # Store all the fields for display purposes.
        doc.set_data(json.dumps(fields))
        # The unique ID term keeps re-runs of the indexer idempotent.
        idterm = u"Q" + order
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
def query_parser(x_db):
    """Build a QueryParser bound to *x_db* with the standard field prefixes."""
    parser = _x.QueryParser()
    parser.set_stemmer(_x.Stem("english"))
    parser.set_database(x_db)
    parser.set_stemming_strategy(_x.QueryParser.STEM_SOME)
    # Register every configured field -> term-prefix mapping.
    for field, prefix in TERM_MAP.iteritems():
        parser.add_prefix(field, prefix)
    return parser
def hello_world(request): # Open the database for searching. database = xapian.Database( "/home/pb/Documents/python/django_test/mysite/firsttest/index") # Start an enquire session. enquire = xapian.Enquire(database) # Combine the rest of the command line arguments with spaces between # them, so that simple queries don't have to be quoted at the shell # level. #second parameter is the search key word #str_key = request.session['key_word'] str_key = request.session.get('key_word') query_string = str.join(' ', str(str_key)) # Parse the query string to produce a Xapian::Query object. qp = xapian.QueryParser() stemmer = xapian.Stem("english") qp.set_stemmer(stemmer) qp.set_database(database) qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) query = qp.parse_query(query_string) print "Parsed query is: %s" % str(query) # Find the top 10 results for the query. enquire.set_query(query) matches = enquire.get_mset(0, 10) # Display the results. print "%i results found." % matches.get_matches_estimated() result_found = matches.get_matches_estimated() print "Results 1-%i:" % matches.size() #remove all the data in the database first, and then displace them search_result1 = search_result.objects.all() search_result1.delete() i = 0 for m in matches: i = i + 1 #generate the filename temp_str = str(m.document.get_data()).split(" sample=") temp = temp_str[0] file_name = temp.strip("url=/") print "result file name ::: " + file_name[0:4].strip("\ns") file_name1 = file_name[0:4].strip("\ns") #create object search_result.objects.create(result_id=i, total_result_number=str(result_found), match_rate=str(m.percent), doc_id=str(file_name1), content=str(m.document.get_data())) #the file name is store in url return render(request, 'hello_world.html', {'result': (search_result.objects.all())})
def make_tg():
    """Build a TermGenerator with an English stemmer and a small stop list.

    BUG FIX: the old code did `del stopper` before returning.  Xapian's
    Python bindings have historically not kept the stopper alive from the
    C++ side, so dropping the last Python reference can leave the
    generator pointing at a freed object.  Keep a reference on the object
    we return so the stopper outlives the generator.
    """
    termgen = xapian.TermGenerator()
    termgen.set_stemmer(xapian.Stem('en'))
    stopper = xapian.SimpleStopper()
    stopper.add('to')
    stopper.add('not')
    termgen.set_stopper(stopper)
    # Pin the stopper's lifetime to the generator's.
    termgen._stopper = stopper
    return termgen
def Do_Index(self, callback=None):
    """loop through all feeds and entries and feed them to the beast"""
    # Helper used to bail out mid-index: clears the indexing flag,
    # releases the lock, fires the optional callback and interrupts.
    def index_interrupt():
        self._indexing = False
        self._index_lock.release()
        if callback is not None:
            callback()
        self._interrupt()
        return
    # Non-blocking acquire: if another index run holds the lock, skip.
    if not self._index_lock.acquire(False):
        logging.info("already indexing, not trying to reindex again")
        return
    self._indexing = True
    db = self._get_db()
    c = db.cursor()
    #remove existing DB
    utils.deltree(self._storeDir)
    database = xapian.WritableDatabase(self._storeDir, xapian.DB_CREATE_OR_OPEN)
    indexer = xapian.TermGenerator()
    stemmer = xapian.Stem("english")
    indexer.set_stemmer(stemmer)
    # Snapshot feeds and entries up front so the SQLite handles can be
    # closed before the (potentially long) indexing pass.
    c.execute(u"""SELECT id, title, description FROM feeds""")
    feeds = c.fetchall()
    c.execute(u"""SELECT id, feed_id, title, description,fakedate FROM entries ORDER BY fakedate""")
    entries = c.fetchall()
    c.close()
    db.close()
    logging.info("indexing feeds")
    # Lazily yields one xapian.Document per feed; `indexer` and the
    # shared value-slot constants come from the enclosing scope.
    # NOTE(review): `entries` is fetched above but not consumed in this
    # visible span -- presumably indexed further down; confirm.
    def feed_index_generator(feeds):
        for feed_id, title, description in feeds:
            try:
                doc = xapian.Document()
                forindex = title+" "+description
                #eh? we can only remove docs by term, but we can only
                #get values. so we need both it seems
                doc.add_term("f"+str(feed_id))
                doc.add_value(FEED_ID, str(feed_id))
                doc.add_value(DATE, "")
                doc.set_data(forindex)
                indexer.set_document(doc)
                indexer.index_text(forindex)
                #database.add_document(doc)
                yield doc
            except Exception, e:
                logging.error("Failed in indexDocs, feeds: %s" % str(e))
def Search(self, command, blacklist=[], include=['feeds','entries'], since=0):
    """returns two lists, one of search results in feeds, and one for results in entries.  It is sorted so that title results are first, description results are second"""
    # NOTE(review): `blacklist` and `include` are mutable default
    # arguments (shared across calls) -- safe only if never mutated here;
    # `include` is not referenced in this visible span.
    if not self._index_lock.acquire(False):
        #if we are indexing, don't try to search
        #print "wouldn't get lock"
        return ([],[])
    self._index_lock.release()
    database = xapian.Database(self._storeDir)
    enquire = xapian.Enquire(database)
    qp = xapian.QueryParser()
    stemmer = xapian.Stem("english")
    qp.set_stemmer(stemmer)
    qp.set_database(database)
    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    # Newest docids first, boolean (unweighted) matching.
    enquire.set_docid_order(xapian.Enquire.DESCENDING)
    enquire.set_weighting_scheme(xapian.BoolWeight())
    # Display the results.
    #print "%i results found." % matches.get_matches_estimated()
    #print "Results 1-%i:" % matches.size()
    #for m in matches:
    #    print "%i: %i%% docid=%i [%s] %s %s %s" % (m.rank + 1, m.percent, m.docid, m.document.get_data()[0:100], m.document.get_value(0), m.document.get_value(1), m.document.get_value(2))
    feed_results=[]
    entry_results=[]
    query = qp.parse_query(command)
    enquire.set_query(query)
    matches = enquire.get_mset(0, 100)
    for m in matches:
        doc = m.document
        feed_id = doc.get_value(FEED_ID)
        feed_id = int(feed_id)
        try:
            if feed_id not in blacklist:
                entry_id = doc.get_value(ENTRY_ID)
                # NOTE(review): `is ''` compares identity, not equality --
                # works only by CPython string interning; review.
                if entry_id is '':
                    # meaning this is actually a feed (we could know that from above, but eh)
                    feed_results.append(int(feed_id))
                else:
                    # meaning "entry"
                    title = doc.get_value(ENTRY_TITLE)
                    # DATE value is stored in milliseconds; convert to seconds.
                    fakedate = float(doc.get_value(DATE)) / 1000.0
                    if fakedate > since:
                        entry_results.append((int(entry_id),title, fakedate, feed_id))
                #else:
                #    print "excluding:"+doc.get("title")
        except Exception, e:
            print e
            print feed_id
            print blacklist
def search(dbpath, querystring, offset=0, pagesize=10):
    """Search *dbpath* for *querystring*, sorting by distance from DC.

    offset   -- starting point within result set
    pagesize -- number of records to retrieve

    ROBUSTNESS FIX: value slot 4 is only populated for documents indexed
    with coordinates; the KeyMaker previously raised ValueError unpacking
    the empty value.  Such documents now sort last instead of crashing.
    """
    # Open the database we're going to search.
    db = xapian.Database(dbpath)
    # Set up a QueryParser with a stemmer and suitable prefixes
    queryparser = xapian.QueryParser()
    queryparser.set_stemmer(xapian.Stem("en"))
    queryparser.set_stemming_strategy(queryparser.STEM_SOME)
    queryparser.add_prefix("title", "S")
    queryparser.add_prefix("description", "XD")
    # And parse the query
    query = queryparser.parse_query(querystring)
    # Use an Enquire object on the database to run the query
    enquire = xapian.Enquire(db)
    enquire.set_query(query)

    class DistanceKeyMaker(xapian.KeyMaker):
        def __call__(self, doc):
            # Return a sortable string representing the distance from
            # Washington, DC to the middle of this state.
            value = doc.get_value(4).decode('utf8')
            if not value:
                # No coordinates indexed: sort after every real distance.
                return xapian.sortable_serialise(float('inf'))
            x, y = map(float, value.split(','))
            washington = (38.012, -77.037)
            return xapian.sortable_serialise(
                support.distance_between_coords((x, y), washington))

    enquire.set_sort_by_key_then_relevance(DistanceKeyMaker(), False)
    # And print out something about each match
    matches = []
    for match in enquire.get_mset(offset, pagesize):
        fields = json.loads(match.document.get_data().decode('utf8'))
        print(
            u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s" % {
                'rank': match.rank + 1,
                'docid': match.docid,
                'name': fields.get('name', u''),
                'date': support.format_date(fields.get('admitted', u'')),
                'pop': support.format_numeral(int(fields.get('population', 0))),
                'lat': fields.get('latitude', u''),
                'lon': fields.get('longitude', u''),
            })
        matches.append(match.docid)
    # Finally, make sure we log the query and displayed results
    support.log_matches(querystring, offset, pagesize, matches)
def phrase(terms, prefix=None, language=None, window=3):
    """Build an OP_PHRASE Query over *terms*.

    terms    -- a string (split on whitespace) or a sequence of terms
    prefix   -- optional field prefix applied as "prefix:term"
    language -- optional stemming language; stemmed terms get a 'Z' prefix
    window   -- phrase window size

    FIX: removed a leftover debug `print terms` that wrote to stdout on
    every call.
    """
    if isinstance(terms, basestring):
        terms = terms.split()
    if prefix:
        terms = ['%s:%s' % (prefix, t) for t in terms]
    if language:
        stem = xapian.Stem(language)
        # NOTE(review): when a prefix was applied above, the whole
        # "prefix:term" string is passed through the stemmer -- confirm
        # that is the intended term form.
        terms = ['Z%s' % stem(t) for t in terms]
    return Query(Query.OP_PHRASE, terms, window)
def __init__(self, database_dir):
    """Initialize the searcher."""
    index_dir = os.path.join(database_dir, IndexProcessor.INDEX_DIR)
    self._db = xapian.Database(index_dir)
    # One stemmer per configured language.
    self._stemmers = [xapian.Stem(lang) for lang in IndexProcessor.STEM_LANGS]
def __init__(self, dbpath, *, cjk=False):
    """Initialize indexer with dbpath.

    cjk -- when true, enable CJK n-gram indexing on the term generator.
    """
    self._db = None
    self.dbpath = dbpath
    generator = xapian.TermGenerator()
    generator.set_stemmer(xapian.Stem("en"))
    self.term_generator = generator
    if cjk:
        generator.set_flags(generator.FLAG_CJK_NGRAM)
        logger.info("FLAG_CJK_NGRAM enabled")
    self.open()
def __init__(self, db_path):
    """Open (or create) the writable database at *db_path* and prepare indexing.

    FIX: removed a bare `try: ... except: raise` wrapper -- it re-raised
    every exception unchanged, so it had no effect beyond obscuring the
    code (and bare `except:` is an anti-pattern).
    """
    # Open the database for update, creating a new database if necessary.
    self.database = xapian.WritableDatabase(db_path, xapian.DB_CREATE_OR_OPEN)
    self.indexer = xapian.TermGenerator()
    self.stemmer = xapian.Stem("english")  # XXX
    self.indexer.set_stemmer(self.stemmer)
def search(dbpath, querystring, materials, offset=0, pagesize=10):
    """Run *querystring* against *dbpath*, optionally filtered by *materials*.

    offset   -- starting point within the result set
    pagesize -- number of records to retrieve
    """
    db = xapian.Database(dbpath)
    # Stemming query parser with the field prefixes this index uses.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")
    query = parser.parse_query(querystring)
    if len(materials) > 0:
        # Keep only documents tagged with at least one requested material:
        # OR together one boolean term per material, then apply the result
        # as a filter so it does not affect ranking weights.
        per_material = [
            xapian.Query('XM' + material.lower())
            for material in materials
        ]
        any_material = xapian.Query(xapian.Query.OP_OR, per_material)
        query = xapian.Query(xapian.Query.OP_FILTER, query, any_material)
    # Run the query and report each match.
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    matches = []
    for match in enquire.get_mset(offset, pagesize):
        fields = json.loads(match.document.get_data())
        print(
            u"%(rank)i: #%(docid)3.3i %(title)s" % {
                'rank': match.rank + 1,
                'docid': match.docid,
                'title': fields.get('TITLE', u''),
            })
        matches.append(match.docid)
    # Log the query and the docids we displayed.
    support.log_matches(querystring, offset, pagesize, matches)
def search(self, query_string, offset=0, numresults=10):
    """Parse *query_string* and return an MSet of up to *numresults* matches."""
    database = self.get_database()
    # English-stemming parser bound to the database.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("english"))
    parser.set_database(database)
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    enquire = xapian.Enquire(database)
    enquire.set_query(parser.parse_query(query_string))
    return enquire.get_mset(offset, numresults)
def query_fact_normally(fact, k_matches):
    """Query the module-level `db` for *fact*.

    Returns the stored data (UTF-8 decoded) of the top *k_matches* documents.
    """
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.add_prefix("text", "")
    parser.add_prefix("title", "S")
    query = parser.parse_query(my_own_queryparser(fact))
    enquire = xapian.Enquire(db)
    # enquire.set_weighting_scheme(xapian.TfIdfWeight())
    enquire.set_query(query)
    results = []
    for match in enquire.get_mset(0, k_matches):
        results.append(match.document.get_data().decode("utf8"))
    return results
def index(datapath, dbpath):
    """Index museum-catalogue CSV rows from *datapath* into *dbpath*."""
    # Create or open the database we're going to be writing to.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
    generator = xapian.TermGenerator()
    generator.set_stemmer(xapian.Stem("en"))
    for fields in parse_csv_file(datapath):
        # 'fields' maps field name to value; pull out what we index.
        description = fields.get('DESCRIPTION', u'')
        title = fields.get('TITLE', u'')
        identifier = fields.get('id_NUMBER', u'')
        doc = xapian.Document()
        generator.set_document(doc)
        # Field-prefixed terms for scoped queries.
        generator.index_text(title, 1, 'S')
        generator.index_text(description, 1, 'XD')
        # Unprefixed terms for general search; the termpos bump keeps
        # phrases from spanning the title/description boundary.
        generator.index_text(title)
        generator.increase_termpos()
        generator.index_text(description)
        # Keep the full record around for display at search time.
        doc.set_data(json.dumps(fields, encoding='utf8'))
        # Numeric document values used for sorting/ranges.
        measurements = fields.get('MEASUREMENTS', u'')
        if measurements != u'':
            numbers = numbers_from_string(measurements)
            if len(numbers) > 0:
                doc.add_value(0, xapian.sortable_serialise(max(numbers)))
        date_made = fields.get('DATE_MADE', u'')
        years = numbers_from_string(date_made)
        if len(years) > 0:
            doc.add_value(1, xapian.sortable_serialise(years[0]))
        # The identifier-based boolean term makes re-indexing idempotent.
        idterm = u"Q" + identifier
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
def reload_database(self):  # {{{
    """Recreate the database handle, query parser, term generator and enquires."""
    self.db = xapian.WritableDatabase(self.database, xapian.DB_CREATE_OR_OPEN)
    # Query side.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem(self.language))
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.add_prefix("title", "S")
    self.qp = parser
    # Indexing side.
    generator = xapian.TermGenerator()
    generator.set_stemmer(xapian.Stem(self.language))
    self.tg = generator
    # Older xapian bindings have no TermGenerator stemming strategy.
    try:
        generator.set_stemming_strategy(generator.STEM_SOME)
    except AttributeError:
        pass
    self.e = xapian.Enquire(self.db)
    self.sorted_e = xapian.Enquire(self.db)
    # Value 2 is the lowercase form of the title
    self.sorted_e.set_sort_by_value(2, False)
def __init__(self):
    # Set up a French-stemming term generator plus an extractor; any
    # failure is reported to stderr and aborts the process.
    try:
        self.extract = utilsXapian.Extract()
        self.indexer = xapian.TermGenerator()
        stemmer = xapian.Stem("french")
        self.indexer.set_stemmer(stemmer)
        # Path of the file currently being indexed (set later).
        self.fichier = ""
        # NOTE(review): WritableDatabase() is called with no path --
        # confirm the bindings in use accept a no-argument constructor
        # (a real path is presumably supplied elsewhere).
        self.database = xapian.WritableDatabase()
    except Exception, e:
        print >> sys.stderr, "Exception: %s" % str(e)
        sys.exit(1)
def __init__(self, tfidf_path, strict=True):
    """Open the tf-idf database at *tfidf_path* and prepare a query parser."""
    # NOTE(review): `strict` is accepted but unused in this body --
    # confirm whether it was meant to affect parsing.
    self.db = xapian.Database(tfidf_path)
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    # Field-prefix configuration.
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")
    self.queryparser = parser
    self.enquire = xapian.Enquire(self.db)
def xapian_init_databases():
    """ Initializes all database objects. """
    xapian_ensure_db_dir(XAPIAN_DIR_NAME)
    for field in INDEXES:
        # One on-disk database (and term generator) per indexed field.
        xapian_ensure_db_dir(XAPIAN_DIR_NAME + "/" + field)
        # NOTE(review): the directory is ensured under XAPIAN_DIR_NAME but
        # the database is opened under XAPIAN_DIR -- confirm these two
        # constants resolve to the same location.
        database = xapian.WritableDatabase(XAPIAN_DIR + "/" + field,
                                           xapian.DB_CREATE_OR_OPEN)
        indexer = xapian.TermGenerator()
        stemmer = xapian.Stem("english")
        indexer.set_stemmer(stemmer)
        # Cache the handle pair in the module-level registry.
        DATABASES[field] = (database, indexer)
def search(dbpath, title, offset=0, pagesize=10): print '{}'.format(title) regex = re.findall(r'\w+', title.lower()) print 'REGEX: {}'.format(regex) queryAND = ' AND '.join(regex) queryOR = ' OR '.join(regex) for querystring in [queryAND.encode('utf-8'), queryOR.encode('utf-8')]: print 'QUERY: {}'.format(querystring) # offset - defines starting point within result set # pagesize - defines number of records to retrieve # Open the database we're going to search. db = xapian.Database(dbpath) # Set up a QueryParser with a stemmer and suitable prefixes queryparser = xapian.QueryParser() queryparser.set_stemmer(xapian.Stem("en")) queryparser.set_stemming_strategy(queryparser.STEM_SOME) # Start of prefix configuration. queryparser.add_prefix("TITLE", "T") queryparser.add_prefix("BODY", "B") queryparser.add_prefix("ID", "I") queryparser.add_prefix("CLASS", "C") # End of prefix configuration. # And parse the query query = queryparser.parse_query(querystring) # Use an Enquire object on the database to run the query enquire = xapian.Enquire(db) enquire.set_query(query) # And print out something about each match matches = [] for match in enquire.get_mset(offset, pagesize): fields = json.loads(match.document.get_data()) # if title == fields.get('TITLE', u''): # print 'This is \n' print( u"%(rank)i: #%(docid)3.3i \n %(id)s \n %(title)s \n %(body)s \n %(class)s \n" % { 'rank': match.rank + 1, 'docid': match.docid, 'id': fields.get('ID', u''), 'title': fields.get('TITLE', u''), 'body': fields.get('BODY', u''), 'class': fields.get('CLASS', u'') }) matches.append(match.docid) # Finally, make sure we log the query and displayed results log_matches(querystring, offset, pagesize, matches)
def search(request):
    """Django view: run the 's' GET parameter against the Xapian index
    and render the matches via parliament/search.html."""
    searchdb = settings.XAPIAN_DB
    database = xapian.Database(searchdb)
    enquire = xapian.Enquire(database)
    # The search keyword comes from the 's' query parameter.
    query_string = request.GET.get('s') or ''
    response = HttpResponse(mimetype='text/html')
    # English-stemming parser with AND as the default operator.
    qp = xapian.QueryParser()
    qp.set_stemmer(xapian.Stem("english"))
    qp.set_database(database)
    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    qp.set_default_op(xapian.Query.OP_AND)
    # Boolean field prefixes used for filtering.
    for field, prefix in [('speaker', 'S'), ('major', 'M'), ('date', 'D'),
                          ('batch', 'B'), ('segment', 'U'),
                          ('department', 'G'), ('party', 'P'),
                          ('column', 'C'), ('gid', 'Q')]:
        qp.add_boolean_prefix(field, prefix)
    enquire.set_query(qp.parse_query(query_string))
    matches = enquire.get_mset(0, 20)

    # Lightweight record exposing rank/docid/path/percent to the template.
    class Hit:
        pass

    resultset = []
    for m in matches:
        hit = Hit()
        hit.rank = m.rank + 1
        hit.docid = m.docid
        hit.path = m.document.get_data()
        hit.percent = m.percent
        resultset.append(hit)
    t = loader.get_template('parliament/search.html')
    c = Context({
        'resultcount': matches.get_matches_estimated(),
        'results': resultset,
    })
    response.write(t.render(c))
    return response
def _init_xapian_search():
    """Populate the module-level Xapian_Enquires registry, one entry per model
    class in `models` that has an index directory on disk."""
    global Xapian_Enquires
    Xapian_Enquires = {}
    for model_name, model in inspect.getmembers(models, inspect.isclass):
        db_dir = os.path.join(XAPIAN_INDICES_DIR, model_name)
        if not os.path.exists(db_dir):
            # No index has been built for this model yet.
            continue
        db = xapian.Database(db_dir)
        enquire = xapian.Enquire(db)
        parser = xapian.QueryParser()
        parser.set_stemmer(xapian.Stem('english'))
        parser.set_database(db)
        parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
        Xapian_Enquires[model] = (db, enquire, parser)