Ejemplo n.º 1
0
    def __init__(self, path=None, name='master_timeline_weibo', stub=None, include_remote=False, schema=Schema, schema_version=SCHEMA_VERSION):
        """Open the backing database(s) and prepare a reusable Enquire.

        The databases to open are chosen as follows:

        * ``stub`` is a list: every element is treated as a stub file path.
        * ``stub`` is an existing file: that single stub file is opened.
        * ``stub`` is an existing directory: every entry inside it is opened
          as a stub file.
        * ``stub`` is falsy: every entry of ``path`` whose name starts with
          ``'_' + name`` is opened with ``_database``.

        When several databases are opened they are combined into the first
        one through ``add_database``.

        The schema is looked up as attribute ``'v<schema_version>'`` of
        ``schema``; if it contains ``'collapse_valueno'`` that value slot is
        used as the collapse key of the prepared Enquire.

        Raises:
            ValueError: if ``stub`` is given but is neither a list, an
                existing file nor an existing directory.  (Previously this
                case left ``self.database`` unset and failed later with an
                unrelated AttributeError.)
        """
        def merge(db1, db2):
            # Fold db2 into db1 so reduce() ends up with one combined database.
            db1.add_database(db2)
            return db1

        if stub:
            if isinstance(stub, list):
                # A list is assumed to consist solely of stub file paths.
                stub_files = stub
            elif os.path.isfile(stub):
                stub_files = [stub]
            elif os.path.isdir(stub):
                stub_files = [os.path.join(stub, p) for p in os.listdir(stub)]
            else:
                raise ValueError(
                    'stub must be a list of stub files, an existing stub file '
                    'or an existing directory, got %r' % (stub,))
            # With a single element reduce() simply returns that database.
            self.database = reduce(merge, map(_stub_database, stub_files))
        else:
            db_paths = [os.path.join(path, p)
                        for p in os.listdir(path)
                        if p.startswith('_%s' % name)]
            self.database = reduce(merge, map(_database, db_paths))

        self.schema = getattr(schema, 'v%s' % schema_version)
        enquire = xapian.Enquire(self.database)
        # Use the simplest weighting model to improve efficiency.
        enquire.set_weighting_scheme(xapian.BoolWeight())
        # The order of the mset does not matter.
        enquire.set_docid_order(xapian.Enquire.DONT_CARE)

        if 'collapse_valueno' in self.schema:
            enquire.set_collapse_key(self.schema['collapse_valueno'])

        self.enquire = enquire
        self.include_remote = include_remote
Ejemplo n.º 2
0
def select_weight(option):
    """Return a freshly constructed Xapian weighting scheme for ``option``.

    ``option`` is an integer from 0 to 14 selecting, in order: BB2,
    BM25Plus, BM25, Bool, Coord, DLH, DPH, IfB2, IneB2, InL2, LM, PL2Plus,
    PL2, TfIdf ("ntn") and Trad.  The constructor arguments are the fixed
    values listed in the table below.

    Raises:
        ValueError: if ``option`` is not one of the supported values.
            (Previously an unknown option surfaced as UnboundLocalError.)
    """
    # Factories are lazy so only the requested scheme is ever constructed.
    factories = {
        0: lambda: xapian.BB2Weight(1.0),
        1: lambda: xapian.BM25PlusWeight(1.0, 0, 1.0, 0.5, 0.5, 1.0),
        2: lambda: xapian.BM25Weight(1.0, 0.0, 1.0, 0.5, 0.3),
        3: lambda: xapian.BoolWeight(),
        4: lambda: xapian.CoordWeight(),
        5: lambda: xapian.DLHWeight(),  # maybe some problem
        6: lambda: xapian.DPHWeight(),
        7: lambda: xapian.IfB2Weight(1),
        8: lambda: xapian.IneB2Weight(1),
        9: lambda: xapian.InL2Weight(1),
        # the second parameter is TWO_STAGE_SMOOTHING
        10: lambda: xapian.LMWeight(0.0, 1, -1.0, -1.0),
        11: lambda: xapian.PL2PlusWeight(1, 0.8),
        12: lambda: xapian.PL2Weight(1),
        13: lambda: xapian.TfIdfWeight("ntn"),
        14: lambda: xapian.TradWeight(1.0),
    }

    try:
        factory = factories[option]
    except (KeyError, TypeError):
        # TypeError covers unhashable options such as lists.
        raise ValueError(
            "unknown weighting scheme option: %r (expected 0..14)" % (option,))

    return factory()
Ejemplo n.º 3
0
	def Search(self, command, blacklist=[], include=['feeds','entries'], since=0):
		"""Search the index and collect matching feeds and entries.

		Returns a tuple ``(feed_results, entry_results)``:

		* ``feed_results`` is a list of integer feed ids for matching
		  documents that carry no entry id.
		* ``entry_results`` is a list of
		  ``(entry_id, title, fakedate, feed_id)`` tuples for matching entry
		  documents whose date is later than ``since``.

		Documents whose feed id is in ``blacklist`` are skipped.  At most 100
		matches are examined.  If the index lock is currently held (an
		indexing run is in progress) two empty lists are returned at once.

		``include`` is accepted but is not consulted by this method.
		Results follow the order of the match set, which is requested in
		descending docid order with boolean weighting.
		"""

		if not self._index_lock.acquire(False):
			# If we are indexing, don't try to search.
			return ([],[])
		self._index_lock.release()

		database = xapian.Database(self._storeDir)
		enquire = xapian.Enquire(database)

		qp = xapian.QueryParser()
		stemmer = xapian.Stem("english")
		qp.set_stemmer(stemmer)
		qp.set_database(database)
		qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)

		enquire.set_docid_order(xapian.Enquire.DESCENDING)
		enquire.set_weighting_scheme(xapian.BoolWeight())

		feed_results = []
		entry_results = []

		query = qp.parse_query(command)
		enquire.set_query(query)
		matches = enquire.get_mset(0, 100)
		for m in matches:
			doc = m.document
			feed_id = int(doc.get_value(FEED_ID))
			try:
				if feed_id in blacklist:
					continue
				entry_id = doc.get_value(ENTRY_ID)
				# An empty entry id means the document describes a feed.
				# Test emptiness by value, not by identity with a literal.
				if not entry_id:
					feed_results.append(feed_id)
				else:
					title = doc.get_value(ENTRY_TITLE)
					# The stored date is scaled down by 1000 before it is
					# compared with `since` (presumably ms -> s; confirm
					# against the indexer).
					fakedate = float(doc.get_value(DATE)) / 1000.0
					if fakedate > since:
						entry_results.append((int(entry_id), title, fakedate, feed_id))
			except Exception as e:
				# Best effort: report the offending document and carry on.
				print(e)
				print(feed_id)
				print(blacklist)

		return (feed_results, entry_results)
Ejemplo n.º 4
0
    def search(self, query=None, sort_by=None, start_offset=0,
               max_offset=None, fields=None, count_only=False, **kwargs):
        """Run ``query`` against ``self.database`` and expose the hits lazily.

        Returns:
            * ``(0, <callable returning []>)`` when the parsed query is empty.
            * Whatever ``self._get_hit_count`` returns when ``count_only`` is
              true.
            * Otherwise ``(mset.size(), result_generator)``, where calling
              ``result_generator`` yields one item per match.

        Shape of each yielded item:
            * ``fields`` is None: the unpacked msgpack document data.
            * ``fields`` is empty or only ``'terms'``: a dict that holds a
              ``'terms'`` entry only when ``'terms'`` was requested; the
              document data is not unpacked at all.
            * otherwise: a dict with one key per requested field, taken from
              the unpacked data with ``.get`` (``'terms'`` is computed from
              the term list instead).

        ``'terms'`` maps each term starting with ``'XTEXT'`` (with that
        five-character prefix removed) to its wdf.
        """
        parsed = self.parse_query(query)
        if xapian.Query.empty(parsed):
            return 0, lambda: []

        db = self.database
        enq = xapian.Enquire(db)
        # Use the simplest weighting model to improve efficiency.
        enq.set_weighting_scheme(xapian.BoolWeight())
        # The order of the mset does not matter.
        enq.set_docid_order(xapian.Enquire.DONT_CARE)
        enq.set_query(parsed)

        if 'collapse_valueno' in self.schema:
            enq.set_collapse_key(self.schema['collapse_valueno'])

        if count_only:
            return self._get_hit_count(db, enq)

        if sort_by:
            self._set_sort_by(enq, sort_by)

        if not max_offset:
            max_offset = db.get_doccount() - start_offset

        mset = self._get_enquire_mset(db, enq, start_offset, max_offset)
        # Fetch up front to speed up access to remote databases.
        mset.fetch()

        def text_terms(document):
            # Map every XTEXT-prefixed term (prefix stripped) to its wdf.
            return {term.term[5:]: term.wdf
                    for term in document.termlist()
                    if term.term.startswith('XTEXT')}

        def result_generator():
            # True when nothing beyond 'terms' was requested (this includes
            # fields == [], in which case empty items are yielded).
            terms_only = fields is not None and set(fields) <= set(['terms'])
            for match in mset:
                if terms_only:
                    item = {}
                    if 'terms' in fields:
                        item['terms'] = text_terms(match.document)
                    yield item
                    continue

                record = msgpack.unpackb(self._get_document_data(db, match.document))
                if fields is None:
                    yield record
                    continue

                item = {}
                for field in fields:
                    if field == 'terms':
                        item['terms'] = text_terms(match.document)
                    else:
                        item[field] = record.get(field)
                yield item

        return mset.size(), result_generator
Ejemplo n.º 5
0
def XapLookup(query):
    """Look ``query`` up in the Xapian index and return the matching data.

    The query string is parsed without stemming, with AND as the default
    operator and with a fixed set of boolean field prefixes.  Results are
    sorted ascending by value slot 0, weighted with BoolWeight, and capped
    at 500 matches.  Returns a list holding the document data of each match.
    """
    import xapian

    db = xapian.Database("../../undata/xapdex.db/")  #sys.argv[1]
    enquire = xapian.Enquire(db)

    parser = xapian.QueryParser()
    parser.set_stemming_strategy(xapian.QueryParser.STEM_NONE)
    parser.set_default_op(xapian.Query.OP_AND)

    # (field name in the query string, term prefix in the index)
    boolean_prefixes = (
        ("id", "I"),
        ("subid", "J"),
        ("class", "C"),
        ("name", "S"),
        ("nation", "N"),
        ("language", "L"),
        ("document", "D"),
        ("reference", "R"),
        ("date", "E"),
        ("agenda", "A"),
        ("vote", "V"),
        ("session", "Z"),
    )
    for field, prefix in boolean_prefixes:
        parser.add_boolean_prefix(field, prefix)

    # Stop words in scraper/xapdex.py must match those here.
    # NOTE: the range stops before 'z', so 'z' and every pair containing 'z'
    # are not stop words; kept as-is because it has to agree with the indexer.
    letters = [chr(code) for code in range(ord('a'), ord('z'))]
    stopper = xapian.SimpleStopper()
    stopper.add('the')
    for first in letters:
        stopper.add(first)
        for second in letters:
            stopper.add(first + second)
    parser.set_stopper(stopper)

    # Numeric parser flags; the original notes that they allow wildcards.
    parse_flags = 16 + 4 + 2 + 1
    parsed = parser.parse_query(query, parse_flags)

    enquire.set_query(parsed)
    enquire.set_sort_by_value(0, xapian.Enquire.ASCENDING)
    enquire.set_weighting_scheme(xapian.BoolWeight())

    # XXX 500 as constant is dodgy here
    hits = enquire.get_mset(0, 500)
    # Index 4 of a match is used as the document.
    return [hit[4].get_data() for hit in hits]
Ejemplo n.º 6
0
def search(query, active_element, numresults=10):
    """Search the database for documents matching every word of ``query``.

    ``query`` maps field names to whitespace-separated search words.  Each
    word becomes a ``field:word`` clause and all clauses are joined with
    ``AND``.  The clauses of ``active_element`` are placed last in the query
    string, which is parsed with the boolean, partial and wildcard flags.

    Returns a list with the JSON-decoded data of at most ``numresults``
    matching documents.
    """
    # False sorts before True and sorted() is stable, so the active field
    # moves to the end while the other fields keep their order.
    ordered_fields = sorted(query, key=lambda name: name == active_element)

    # XXX There should be a way to do this without going through an
    # intermediate string, and without adding prefixes.
    clauses = []
    for name in ordered_fields:
        for word in query[name].split():
            clauses.append('%s:%s' % (name, word))
    querystring = ' AND '.join(clauses)

    db = xapian.Database(dbpath)
    parser = xapian.QueryParser()
    parser.set_database(db)
    for field, abbrev in fields.items():
        parser.add_prefix(field, abbrev)

    flags = (parser.FLAG_BOOLEAN |
             parser.FLAG_PARTIAL |
             parser.FLAG_WILDCARD)
    parsed = parser.parse_query(querystring, flags)

    enquire = xapian.Enquire(db)
    enquire.set_weighting_scheme(xapian.BoolWeight())
    enquire.set_query(parsed)

    hits = enquire.get_mset(0, numresults)
    return [json.loads(hit.document.get_data()) for hit in hits]
Ejemplo n.º 7
0
# Configure the parser: no stemming, AND as the default operator, and one
# boolean prefix per searchable field (query-string name -> term prefix).
xapian_query.set_stemming_strategy(xapian.QueryParser.STEM_NONE)
xapian_query.set_default_op(xapian.Query.OP_AND)
xapian_query.add_boolean_prefix("id", "I")
xapian_query.add_boolean_prefix("subid", "J")
xapian_query.add_boolean_prefix("class", "C")
xapian_query.add_boolean_prefix("name", "S")
xapian_query.add_boolean_prefix("nation", "N")
xapian_query.add_boolean_prefix("language", "L")
xapian_query.add_boolean_prefix("document", "D")
xapian_query.add_boolean_prefix("reference", "R")
xapian_query.add_boolean_prefix("date", "E")
xapian_query.add_boolean_prefix("agenda", "A")
xapian_query.add_boolean_prefix("vote", "V")
xapian_query.add_boolean_prefix("session", "Z")

# Named flags instead of the magic sum 16 + 4 + 2 + 1 (same value, 23).
parsed_query = xapian_query.parse_query(
    query,
    xapian.QueryParser.FLAG_WILDCARD      # 16: allows wildcards
    | xapian.QueryParser.FLAG_LOVEHATE    # 4
    | xapian.QueryParser.FLAG_PHRASE      # 2
    | xapian.QueryParser.FLAG_BOOLEAN)    # 1
# Single-argument print(...) with %-formatting produces the same output
# under Python 2 and is also valid syntax under Python 3.
print("desc: %s" % parsed_query.get_description())

xapian_enquire.set_query(parsed_query)
xapian_enquire.set_sort_by_value(0, xapian.Enquire.ASCENDING)
xapian_enquire.set_weighting_scheme(xapian.BoolWeight())

# do sorting etc. here

matches = xapian_enquire.get_mset(0, 500)
print(matches.size())
for match in matches:
    # Index 4 of a match is used as the document.
    print("%s %s" % (match[4].get_value(0), match[4].get_data()))