Example #1
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------------'
            print 'title:', doc.get('title')
            print 'url:', doc.get('url')
            print 'src:', doc.get('src')
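Note: nearly every example on this page calls a parseCommand helper that the excerpts do not include. A minimal sketch of what it plausibly does, assuming queries are whitespace-separated tokens where 'field:value' selects a field and everything else falls under a default field (the field list and default are assumptions, not the original code):

def parseCommand(command, default_field='contents'):
    # Hypothetical reconstruction: 'field:value' tokens become per-field
    # clauses; remaining words are gathered under the default field.
    allowed_fields = set(['contents', 'title', 'url', 'site'])
    command_dict = {}
    plain_words = []
    for token in command.split():
        if ':' in token:
            field, value = token.split(':', 1)
            if field in allowed_fields and value:
                command_dict[field] = (command_dict.get(field, '') + ' ' + value).strip()
                continue
        plain_words.append(token)
    if plain_words:
        command_dict[default_field] = ' '.join(plain_words)
    return command_dict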
Example #2
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command 
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------------'
            print 'title:', doc.get('title')
            print 'url:', doc.get('url')
            print 'src:', doc.get('src')
Example #3
 def get_or_query(self, queries):
     """Creates an OR Boolean query from multiple Lucene queries."""
     # empty boolean query with Similarity.coord() disabled
     bq = BooleanQuery(False)
     for q in queries:
         bq.add(q, BooleanClause.Occur.SHOULD)
     return bq
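A usage sketch, assuming `helper` is an instance of the class this method belongs to (the instance name and field values are illustrative):

q1 = TermQuery(Term('title', 'lucene'))
q2 = TermQuery(Term('title', 'search'))
or_query = helper.get_or_query([q1, q2])  # matches documents containing either term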
Example #4
    def more_like_this(self, film, count=4):
        """
        Use query by document techniques to find related documents
        :param film: film
        :param count: number of results
        :return: a list of related films
        """
        # Retrieve doc id of the given film
        film_query = TermQuery(Term('id', str(film.film_id)))
        results = self.searcher.search(film_query, 1)
        if results.totalHits != 1:
            return []

        # Use the MoreLikeThis "query by document" technique.
        # The excerpt references an undefined `reader`; presumably the
        # searcher's underlying index reader.
        mlt = MoreLikeThis(self.searcher.getIndexReader())
        mlt.setFieldNames(["title", "director", "writer", "genre", "cast", "fullplot"])
        mlt.setMinTermFreq(0)
        mlt.setMinDocFreq(0)
        mlt.setAnalyzer(self.analyzer)
        mlt_query = mlt.like(results.scoreDocs[0].doc)

        # Filter the original film
        filtered_query = BooleanQuery()
        filtered_query.add(mlt_query, BooleanClause.Occur.MUST)
        filtered_query.add(film_query, BooleanClause.Occur.MUST_NOT)
        score_docs = self.searcher.search(filtered_query, count).scoreDocs

        return self._retrieve_in_order(score_docs)
Example #5
def run_pic(valueFromOut, searcher, analyzer):
    command = valueFromOut

    seg_list = jieba.cut(command)
    command = " ".join(seg_list)
    if command == '':
        return

    result = []
    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 10).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        partResult = {}

        partResult['title'] = doc.get('title')
        partResult['url'] = doc.get('url')
        partResult['imgurl'] = doc.get('imgurl')

        result.append(partResult)

    return result
Example #6
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command #朱莉与茱莉娅

        # final = jieba.cut(command)
        # query = QueryParser(Version.LUCENE_CURRENT, "contents",
        #                     analyzer).parse(' '.join(final))
        
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            if k == 'site':
                t = Term('url', '*' + v.strip() + '*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            #print 'path:', doc.get("path"), 'name:', doc.get("name"), 'site:', doc.get('site')
            print 'title:', doc.get('title'),
            print 'url:', doc.get('url')
Example #7
def search_kw(kw, mode):
    vm_env.attachCurrentThread()
    lists = []
    l = jieba.cut(kw)
    query = BooleanQuery()
    for i in l:
        ii = QueryParser(Version.LUCENE_CURRENT, "introduction",
                         analyzer).parse(i)
        query.add(ii, BooleanClause.Occur.MUST)
    if mode:
        sf = SortField("score", SortField.Type.STRING, True)
        s = Sort(sf)
    else:
        sf = SortField("comments", SortField.Type.FLOAT, True)
        s = Sort(sf)
    scoreDocs = searcher1.search(query, 20, s).scoreDocs
    for scoreDoc in scoreDocs:
        movie = []
        doc = searcher1.doc(scoreDoc.doc)
        ####
        movie.append(doc.get("url"))
        movie.append(doc.get("picture"))
        movie.append(doc.get("title"))
        movie.append(doc.get("score"))
        movie.append(doc.get("genre"))
        movie.append(doc.get("stars"))
        movie.append(doc.get("comments"))
        #####
        lists.append(movie)

    return lists
Example #8
def func1(genre, year):
    vm_env.attachCurrentThread()
    lists = []
    query = BooleanQuery()
    if genre != "111":
        item = QueryParser(Version.LUCENE_CURRENT, "genre",
                           analyzer).parse(genre)
        query.add(item, BooleanClause.Occur.MUST)
    if year != "111":
        item = QueryParser(Version.LUCENE_CURRENT, "year",
                           analyzer).parse(year)
        query.add(item, BooleanClause.Occur.MUST)
    sf = SortField("score", SortField.Type.STRING, True)
    s = Sort(sf)
    scoreDocs = searcher1.search(query, 20, s).scoreDocs
    for scoreDoc in scoreDocs:
        movie = []
        doc = searcher1.doc(scoreDoc.doc)
        movie.append(doc.get("url"))
        movie.append(doc.get("picture"))
        movie.append(doc.get("title"))
        movie.append(doc.get("score"))
        movie.append(doc.get("genre"))
        movie.append(doc.get("stars"))
        movie.append(doc.get("comments"))
        lists.append(movie)
    return lists
Example #9
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'utf-8')
        if command == '':
            return

        print "Searching for:", command
        
        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            print k, v
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 10).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
##            explanation = searcher.explain(query, scoreDoc.doc)
            print "------------------------"
            print 'path:', doc.get("path")
            print 'name:', doc.get("name")
            print 'title:', doc.get('title')
            print 'url:', doc.get("url")
Example #10
def main():
    _vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    query = BooleanQuery()
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    query.add(TermQuery(Term('type', 'user')), BooleanClause.Occur.MUST)
    i = 0
    with zh_iatd.create_searcher() as searcher:
        with open('pagerank_data.txt', 'w') as fout:
            reslst = searcher.searcher.search(query, 100)
            initval = 1.0 / reslst.totalHits
            while len(reslst.scoreDocs) > 0:
                for x in reslst.scoreDocs:
                    realdoc = searcher.searcher.doc(x.doc)
                    obj = document_to_obj(realdoc)
                    if obj.data.followed_users is not None:
                        print '{0:8}'.format(i), '  user', obj.index, len(
                            obj.data.followed_users)
                        fout.write('{0}\t{1}\t{2}\n'.format(
                            obj.index, initval, ' '.join(
                                (x.encode('utf8')
                                 for x in obj.data.followed_users))))
                    else:
                        print '{0:8}'.format(i), 'I user', obj.index
                    i += 1
                reslst = searcher.searcher.searchAfter(reslst.scoreDocs[-1],
                                                       query, 100)
Example #11
    def search(**kwargs):
        vm_env.attachCurrentThread()
        query = BooleanQuery() 

        print("Searched keywords:")
        for field_name, keywords in kwargs.items():
            # assert field_name in SearchConfig.searchable_fields

            # keywords = list(filter(None, jieba.cut(keywords, cut_all=True)))
            keywords = list(filter(None, (k.strip() for k in jieba.cut_for_search(keywords))))
            
            for kw in keywords:
                print(kw)

            # construct query
            for kw in keywords:
                q = QueryParser(Version.LUCENE_CURRENT, field_name, analyzer).parse(kw)
                query.add(q, BooleanClause.Occur.SHOULD)

            if field_name == 'keywords':
                for kw in keywords:
                    q = QueryParser(Version.LUCENE_CURRENT, 'ent_name', analyzer).parse(kw)
                    query.add(q, BooleanClause.Occur.SHOULD)

        # search
        scoreDocs = searcher.search(query, 50).scoreDocs

        return [retrieve(searcher.doc(scoreDoc.doc)) for scoreDoc in scoreDocs]
Example #12
    def ch_seach(self,
                 command_dict,
                 target_range=None,
                 targets=('title', 'author', 'text', 'likes', 'imgurl',
                          'label')):
        res = []

        querys = BooleanQuery()
        for key, value in command_dict.items():
            if key not in ['author', 'title', 'label', 'content']:
                continue
            query = QueryParser(Version.LUCENE_CURRENT, key,
                                self.Analyzer).parse(utils.jieba_seg(value[0]))
            if value[1]:
                querys.add(query, BooleanClause.Occur.MUST)
            else:
                querys.add(query, BooleanClause.Occur.SHOULD)
        totalDocs = self.chSearcher.search(querys, utils.MAX_RESULTS).scoreDocs

        total_match = len(totalDocs)
        if target_range is None:
            scoreDocs = totalDocs[:]
        else:
            lo = max(0, int(target_range[0]))
            hi = min(total_match, int(target_range[1]))
            scoreDocs = totalDocs[lo:hi]
        del totalDocs

        for i, scoreDoc in enumerate(scoreDocs):
            doc = self.chSearcher.doc(scoreDoc.doc)
            res.append({key: doc.get(key) for key in targets})

        return total_match, res
Example #13
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'utf-8')

        if command == '':
            return

        command_dict = parseCommand(command)

        seg_list = jieba.cut(command_dict['contents'])
        command_dict['contents'] = (" ".join(seg_list))
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        print
        print "Searching for:", command

        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for i, scoreDoc in enumerate(scoreDocs):
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), \
                '\nname:', doc.get("name"), \
                '\ntitle:', doc.get("title"), \
                "url:",doc.get("url"), \
                "\nsite:",doc.get("site"), "\n"
Example #14
 def get_or_query(self, queries):
     """Creates an OR Boolean query from multiple Lucene queries."""
     # empty boolean query with Similarity.coord() disabled
     bq = BooleanQuery(False)
     for q in queries:
         bq.add(q, BooleanClause.Occur.SHOULD)
     return bq
Example #15
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return
        print
        print "Searching for:", command

        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(querys, 50).scoreDocs

        finalDocTitles = []
        for i, scoreDoc in enumerate(scoreDocs):
            doc = searcher.doc(scoreDoc.doc)
            if (doc.get("title") not in finalDocTitles):
                print 'title:', doc.get("title"), 'url:', doc.get("url"), \
                    'score:', scoreDoc.score, 'contents:', doc.get('contents')
                finalDocTitles.append(doc.get("title"))
            # print 'explain:', searcher.explain(query, scoreDoc.doc)
        print "%s total matching documents." % len(finalDocTitles)
Example #16
def run_img(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index2"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    querys = BooleanQuery()
    query_content = QueryParser(Version.LUCENE_CURRENT, "urlcontent",
                                analyzer).parse(command)
    query_title = QueryParser(Version.LUCENE_CURRENT, "title",
                              analyzer).parse(command)
    querys.add(query_content, BooleanClause.Occur.SHOULD)
    querys.add(query_title, BooleanClause.Occur.SHOULD)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    if len(scoreDocs) == 0:
        print "WARNING: No result"
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print doc.get("title")
        data = {}
        data['title'] = doc.get('title')
        data['url'] = doc.get('url')
        data['imgurl'] = doc.get('imgurl')
        result.append(data)
    return result
Example #17
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            print "=== [ QUIT ] ==="
            return

        print
        print "Searching for:", command

        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            if 'contents' == k:
                v = " ".join(jieba.cut(v))
            if DEBUG_MODE:
                print k, v
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, MAX_ITEMS_PER_PAGE).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for idx, scoreDoc in enumerate(scoreDocs):
            doc = searcher.doc(scoreDoc.doc)
            # # explanation = searcher.explain(query, scoreDoc.doc)
            print "-- #", str(idx + 1), "--"
            print '\ttitle:\t', doc.get("title")
            print '\turl:\t', doc.get("url")
            print '\tpath:\t', doc.get("path")
            print '\tname:\t', doc.get("name")
            print
Example #18
def do_mapping(line):
    regex = re.match(r"(?P<netflix_id>[0-9]+),(?P<year>([0-9]+)|NULL),(?P<title>.+)", line)
    if not regex:
        raise ValueError(line)
    netflix_id = int(regex.group("netflix_id"))

    title = QueryParser.escape(regex.group("title"))
    query1 = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(title)

    year = regex.group("year")
    if year == "NULL":
        scoreDocs = searcher.search(query1, 1).scoreDocs
    else:
        year = int(year)

        query2 = NumericRangeQuery.newIntRange("year", year, year, True, True)
        booleanQuery = BooleanQuery()
        booleanQuery.add(query1, BooleanClause.Occur.MUST)
        booleanQuery.add(query2, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(booleanQuery, 1).scoreDocs

    if scoreDocs:
        if scoreDocs[0].score > 1.5:
            doc = searcher.doc(scoreDocs[0].doc)
            doc_id = doc.getField("id").stringValue()
            doc.add(StringField("netflix_id", str(netflix_id), Field.Store.YES))
            writer.updateDocument(Term("id", doc_id), doc)
Example #19
 def createDrilldownQuery(self, luceneQuery, drilldownQueries):
     q = BooleanQuery(True)
     if luceneQuery:
         q.add(luceneQuery, BooleanClause.Occur.MUST)
     for field, path in drilldownQueries:
         q.add(TermQuery(self._fieldRegistry.makeDrilldownTerm(field, path)), BooleanClause.Occur.MUST)
     return q
Example #20
def text_search(command):
    envir.vm_env.attachCurrentThread()
    command_dict = parseCommand(command, "contents")
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k,
                            envir.analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = envir.text_searcher.search(querys, 30).scoreDocs
    res = []

    # build the highlighting query from the free-text part of the command;
    # the original reused the leaked loop variable `k`, but the field it
    # parses is "contents", so name it explicitly
    query_highlight = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  envir.analyzer).parse(command_dict["contents"])
    myhighlighter = Highlighter(
        SimpleHTMLFormatter(), QueryScorer(query_highlight))
    myhighlighter.setTextFragmenter(SimpleFragmenter(50))
    for scoreDoc in scoreDocs:
        # find texts which are around the keyword
        doc = envir.text_searcher.doc(scoreDoc.doc)
        text = doc.get("contents")
        key_text = "".join((myhighlighter.getBestFragments(
            envir.analyzer, "contents", text, 3)))
        key_text = re.sub(r'\s', '', key_text)
        temp = [doc.get("title"), doc.get('url'), key_text]
        res.append(temp)
    return res
Example #21
	def rewrite(data_string):
		data=json.loads(data_string)
		toupdate=json.loads(update)
		#primary_key_modified=False

		#delete the appropriate document
		query=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		

		#modify the values
		for key,value in toupdate.items():
			#if the key is absent, either add it to data or ignore it, depending on add_field_if_not_exists (True by default)
			if add_field_if_not_exists==False:
				if key in data.keys():
					data[key]=value
			else:		
				data[key]=value

		#this deletion is intentionally deferred to here:
		#the update proceeds only if the modified data's primary keys do not already exist
		primary_key_update=False
		for key in toupdate.keys():
			if key in primary_keys_map:
				primary_key_update=True
				break
		if primary_key_update == True:
			query_search=BooleanQuery()
			for key in primary_keys_map:
				temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
				query_search.add(BooleanClause(temp,BooleanClause.Occur.MUST))
			hits=searcher.search(query_search,MAX_RESULTS).scoreDocs
			if len(hits) > 0:
				return 106			
		writer.deleteDocuments(query)

		#add the newly modified document
		doc=Document()
		#index files wrt primary key
		for primary_key in primary_keys_map:
			try:
				field=Field(primary_key,data[primary_key],Field.Store.NO,Field.Index.ANALYZED)
				doc.add(field)
			except:
				# primary_keys_map.pop(collection_name)
				return 101
		#compress data using snappy if compression is on		
		if to_be_compressed_input==True:
			temp=json.dumps(data)
			data_string=base64.b64encode(snappy.compress(temp))
		else:
			temp=json.dumps(data)
			data_string=base64.b64encode(temp)

		field=Field("$DATA$",data_string,Field.Store.YES,Field.Index.ANALYZED)
		doc.add(field)
		writer.addDocument(doc)
Example #22
    def testFlat(self):

        q = BooleanQuery()
        q.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
        q.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
        q.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
        q.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
        self.assertEqual(1, self.search(q))
Example #23
 def visitSCOPED_CLAUSE(self, node):
     clause = CqlVisitor.visitSCOPED_CLAUSE(self, node)
     if len(clause) == 1:
         return clause[0]
     lhs, operator, rhs = clause
     query = BooleanQuery()
     query.add(lhs, LHS_OCCUR[operator])
     query.add(rhs, RHS_OCCUR[operator])
     return query
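The LHS_OCCUR and RHS_OCCUR tables are not part of this excerpt. A sketch consistent with the boolean-output tests further down the page ('cats AND dogs' maps to MUST/MUST, 'cats OR dogs' to SHOULD/SHOULD, 'cats NOT dogs' to MUST/MUST_NOT):

LHS_OCCUR = {
    'AND': BooleanClause.Occur.MUST,
    'OR': BooleanClause.Occur.SHOULD,
    'NOT': BooleanClause.Occur.MUST,
}
RHS_OCCUR = {
    'AND': BooleanClause.Occur.MUST,
    'OR': BooleanClause.Occur.SHOULD,
    'NOT': BooleanClause.Occur.MUST_NOT,
}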
Example #24
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    Multifield search: a different query string for each field, rather than
    the same words applied to every field.
    :param q_string: query string for the 'text' field
    :param q_class: query string for the 'corpus_name' field
    :param feature_type: list of functions applied to the list of hit scores
    :param use_BM25: switch the searcher to BM25 similarity
    :return: retrieval scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur:
    #   MUST   - the clause must match
    #   SHOULD - the clause may match and contributes to the score
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
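A hypothetical call, assuming the index contains a 'physics' corpus and returns at least one hit (the query strings and reducer functions are illustrative):

features = [max, lambda scores: sum(scores) / max(len(scores), 1)]
scores = lucene_retrieval_multifield('why is the sky blue', 'physics',
                                     features, use_BM25=True)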
Example #25
 def visitSCOPED_CLAUSE(self, node):
     clause = CqlVisitor.visitSCOPED_CLAUSE(self, node)
     if len(clause) == 1:
         return clause[0]
     lhs, operator, rhs = clause
     query = BooleanQuery()
     query.add(lhs, LHS_OCCUR[operator])
     query.add(rhs, RHS_OCCUR[operator])
     return query
Example #26
def firstsearch(searcher, analyzer, command):
    if len(command.split()) > 1:
        return []
    querys = BooleanQuery()
    query = QueryParser(Version.LUCENE_CURRENT, "name_not_cut",
                        analyzer).parse(command)
    querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 1000).scoreDocs
    return scoreDocs
Example #27
def search_dianping(province, kind, query):
    STORE_DIR = "index"
    vm_env.attachCurrentThread()
    #base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    allowed_opt = ['food', 'foodshop']

    if kind not in allowed_opt:
        return None
    if query == '':
        return None

    command = '%s:%s province:%s' % (kind, query, province)
    command = unicode(command, 'utf8', 'ignore')
    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    #compare ratings to find the top-ranked shop
    max_rank = 0
    best_shop = ''
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        cur_shop = doc.get("foodshop").split()[-1]
        cur_rank = float(doc.get('rank'))
        if cur_rank > max_rank:
            max_rank = cur_rank
            best_shop = cur_shop

    result = {}
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        cur_shop = doc.get("foodshop").split()[-1]
        cur_rank = float(doc.get('rank'))

        if cur_rank == max_rank:
            result['name'] = cur_shop.encode('utf8', 'ignore')
            result['rank'] = doc.get('rank').encode('utf8', 'ignore')
            result['food'] = doc.get('food').encode('utf8', 'ignore')
            result['location'] = doc.get('location').encode('utf8', 'ignore')
            result['tel'] = doc.get('tel').encode('utf8', 'ignore')
            result['environment_score'] = doc.get('environment_score').encode(
                'utf8', 'ignore')
            result['flavour_score'] = doc.get('flavour_score').encode(
                'utf8', 'ignore')
            result['service_score'] = doc.get('service_score').encode(
                'utf8', 'ignore')
            result['price_level'] = doc.get('price_level').encode(
                'utf8', 'ignore')

    del searcher
    return result
Example #28
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."

        command = raw_input("Query:")
        #command = unicode(command, 'GBK')
        command = unicode(command, 'utf8')
        if command == '':
            return

        print
        print 'searching for : ' + command
        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        #compare ratings to find the top-ranked shop
        max_rank = 0
        best_shop = ''
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            cur_shop = doc.get("foodshop").split()[-1]
            cur_rank = float(doc.get('rank'))
            if cur_rank > max_rank:
                max_rank = cur_rank
                best_shop = cur_shop
        result = {}
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            cur_shop = doc.get("foodshop").split()[-1]
            cur_rank = float(doc.get('rank'))

            if cur_rank == max_rank:
                result['name'] = cur_shop.encode('utf8', 'ignore')
                result['rank'] = doc.get('rank').encode('utf8', 'ignore')
                result['food'] = doc.get('food').encode('utf8', 'ignore')
                result['location'] = doc.get('location').encode(
                    'utf8', 'ignore')
                result['tel'] = doc.get('tel').encode('utf8', 'ignore')
                result['environment_score'] = doc.get(
                    'environment_score').encode('utf8', 'ignore')
                result['flavour_score'] = doc.get('flavour_score').encode(
                    'utf8', 'ignore')
                result['service_score'] = doc.get('service_score').encode(
                    'utf8', 'ignore')
                result['price_level'] = doc.get('price_level').encode(
                    'utf8', 'ignore')
        print result
Example #29
 def testCollectScoresWithNoResultAndBooleanQueryDoesntFailOnFakeScorerInAggregateScoreCollector(self):
     q = BooleanQuery()
     q.add(luceneQueryFromCql('M=true'), BooleanClause.Occur.SHOULD)
     q.add(luceneQueryFromCql('M=true'), BooleanClause.Occur.SHOULD)
     q = ComposedQuery('coreA', query=q)
     q.start = 0
     q.stop = 0
     q.setRankQuery(core='coreC', query=luceneQueryFromCql('S=true'))
     q.addMatch(dict(core='coreA', uniqueKey=KEY_PREFIX+'A'), dict(core='coreC', key=KEY_PREFIX+'C'))
     result = returnValueFromGenerator(self.dna.any.executeComposedQuery(q))
     self.assertEquals(4, result.total)
     self.assertEquals([], result.hits)
Example #30
def do_query(property, qstring, limit=10):
    query = BooleanQuery()
    stream = analyzer.tokenStream(property, StringReader(qstring))
    stream.reset()
    attr = stream.getAttribute(CharTermAttribute)

    while stream.incrementToken():
        term = attr.toString()
        termQuery = TermQuery(Term(property, term))
        query.add(termQuery, Occur.SHOULD)

    hits = searcher.search(query, None, limit).scoreDocs
    return [Document(searcher.doc(hit.doc)) for hit in hits]
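A usage sketch, assuming the module-level searcher and analyzer this snippet relies on are already initialized and that the index has a 'label' field (the field name is illustrative):

top_docs = do_query('label', 'boolean query example', limit=5)
for d in top_docs:
    print d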
Example #31
    def testOutOfOrderDocsScoringSort(self):
        """
        Two Sort criteria to instantiate the multi/single comparators.
        """

        sorts = [Sort(SortField.FIELD_DOC), Sort()]

        tfcOptions = [[False, False, False],
                      [False, False, True],
                      [False, True, False],
                      [False, True, True],
                      [True, False, False],
                      [True, False, True],
                      [True, True, False],
                      [True, True, True]]

        actualTFCClasses = [
            "OutOfOrderOneComparatorNonScoringCollector", 
            "OutOfOrderOneComparatorScoringMaxScoreCollector", 
            "OutOfOrderOneComparatorScoringNoMaxScoreCollector", 
            "OutOfOrderOneComparatorScoringMaxScoreCollector", 
            "OutOfOrderOneComparatorNonScoringCollector", 
            "OutOfOrderOneComparatorScoringMaxScoreCollector", 
            "OutOfOrderOneComparatorScoringNoMaxScoreCollector", 
            "OutOfOrderOneComparatorScoringMaxScoreCollector" 
        ]
    
        bq = BooleanQuery()

        # Add a Query with SHOULD, since bw.scorer() returns BooleanScorer2
        # which delegates to BS if there are no mandatory clauses.
        bq.add(MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)

        # Set minNrShouldMatch to 1 so that BQ will not optimize rewrite to
        # return the clause instead of BQ.
        bq.setMinimumNumberShouldMatch(1)

        for sort in sorts:
            for tfcOption, actualTFCClass in izip(tfcOptions,
                                                  actualTFCClasses):
                tdc = TopFieldCollector.create(sort, 10, tfcOption[0],
                                               tfcOption[1], tfcOption[2],
                                               False)

                self.assert_(tdc.getClass().getName().endswith("$" + actualTFCClass))
          
                self.full.search(bq, tdc)
          
                tds = tdc.topDocs()
                sds = tds.scoreDocs  
                self.assertEqual(10, len(sds))
Example #32
    def testUnqualifiedTermFields(self):
        composer = LuceneQueryComposer(unqualifiedTermFields=[("field0", 0.2), ("field1", 2.0)], luceneSettings=LuceneSettings())
        ast = parseCql("value")
        result = composer.compose(ast)
        query = BooleanQuery()
        left = TermQuery(Term("field0", "value"))
        left.setBoost(0.2)
        query.add(left, BooleanClause.Occur.SHOULD)

        right = TermQuery(Term("field1", "value"))
        right.setBoost(2.0)
        query.add(right, BooleanClause.Occur.SHOULD)

        self.assertEquals(type(query), type(result))
        self.assertEquals(repr(query), repr(result))
Example #33
def func2(name):
    vm_env.attachCurrentThread()
    lists = []
    query = BooleanQuery()

    item = QueryParser(Version.LUCENE_CURRENT, "name", analyzer).parse(name)
    query.add(item, BooleanClause.Occur.MUST)
    scoreDocs = searcher2.search(query, 20).scoreDocs
    for scoreDoc in scoreDocs:
        movie = []  # renamed from `list` to avoid shadowing the builtin
        doc = searcher2.doc(scoreDoc.doc)
        movie.append(doc.get("picture"))
        movie.append(doc.get("url"))
        movie.append(doc.get("name"))
        lists.append(movie)
    return lists
Example #34
def delete(primary_keys_map,collection_name,todelete,commit=False):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT

	try:
		tofind_keyvalue_pairs=json.loads(todelete)
	except:
		return 100	
	

	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)

	#setting writer configurations
	try:
		config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
		writer=IndexWriter(direc,config)
		ireader=IndexReader.open(direc)
	except:
		return 105

	#for now, deletion of documents is supported only on indexed keys
	tofind_primary_keyvalue_pairs={}
	tofind_nonprimary_keyvalue_pairs={}

	#separating out primary and non_primary keys
	for key in tofind_keyvalue_pairs.keys():
		if key in primary_keys_map:
			tofind_primary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]
		else:
			tofind_nonprimary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]

	#filtering documents according to primary keys		
	query=BooleanQuery()
	for key in tofind_primary_keyvalue_pairs.keys():
		temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(tofind_primary_keyvalue_pairs[key])
		query.add(BooleanClause(temp,BooleanClause.Occur.MUST))

	a=writer.deleteDocuments(query)
	if commit==True:
		writer.commit()
	writer.close()
	return 0
Example #35
def delete(primary_keys_map, collection_name, todelete, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    #setting writer configurations
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except:
        return 105

    #for now, deletion of documents is supported only on indexed keys
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    #separating out primary and non_primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    #filtering documents according to primary keys
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key,
                           analyzer).parse(tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    a = writer.deleteDocuments(query)
    if commit == True:
        writer.commit()
    writer.close()
    return 0
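A hypothetical call, assuming an index directory named 'products' whose only primary key is 'id':

status = delete(['id'], 'products', json.dumps({'id': '42'}), commit=True)
# status is 0 on success, 100 for malformed JSON input,
# 105 if the index could not be opened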
Example #36
def search_trip(command):
    '''command must be encoded in unicode'''
    STORE_DIR = "index_trip"
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))

    folders = {
        'parsed_ctrip':
        ['source', 'location', 'introduction', 'score', 'img_list'],
        'parsed_qunar':
        ['location', 'rank', 'score', 'time', 'introduction', 'img_list'],
        'eic_mfw': ['location', 'introduction', 'img_list']
    }
    readers = constructReaders(folders)
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    print 'total: %s' % (len(scoreDocs))

    maxf = []
    maxrank = -1000.0
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        filename = doc.get('filename')
        rank = ranker(command_dict, getInfo(folders, readers, filename))
        if rank > maxrank:
            maxf = [filename]
            maxrank = rank
        elif rank == maxrank:
            maxf.append(filename)

    del searcher

    if len(maxf) == 0:
        print "error in searchtrip.py: no result while searching", command_dict.get(
            'location', '')
        return "Interior Error"
    elif len(maxf) != 1:
        print "warning in searchtrip.py: multiple results when searching", command_dict.get(
            'location', '')
    return getInfo(folders, readers, maxf[0])
Example #37
    def visitSEARCH_CLAUSE(self, node):
        # possible children:
        # CQL_QUERY
        # SEARCH_TERM
        # INDEX, RELATION, SEARCH_TERM
        firstChild = node.children[0].name
        results = CqlVisitor.visitSEARCH_CLAUSE(self, node)
        if firstChild == 'SEARCH_TERM':
            (unqualifiedRhs, ) = results
            if unqualifiedRhs == '*':
                return MatchAllDocsQuery()
            subQueries = []
            for fieldname, boost in self._unqualifiedTermFields:
                subQuery = self._termOrPhraseQuery(fieldname, unqualifiedRhs)
                if (isinstance(subQuery, PhraseQuery)
                        and not self._fieldRegistry.phraseQueryPossible(fieldname)):
                    continue
                subQuery.setBoost(boost)
                subQueries.append(subQuery)
            if len(subQueries) == 1:
                query = subQueries[0]
            else:
                query = BooleanQuery()
                for subQuery in subQueries:
                    query.add(subQuery, BooleanClause.Occur.SHOULD)
            return query
        elif firstChild == 'INDEX':
            (left, (relation, boost), right) = results
            if relation in ['==', 'exact'] or (
                    relation == '=' and self._fieldRegistry.isUntokenized(left)):
                query = TermQuery(self._createTerm(left, right))
            elif relation == '=':
                query = self._termOrPhraseQuery(left, right)
            elif relation in ['<', '<=', '>=', '>']:
                query = self._termRangeQuery(left, relation, right)
            else:
                raise UnsupportedCQL("'%s' not supported for the field '%s'" %
                                     (relation, left))

            query.setBoost(boost)
            return query
        else:
            ((query, ), ) = results
            return query
Example #38
    def testParenthesisMust2(self):

        q3 = BooleanQuery()
        q3.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
        q3.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
        q4 = BooleanQuery()
        q4.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
        q4.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
        q2 = BooleanQuery()
        q2.add(q3, BooleanClause.Occur.SHOULD)
        q2.add(q4, BooleanClause.Occur.MUST)
        self.assertEqual(1, self.search(q2))
Example #39
def run(searcher, analyzer, command):

    commandsplit = command.split()
    maxlen = len(commandsplit[0])
    maxindex = 0
    for i in range(len(commandsplit)):
        if maxlen < len(commandsplit[i]):
            maxlen = len(commandsplit[i])
            maxindex = i
    commands = " ".join(jieba.cut(command.split()[maxindex])).split()
    querys = BooleanQuery()
    for i in commands:
        try:
            query = QueryParser(Version.LUCENE_CURRENT, "name",
                                analyzer).parse(i)
            querys.add(query, BooleanClause.Occur.MUST)
        except:
            continue
    scoreDocs = searcher.search(querys, 50).scoreDocs
    if len(scoreDocs) == 0:
        querys = BooleanQuery()
        for i in commands:
            for j in i:
                try:
                    query = QueryParser(Version.LUCENE_CURRENT, "not_seg",
                                        analyzer).parse(j)
                    querys.add(query, BooleanClause.Occur.MUST)
                except:
                    continue
        scoreDocs = searcher.search(querys, 50).scoreDocs
    temp = []
    if len(scoreDocs) > 0:
        doc = searcher.doc(scoreDocs[0].doc)
        temp = [
            doc.get("org"),
            doc.get("path"),
            doc.get("price"),
            doc.get("imgsrc")
        ]
    else:
        temp = ['unknown'] * 4
    return temp
Example #40
    def visitSEARCH_CLAUSE(self, node):
        # possible children:
        # CQL_QUERY
        # SEARCH_TERM
        # INDEX, RELATION, SEARCH_TERM
        firstChild = node.children[0].name
        results = CqlVisitor.visitSEARCH_CLAUSE(self, node)
        if firstChild == 'SEARCH_TERM':
            (unqualifiedRhs,) = results
            if unqualifiedRhs == '*':
                return MatchAllDocsQuery()
            subQueries = []
            for fieldname, boost in self._unqualifiedTermFields:
                subQuery = self._termOrPhraseQuery(fieldname, unqualifiedRhs)
                if isinstance(subQuery, PhraseQuery) and not self._fieldRegistry.phraseQueryPossible(fieldname):
                    continue
                subQuery.setBoost(boost)
                subQueries.append(subQuery)
            if len(subQueries) == 1:
                query = subQueries[0]
            else:
                query = BooleanQuery()
                for subQuery in subQueries:
                    query.add(subQuery, BooleanClause.Occur.SHOULD)
            return query
        elif firstChild == 'INDEX':
            (left, (relation, boost), right) = results
            if relation in ['==', 'exact'] or (relation == '=' and self._fieldRegistry.isUntokenized(left)):
                query = TermQuery(self._createTerm(left, right))
            elif relation == '=':
                query = self._termOrPhraseQuery(left, right)
            elif relation in ['<', '<=', '>=', '>']:
                query = self._termRangeQuery(left, relation, right)
            else:
                raise UnsupportedCQL("'%s' not supported for the field '%s'" % (relation, left))

            query.setBoost(boost)
            return query
        else:
            ((query,),) = results
            return query
Example #41
def lucene_sample_query_parse(sampleq, ftypes):
    fields = []
    queries = []
    booleans = []
    bq = BooleanQuery()
    for query_tuple in sampleq:
        (field, op_, value) = re.split(snapconf.RANGE_QUERY_OPS, query_tuple)
        m = snapconf.RANGE_QUERY_FIELD_PATTERN.search(query_tuple)
        if m is None or field is None:
            continue
        op = m.group(1)
        if op not in snapconf.operators:
            sys.stderr.write("bad operator %s in range query, exiting\n" % (str(op)))
            sys.exit(-1)
        field_w_type = snapconf.SAMPLE_HEADER_FIELDS_TYPE_MAP[field]
        (fieldtypechar, ftype_method) = ftypes[field_w_type]
        #range query
        if fieldtypechar == 'i' or fieldtypechar == 'f':
            bq.add(lucene_range_query_parse(field_w_type, op, value, fieldtypechar, ftype_method), BOOLEAN_OCCUR)
        #phrase query
        elif ' ' in value or '\t' in value:
            pquery = PhraseQuery()
            for v in re.split(r'\s+', value):
                pquery.add(Term(field_w_type, v.lower()))
            #force exact phrase matching only
            pquery.setSlop(0)
            bq.add(pquery, BOOLEAN_OCCUR)
        #term query
        else:
            bq.add(TermQuery(Term(field_w_type, value.lower())), BOOLEAN_OCCUR)
        sys.stderr.write("value + fields: %s %s\n" % (value.lower(), field_w_type))
    return bq
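The setSlop(0) call above is what forces exact phrase matching. A small sketch of the knob, using a hypothetical indexed field 'text':

pq = PhraseQuery()
pq.add(Term('text', 'boolean'))
pq.add(Term('text', 'query'))
pq.setSlop(0)  # matches only the exact phrase "boolean query"
# pq.setSlop(2) would also allow up to two positions of movement,
# e.g. "boolean nested query"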
Example #42
def run(searcher, analyzer, command):
    command_dict = parseCommand(command)
    seg_list = jieba.cut(command_dict['contents'])
    command_dict['contents'] = (" ".join(seg_list))
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 50).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    # score highlight fragments against the full boolean query, not just
    # whichever clause the loop parsed last
    scorer = QueryScorer(querys)
    fragmenter = SimpleSpanFragmenter(scorer, 250)
    simpleHTMLFormatter = SimpleHTMLFormatter("<b>", "</b>")
    highlighter = Highlighter(simpleHTMLFormatter, scorer)
    highlighter.setTextFragmenter(fragmenter)

    results = []

    for i, scoreDoc in enumerate(scoreDocs):
        doc = searcher.doc(scoreDoc.doc)
        contents = doc.get("contents")
        if contents:
            tkStream = analyzer.tokenStream("contents", contents)
            highlight = highlighter.getBestFragment(tkStream, contents)
            highlightseg = highlight.split()
            highlight = ''.join(highlightseg)
            results.append(
                (doc.get("title").strip(), doc.get("url"), highlight))
        '''
        print 'path:', doc.get("path"), \
            '\nname:', doc.get("name"), \
            '\ntitle:', doc.get("title"), \
            "url:",doc.get("url"), \
            "\nsite:",doc.get("site"),\
            "\ncontent:",highlight,"\n"
        '''
        # print 'explain:', searcher.explain(query, scoreDoc.doc)
    return results
Example #43
    def _create_query(self, fields):
        """
        Build query with Term, Phrase and Fuzzy clauses.
        :param fields: dictionary of (field, text) tuples
        :return: query
        """
        query = BooleanQuery()
        for (field, text) in fields:
            if field.startswith("year"):
                start, end = text.split(",")
                numeric_query = NumericRangeQuery.newIntRange(
                    'year', int(start), int(end), True, True)
                query.add(BooleanClause(numeric_query, BooleanClause.Occur.MUST))
            if field == 'title':
                spans = []
                for word in text.lower().split():
                    spans.append(SpanTermQuery(Term(field, word)))
                query.add(BooleanClause(SpanNearQuery(spans, 2, True), BooleanClause.Occur.SHOULD))

        field_names, field_texts = zip(*fields)
        flags = [BooleanClause.Occur.MUST] * len(field_names)

        query_parser_query = MultiFieldQueryParser.parse(
            Version.LUCENE_CURRENT,
            field_texts,
            field_names,
            flags,
            StandardAnalyzer(Version.LUCENE_CURRENT))
        query.add(BooleanClause(query_parser_query, BooleanClause.Occur.MUST))

        fuzzify = lambda s: (s + " ").replace(" ", "~1 ")
        fuzzy_field_texts = map(fuzzify, field_texts)

        fuzzy_query_parser_query = MultiFieldQueryParser.parse(
            Version.LUCENE_CURRENT,
            fuzzy_field_texts,
            field_names,
            flags,
            StandardAnalyzer(Version.LUCENE_CURRENT))
        query.add(BooleanClause(fuzzy_query_parser_query, BooleanClause.Occur.MUST))

        boostQuery = FunctionQuery(
            LinearFloatFunction(
                PowFloatFunction(
                    DoubleConstValueSource(0.0001),
                    ScaleFloatFunction(IntFieldSource("imdb_votes_boost"), 0.0, 1.0)
                ), -1.0, 1.0))
        query = CustomScoreQuery(query, boostQuery)

        return query
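The fuzzify helper above appends an edit distance of 1 to every whitespace-separated term. A quick worked example:

fuzzify = lambda s: (s + " ").replace(" ", "~1 ")
print(fuzzify("casablanca 1942"))  # -> 'casablanca~1 1942~1 '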
Example #44
    def perform_search(self, searchterm, results_per_page, page):
        # if there is a field in the searchterm
        """if ":" in searchterm:
            # processing a query
            parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)

            query = parser.parse(searchterm)

        else:
            query = BooleanQuery()
            query_title = TermQuery(Term("title", searchterm))
            query_description = TermQuery(Term("description", searchterm))
            query_content = TermQuery(Term("content", searchterm))

            #  BooleanClause.Occur.MUST for AND queries
            query.add(query_title, BooleanClause.Occur.SHOULD)
            query.add(query_description, BooleanClause.Occur.SHOULD)
            query.add(query_content, BooleanClause.Occur.SHOULD)"""

        # create QueryParser for each field to be searched
        parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
        parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
        parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)

        # put fields together
        query = BooleanQuery()
        query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)

        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))

        start = datetime.now()
        hits = searcher.search(query, results_per_page + (results_per_page * page))
        score_docs = hits.scoreDocs
        count_results = hits.totalHits
        duration = datetime.now() - start

        # results to return
        results = []
        count = 0

        for scoreDoc in score_docs:

            # skip offset
            if count < results_per_page * page:
                count += 1
                continue
            count += 1


            doc = searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue()) for field in doc.getFields())
            results.append(table)

        return results, duration, count_results
Example #45
def lucene_range_query_parse(query_string):
    '''parse the user's range query string into something pylucene can understand'''
    query = BooleanQuery()
    queries_ = query_string.split(snapconf.RANGE_QUERY_DELIMITER)
    start = None
    end = None
    start_inclusive = True
    end_inclusive = True
    for query_tuple in queries_:
        m = snapconf.RANGE_QUERY_FIELD_PATTERN.search(query_tuple)
        (col, op_, val) = re.split(snapconf.RANGE_QUERY_OPS, query_tuple)
        if not m or not col or col not in snapconf.TABIX_DBS or col not in snapconf.LUCENE_TYPES:
            continue
        op = m.group(1)
        if op not in snapconf.operators:
            sys.stderr.write("bad operator %s in range query, exiting\n" % (str(op)))
            sys.exit(-1)
        (ltype,ptype,qtype) = snapconf.LUCENE_TYPES[col]
        rquery = None
        if ptype == str:
            rquery = TermQuery(qtype(col,str(val)))
        else:
            #assume operator == '='
            (start, end) = (ptype(val), ptype(val))
            if op == '>=':
                end = None
            if op == '<=':
                start = None
            if op == '<':
                start = None
                end_inclusive = False
            if op == '>':
                end = None
                start_inclusive = False
            rquery = qtype(col, start, end, start_inclusive, end_inclusive)
        query.add(rquery, BooleanClause.Occur.MUST)
        #sys.stderr.write("query + fields: %s %s\n" % (query,field))
    return query
Example #46
    def extract_phrase_query(self, q, field, slop=0, boost=5):
        phrases = re.findall(r'"([^"]*)"', q)
        if len(phrases) == 0:
            return None, q

        q = re.sub(r'"([^"]*)"', "", q).strip()  # query without phrases
        if self.verbose:
            print "Detected phrases: ", phrases

        bq = BooleanQuery()
        for phrase in phrases:
            # pq = PhraseQuery()
            # for term in filter(None, phrase.split(' ')):
            #     pq.add(Term(field, term))
            qparser = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer)
            # parse phrase - this may or may not be desired
            # pq = qparser.parse(field + ':"' + phrase + '"')
            pq = qparser.parse('%s "%s"~%d^%.1f' %
                               (phrase, phrase, slop, boost))
            # phrase queries have high priority
            bq.add(pq, BooleanClause.Occur.MUST)
            # bq.add(pq, BooleanClause.Occur.SHOULD)

        return bq, q
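A usage sketch, assuming `engine` is an instance of the class this method belongs to:

bq, rest = engine.extract_phrase_query('"barack obama" president', 'contents', slop=1)
# bq holds a boosted proximity query for "barack obama"; rest == 'president'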
Example #47
    def testBraces(self):
        self.assertConversion(TermQuery(Term('unqualified', 'cats')), '(cats)')
        innerQuery = BooleanQuery()
        innerQuery.add(TermQuery(Term('unqualified', 'cats')), BooleanClause.Occur.MUST)
        innerQuery.add(TermQuery(Term('unqualified', 'dogs')), BooleanClause.Occur.MUST)
        outerQuery = BooleanQuery()
        outerQuery.add(innerQuery, BooleanClause.Occur.SHOULD)
        outerQuery.add(TermQuery(Term('unqualified', 'mice')), BooleanClause.Occur.SHOULD)

        self.assertConversion(outerQuery, '(cats AND dogs) OR mice')
Example #48
 def _luceneQuery(self, prefix, sets=None, setsMask=None, partition=None):
     query = BooleanQuery()
     if prefix:
         query.add(TermQuery(Term(PREFIX_FIELD, prefix)), BooleanClause.Occur.MUST)
     if sets:
         setQuery = BooleanQuery()
         for setSpec in sets:
             setQuery.add(TermQuery(Term(SETS_FIELD, setSpec)), BooleanClause.Occur.SHOULD)
         query.add(setQuery, BooleanClause.Occur.MUST)
     for set_ in setsMask or []:
         query.add(TermQuery(Term(SETS_FIELD, set_)), BooleanClause.Occur.MUST)
     if partition:
         partitionQueries = []
         for start, stop in partition.ranges():
             partitionQueries.append(NumericRangeQuery.newIntRange(HASH_FIELD, start, stop, True, False))
         if len(partitionQueries) == 1:
             pQuery = partitionQueries[0]
         else:
             pQuery = BooleanQuery()
             for q in partitionQueries:
                 pQuery.add(q, BooleanClause.Occur.SHOULD)
         query.add(pQuery, BooleanClause.Occur.MUST)
     if query.clauses().size() == 0:
         query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
     return query
Example #49
 def testBooleanOrTermOutput(self):
     query = BooleanQuery()
     query.add(TermQuery(Term('unqualified', 'cats')), BooleanClause.Occur.SHOULD)
     query.add(TermQuery(Term('unqualified', 'dogs')), BooleanClause.Occur.SHOULD)
     self.assertConversion(query, 'cats OR dogs')
Example #50
    def testEquality(self):

        bq1 = BooleanQuery()
        bq1.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
        bq1.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)

        nested1 = BooleanQuery()
        nested1.add(TermQuery(Term("field", "nestedvalue1")), BooleanClause.Occur.SHOULD)
        nested1.add(TermQuery(Term("field", "nestedvalue2")), BooleanClause.Occur.SHOULD)
        bq1.add(nested1, BooleanClause.Occur.SHOULD)

        bq2 = BooleanQuery()
        bq2.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
        bq2.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)

        nested2 = BooleanQuery()
        nested2.add(TermQuery(Term("field", "nestedvalue1")), BooleanClause.Occur.SHOULD)
        nested2.add(TermQuery(Term("field", "nestedvalue2")), BooleanClause.Occur.SHOULD)
        bq2.add(nested2, BooleanClause.Occur.SHOULD)
        
        self.assert_(bq1.equals(bq2))
 def testBooleanNotTermOutput(self):
     query = BooleanQuery()
     query.add(TermQuery(Term('unqualified', 'cats')), BooleanClause.Occur.MUST)
     query.add(TermQuery(Term('unqualified', 'dogs')), BooleanClause.Occur.MUST_NOT)
     self.assertConversion(query, 'cats NOT dogs')
Example no. 52
 def addDuplicatesQuery(self, query):
     not_duplicate = TermQuery(Term('duplicate', 'false'))
     booleanQuery = BooleanQuery()
     booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
     booleanQuery.add(query, BooleanClause.Occur.MUST)
     return booleanQuery
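A one-line usage sketch (the 'title' term is illustrative):

# Wrap any base query so that only records indexed with duplicate=false can match.
query = self.addDuplicatesQuery(TermQuery(Term('title', 'lucene')))
# roughly: +duplicate:false +title:lucene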
def search(primary_keys_map,to_be_compressed_input,collection_name,tofind,MAX_RESULTS=1000):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT
	try:
		print "********" + tofind
		tofind_keyvalue_pairs=json.loads(tofind)
	except:
		return 100	
	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)
	try:
		ireader=IndexReader.open(direc)	
		searcher=IndexSearcher(ireader)
	except:
		return 105

	#initializing return list 
	return_list=[]
	#check_list=[]
	tofind_primary_keyvalue_pairs={}
	tofind_nonprimary_keyvalue_pairs={}

	#separating out primary and non_primary keys
	for key in tofind_keyvalue_pairs.keys():
		if key in primary_keys_map:
			tofind_primary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]
		else:
			tofind_nonprimary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]

	#filtering documents		
	if len(tofind_primary_keyvalue_pairs)>0:		
		query=BooleanQuery()
		for key in tofind_primary_keyvalue_pairs.keys():
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(tofind_primary_keyvalue_pairs[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		hits=searcher.search(query,MAX_RESULTS).scoreDocs
		for hit in hits:
			doc=searcher.doc(hit.doc)
			if to_be_compressed_input==True:
				data=snappy.uncompress(doc.get("$DATA$"))
			else:
				data=doc.get("$DATA$")
			#non primary key filtering(without having to load all the primary key filtered values into main memory!)	
			if len(tofind_nonprimary_keyvalue_pairs)>0:
				entry=json.loads(data)
				satisfied=True
				for key in tofind_nonprimary_keyvalue_pairs.keys():
					if entry.get(key)!=tofind_nonprimary_keyvalue_pairs[key]:
						satisfied=False
						break
				if satisfied==True:
					return_list.append(data)
			else:
				return_list.append(data)
			
	else:
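		#no primary-key filter given: scan every document id (assumes the reader holds no deletions)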
		for i in range(0,ireader.numDocs()):
			doc=searcher.doc(i)
			if to_be_compressed_input==True:
				data=snappy.uncompress(str(doc.get("$DATA$")))
			else:
				data=doc.get("$DATA$")

				
			#non primary key filtering(without having to load all the primary key filtered values into main memory!)	
			if len(tofind_nonprimary_keyvalue_pairs)>0:
				entry=json.loads(data)
				satisfied=True
				for key in tofind_nonprimary_keyvalue_pairs.keys():
					if entry.get(key)!=tofind_nonprimary_keyvalue_pairs[key]:
						satisfied=False
						break
				if satisfied==True:
					return_list.append(data)
			else:
				return_list.append(data)
			
	ireader.close()

	if len(return_list)==0:
		return None	
	else:
		return return_list 
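A hedged usage sketch for search — the 'id'/'status' fields are illustrative, and compression is assumed off:

# Hypothetical call: 'id' is a primary key and is resolved via Lucene;
# 'status' is non-primary and is filtered in Python afterwards.
results = search(['id'], False, "DEFAULT",
                 json.dumps({"id": "42", "status": "active"}))
if results in (100, 105):
    print "error code:", results
elif results is not None:
    for record in results:
        print json.loads(record)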
def get_query_results(reader,query,n,field):
    searcher = IndexSearcher(reader)
    hits = searcher.search(query, n).scoreDocs
    print("Found %d hits:" % len(hits))
    for i, hit in enumerate(hits):
        doc = searcher.doc(hit.doc)
        print("%d. %s" % (i + 1, doc.get(field)))

#### part(a)
query1a = TermQuery(Term("capital_html","greek"))
query2a = TermQuery(Term("capital_html","roman"))
query3a = TermQuery(Term("capital_html","persian"))

boolean_query_a = BooleanQuery()
boolean_query_a.add(query1a, BooleanClause.Occur.MUST)
boolean_query_a.add(query2a, BooleanClause.Occur.MUST)
boolean_query_a.add(query3a, BooleanClause.Occur.MUST_NOT)

get_query_results(reader,boolean_query_a,n_docs,"capital")

#Found 32 hits:
#1. https://en.wikipedia.org/wiki/Sukhumi
#2. https://en.wikipedia.org/wiki/Nicosia
#3. https://en.wikipedia.org/wiki/Nicosia
#4. https://en.wikipedia.org/wiki/Tiraspol
#5. https://en.wikipedia.org/wiki/Tripoli
#6. https://en.wikipedia.org/wiki/Tunis
#7. https://en.wikipedia.org/wiki/Lisbon
#8. https://en.wikipedia.org/wiki/Podgorica
#9. https://en.wikipedia.org/wiki/Cetinji
def update(primary_keys_map,to_be_compressed_input,collection_name,tofind,update,commit=False,add_field_if_not_exists=True):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	#As of now, update is implemented as: search, modify the data in the JSON record, delete, and re-write
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT
	try:
		tofind_keyvalue_pairs=json.loads(tofind)
	except:
		return 100	
	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)
	try:
		ireader=IndexReader.open(direc)	
		searcher=IndexSearcher(ireader)
		#setting writer configurations
		config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
		writer=IndexWriter(direc,config)
	except:
		return 105
	no_of_documents_modified=0	
	#finding the document to update
	#Scope for making this more efficient
	def rewrite(data_string):
		data=json.loads(data_string)
		toupdate=json.loads(update)
		primary_key_modified=False

		#query matching the current (pre-update) document, used for the deletion below
		query=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))

		#modify the values
		for key,value in toupdate.items():
			#if such a key is not present, we either add that key to data or just ignore it (controlled by add_field_if_not_exists; True by default)
			if add_field_if_not_exists==False and key not in data.keys():
				continue
			if key in primary_keys_map and data.get(key)!=value:
				primary_key_modified=True
			data[key]=value

		#the duplicate check is intentionally done before the deletion:
		#the update only proceeds if the modified primary keys do not already exist in the index
		#(skipped when no primary key changed, since the search would then match this very document)
		if primary_key_modified==True:
			query_search=BooleanQuery()
			for key in primary_keys_map:
				temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
				query_search.add(BooleanClause(temp,BooleanClause.Occur.MUST))
			hits=searcher.search(query_search,MAX_RESULTS).scoreDocs
			if len(hits) > 0:
				return 106
		writer.deleteDocuments(query)

		#add the newly modified document
		doc=Document()
		#index files wrt primary key
		for primary_key in primary_keys_map:
			try:
				field=Field(primary_key,data[primary_key],Field.Store.NO,Field.Index.ANALYZED)
				doc.add(field)
			except:
				# primary_keys_map.pop(collection_name)
				return 101
		#compress data using snappy if compression is on		
		if to_be_compressed_input==True:
			data_string=snappy.compress(str(json.dumps(data)))
		else:
			data_string=json.dumps(data)	
		field=Field("$DATA$",data_string,Field.Store.YES,Field.Index.ANALYZED)
		doc.add(field)
		writer.addDocument(doc)

	tofind_primary_keyvalue_pairs={}
	tofind_nonprimary_keyvalue_pairs={}

	#separating out primary and non_primary keys
	for key in tofind_keyvalue_pairs.keys():
		if key in primary_keys_map:
			tofind_primary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]
		else:
			tofind_nonprimary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]

	#filtering documents		
	if len(tofind_primary_keyvalue_pairs)>0:		
		query=BooleanQuery()
		for key in tofind_primary_keyvalue_pairs.keys():
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(tofind_primary_keyvalue_pairs[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		hits=searcher.search(query,MAX_RESULTS).scoreDocs
		
		for hit in hits:
			doc=searcher.doc(hit.doc)
			if to_be_compressed_input==True:
				data=snappy.uncompress(doc.get("$DATA$"))
			else:
				data=doc.get("$DATA$")
			#non primary key filtering(without having to load all the primary key filtered values into main memory!)	
			if len(tofind_nonprimary_keyvalue_pairs)>0:
				entry=json.loads(data)
				satisfied=True
				for key in tofind_nonprimary_keyvalue_pairs.keys():
					if entry.get(key)!=tofind_nonprimary_keyvalue_pairs[key]:
						satisfied=False
						break
				if satisfied==True:
					if rewrite(data)!=106:
						no_of_documents_modified+=1
					else:
						writer.rollback()
						return 106	
			else:
				if rewrite(data)!=106:
					no_of_documents_modified+=1
				else:
					writer.rollback()
					return 106
				
			
	else:
		for i in range(0,ireader.numDocs()):
			doc=searcher.doc(i)
			if to_be_compressed_input==True:
				data=snappy.uncompress(doc.get("$DATA$"))
			else:
				data=doc.get("$DATA$")
			#non primary key filtering(without having to load all the primary key filtered values into main memory!)	
			if len(tofind_nonprimary_keyvalue_pairs)>0:
				entry=json.loads(data)
				satisfied=True
				for key in tofind_nonprimary_keyvalue_pairs.keys():
					if entry.get(key)!=tofind_nonprimary_keyvalue_pairs[key]:
						satisfied=False
						break
				if satisfied==True:
					if rewrite(data)!=106:
						no_of_documents_modified+=1
					else:
						writer.rollback()
						return 106
			else:
				if rewrite(data)!=106:
					no_of_documents_modified+=1
				else:
					writer.rollback()
					return 106
			
	
	ireader.close()
	if commit==True:
		writer.commit()
	writer.close()
	return str(no_of_documents_modified)+" have been modified"
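A hedged usage sketch for update — field names are illustrative:

# Hypothetical call: set 'status' on the record whose primary key 'id' is '42'.
result = update(['id'], False, "DEFAULT",
                json.dumps({"id": "42"}),
                json.dumps({"status": "archived"}),
                commit=True)
print result  # "<n> have been modified", or an error code (100/105/106)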
def store(primary_keys_map,to_be_compressed_input,collection_name,data,commit=False):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT	
	print "started indexing input data......"
	
	#extracting values
	try:
		contents=json.loads(data)
	except:
		return 100


	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)
	

	#checking for existence of a record with the same primary_key set
	try:
		ireader=IndexReader.open(direc)	
		searcher=IndexSearcher(ireader)
		query=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(contents[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		hits=searcher.search(query,MAX_RESULTS).scoreDocs
		if len(hits) > 0:
			return 106
	except:
		pass 	 
	
	


	#setting writer configurations
	config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
	config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
	writer=IndexWriter(direc,config)
	#fix this later.....FieldType not defined
	#field_type=FieldType()
	#field_type.setIndexed(True)
	#field_type.setStored(False)
	#field_type.setTokenized(False)
	
	try:
		doc=Document()
		#index files wrt primary key
		for primary_key in primary_keys_map:
			try:
				field=Field(primary_key,contents[primary_key],Field.Store.NO,Field.Index.ANALYZED)
				doc.add(field)
			except:
				# primary_keys_map.pop(collection_name)
				return 101
		#compress data using snappy if compression is on		
		if to_be_compressed_input==True:
			data=snappy.compress(data)
		field=Field("$DATA$",data,Field.Store.YES,Field.Index.ANALYZED)
		doc.add(field)
		writer.addDocument(doc)
		if commit==True:
			writer.commit()
		writer.close()
		return 0
	except:
		return 102
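A hedged usage sketch for store — the record contents are illustrative:

# Hypothetical call: insert one JSON record keyed on 'id'.
code = store(['id'], False, "DEFAULT",
             json.dumps({"id": "42", "status": "active"}),
             commit=True)
# 0 on success; 100 (bad JSON), 101 (missing primary key), 102, or 106 (duplicate)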