def search(self, topic):
        """Expand *topic.title* into a boosted query and re-search.

        Runs the initial title query, harvests every term from the
        title/heading/text fields of the top hits, scores those terms,
        then OR-combines the 25 best terms with the heavily boosted
        original query and returns up to 5000 hits.
        """
        query = self.query_parser.parse(topic.title)
        results = self.searcher.search(query, self.top_n)

        # (field, term) -> ScorePair; counts occurrences across the top hits.
        score_pairs = {} 
        for hit in results.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            for field in ["title","heading", "text"]:
                terms = doc.get(field).split()
                for term in terms:
                    if (field, term) in score_pairs:
                        score_pairs[(field,term)].increment()
                    else:
                        score_pairs[(field,term)] = ScorePair(self.reader, field, term) # XXX

        # Python 2: dict.values() is a plain list, so in-place sort is fine.
        top_terms = score_pairs.values()
        top_terms.sort(key=lambda x: x.score(), reverse=True)
        top_terms = top_terms[:25]

        # print([term.term for term in top_terms])


        # The huge boost keeps the original query dominant over the
        # 25 expansion terms added below.
        bq = BooleanQuery() 
        query.setBoost(float(10000000))
        bq.add(query, BooleanClause.Occur.SHOULD)
        for score_pair in top_terms:
            term = score_pair.to_term()
            bq.add(TermQuery(term), BooleanClause.Occur.SHOULD)

        return self.searcher.search(bq, 5000)
def run1(searcher, analyzer,target,distance,comein):
        """Geo + text search: restrict hits to a square box around *target*.

        comein[1] is the raw query string; kind_fliter decides whether to
        search the free-text 'search' field (after jieba segmentation) or
        the 'kind' field.  Returns (scoreDocs, query).
        """
        print '&&&&&&&&&&&&&&&&&&&&&&&&&&'
        print target
        print [comein[1]]

        flit = kind_fliter.kind_fliter(comein[1].decode('utf8'))
        if (not flit[0]):
            # Free-text query: segment the Chinese input into words first.
            segment = jieba.cut(flit[1])
            command = " ".join(segment).replace("\n","").decode('utf-8')
            #print "Searching for:", command
            query_s = QueryParser(Version.LUCENE_CURRENT, "search",
                            analyzer).parse(command)
        else:
            query_s = QueryParser(Version.LUCENE_CURRENT, "kind",
                            analyzer).parse(flit[1])
        #scoreDocs = searcher.search(query_s, 50).scoreDocs
        #print target
        # Coordinates are indexed scaled by 1e5 and truncated to integers.
        cal_tar_x = int(target[0]*100000)
        cal_tar_y = int(target[1]*100000)
        #print cal_tar_x
        # A is a module-level scale factor (presumably distance units to
        # coordinate degrees) -- defined elsewhere; TODO confirm.
        radius = distance*A*100000
        #print radius
        cal_radius = int(radius)
        query1 = NumericRangeQuery.newFloatRange("calx",cal_tar_x-cal_radius,cal_tar_x+cal_radius,True,True)
        query2 = NumericRangeQuery.newFloatRange("caly",cal_tar_y-cal_radius,cal_tar_y+cal_radius,True,True)

        # All three clauses must hold: inside the bounding box AND text match.
        query = BooleanQuery()
        query.add(query1, BooleanClause.Occur.MUST)
        query.add(query2, BooleanClause.Occur.MUST)
        query.add(query_s, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(query, 500).scoreDocs
        return scoreDocs,query
Esempio n. 3
0
def run(searcher, analyzer):
    """Interactive console loop: search the 'singername' field.

    Reads GBK-encoded queries from stdin until an empty line is
    entered, printing the top 3 matching singer documents.
    """
    while True:
        
        querys=BooleanQuery()       
        print
        print "Hit enter with no input to quit."
##        for i in range(searcher.maxDoc()):
##                doc=searcher.doc(i)
##                print doc.get('singername')
        
        command = raw_input("Query:")
        # Console input arrives GBK-encoded (Chinese Windows terminal).
        command = unicode(command, 'GBK')
        if command == '':
            return

        print
##        command=" ".join(jieba.cut(command))
        print "Searching for:", command
        query = QueryParser(Version.LUCENE_CURRENT,"singername",analyzer).parse(command)
        querys.add(query,BooleanClause.Occur.SHOULD)        
##        query = QueryParser(Version.LUCENE_CURRENT,"albumname",analyzer).parse(command)
##        querys.add(query,BooleanClause.Occur.SHOULD)
        scoreDocs = searcher.search(querys, 3).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'singername:', doc.get("singername"), '\n','singerplace:',doc.get('singerplace'), '\n',\
                  'singerintro:',doc.get('singerintro'),'\n','singeralbums:', doc.get("singeralbums"),'\n',\
                   'singeralbumURLs:',doc.get('singeralbumURLs'),'\n','singerpicURL:', doc.get("singerpicURL"),'\n'
Esempio n. 4
0
def run(searcher, analyzer):
    """Interactive loop: field-qualified book search (GBK console input).

    parseCommand() splits "word field:value ..." input into a dict;
    every field clause is ANDed.  Prints path/name/title/author/language
    for the top 50 hits.
    """
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        #command = 'Christian author:mark twain title:autobiography language:English'
        command = unicode(command, 'GBK')
        if command == '':
            return

        print
        print "Searching for:", command

        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            # All parsed field clauses must match simultaneously.
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            ##            explanation = searcher.explain(query, scoreDoc.doc)
            print "------------------------"
            print 'path:', doc.get("path")
            print 'name:', doc.get("name")
            print 'title:', doc.get('title')
            print 'author:', doc.get('author')
            print 'language:', doc.get('language')
Esempio n. 5
0
    def main(cls, argv):
        """Demonstrate result sorting: run one query under several Sort specs."""

        # Match everything, but let "java OR action" hits contribute score.
        allBooks = MatchAllDocsQuery()
        parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        query = BooleanQuery()
        query.add(allBooks, BooleanClause.Occur.SHOULD)
        query.add(parser.parse("java OR action"), BooleanClause.Occur.SHOULD)

        indexDir = System.getProperty("index.dir")
        directory = SimpleFSDirectory(File(indexDir))

        example = SortingExample(directory)

        # Relevance and index order first...
        example.displayResults(query, Sort.RELEVANCE)
        example.displayResults(query, Sort.INDEXORDER)
        # ...then single-field sorts (the trailing True reverses the order)...
        example.displayResults(query,
                               Sort(SortField("category", SortField.STRING)))
        example.displayResults(query,
                               Sort(SortField("pubmonth", SortField.INT, True)))

        # ...and finally compound sorts mixing fields with the score.
        example.displayResults(query,
                               Sort([SortField("category", SortField.STRING),
                                     SortField.FIELD_SCORE,
                                     SortField("pubmonth", SortField.INT, True)]))

        example.displayResults(query,
                               Sort([SortField.FIELD_SCORE,
                                     SortField("category", SortField.STRING)]))
        directory.close()
def run(command, pageindex,pagesize):
    global searcher,analyzer

    print "Searching for:", command 
    querys = BooleanQuery()
    command_dict = parseCommand(command)
    for k,v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k,
                            analyzer).parse(v)

        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 6000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    start = (pageindex - 1) * pagesize
    end = start + pagesize
    res = []
    for scoreDoc in scoreDocs[start:end+1]:
        doc = searcher.doc(scoreDoc.doc)
        r = []
        r.append(doc.get('title'))
        r.append(doc.get('url'))
        r.append(doc.get('src'))
        r.append(doc.get('alt').replace(' ',''))
        res.append(r)
    return res,len(scoreDocs)
Esempio n. 7
0
def run(searcher, analyzer):
    """Interactive loop: search the 'songname' field (GBK console input).

    First dumps every stored songname in the index, then answers queries
    with the top 3 hits until an empty line is entered.
    """
    for i in range(searcher.maxDoc()):
        doc=searcher.doc(i)
        print doc.get('songname')
    while True:
        
        querys=BooleanQuery()       
        print
        print "Hit enter with no input to quit."

        
        command = raw_input("Query:")
        # Console input arrives GBK-encoded (Chinese Windows terminal).
        command = unicode(command, 'GBK')
        if command == '':
            return

        print

        print "Searching for:", command
        query = QueryParser(Version.LUCENE_CURRENT,"songname",analyzer).parse(command)
        querys.add(query,BooleanClause.Occur.SHOULD)        
##        query = QueryParser(Version.LUCENE_CURRENT,"albumname",analyzer).parse(command)
##        querys.add(query,BooleanClause.Occur.SHOULD)
        scoreDocs = searcher.search(querys, 3).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'songname:', doc.get("songname")
Esempio n. 8
0
def run(command,searcher, analyzer):
    #while True:
        #print
        #print "Hit enter with no input to quit."
        #command = raw_input("Query:")
        #command = unicode(command,'gbk')
        """Segment *command* with jieba, AND every word over the
        'contents' field, and return up to 50 matching Documents
        (returns None for an empty command)."""
        command = command.decode('utf-8')
        if command == '':
            return

        #print
        #print "Searching for:",
        querys = BooleanQuery()
        for i in jieba.cut(command):
            #print i,
            # Each segmented word becomes its own MUST clause.
            query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(i)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs

        #print "\n%s total matching documents." % len(scoreDocs)
        list1 = []
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            list1.append(doc)
        return list1
Esempio n. 9
0
def run(searcher, analyzer):
    """Interactive loop: field-qualified search (UTF-8 console input);
    prints title/url/src of the top 50 hits until an empty line."""
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command 
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k,v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(v)
            # All field clauses must match simultaneously.
            querys.add(query, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------------'
            print 'title:',doc.get('title')
            print 'url:',doc.get('url')
            print 'src:',doc.get('src')
Esempio n. 10
0
def run(searcher, analyzer):
    """Interactive loop: field-qualified search (UTF-8 console input);
    prints title/url/src of the top 50 hits until an empty line."""
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            # All field clauses must match simultaneously.
            querys.add(query, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------------'
            print 'title:', doc.get('title')
            print 'url:', doc.get('url')
            print 'src:', doc.get('src')
Esempio n. 11
0
    def testFilterAlternative(self):
        """Constrain the all-books query to one category with a
        BooleanQuery instead of a Filter."""
        category = TermQuery(Term("category", "/philosophy/eastern"))

        combined = BooleanQuery()
        for clause in (self.allBooks, category):
            combined.add(clause, BooleanClause.Occur.MUST)

        hits = self.searcher.search(combined, 50).scoreDocs
        self.assertEqual(1, len(hits), "only tao te ching")
    def testAnd(self):
        """AND a subject term together with a pubmonth numeric range."""
        subjectQuery = TermQuery(Term("subject", "search"))
        pubRange = NumericRangeQuery.newIntRange(
            "pubmonth", Integer(200401), Integer(200412), True, True)

        combined = BooleanQuery()
        for clause in (subjectQuery, pubRange):
            combined.add(clause, BooleanClause.Occur.MUST)

        searcher = self.getSearcher()
        hits = searcher.search(combined, 50).scoreDocs

        self.assertHitsIncludeTitle(searcher, hits, "Lucene in Action")
Esempio n. 13
0
def search_image(command):
    """Search the image index for *command*.

    Parses optional ``site:`` filters out of *command*, segments the
    remaining Chinese text with jieba, runs a boolean AND query and
    returns a list of dicts with url/imgurl/urltitle plus a highlighted
    'contents' snippet.
    """
    # The web front end sends a single space for an empty query box.
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    # Split the raw command into field -> value pairs; words without an
    # explicit "opt:" prefix accumulate under the 'contents' field.
    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt,
                                                 '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            # Site filter: match any URL ending with the requested suffix.
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # BUG FIX: Analyzer.tokenStream() takes the *field name* as its
        # first argument; the original passed the field's stored text
        # (doc.get("contents")) instead.
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
    def testOr(self):

        methodologyBooks = TermQuery(Term("category", "/technology/computers/programming/methodology"))
        easternPhilosophyBooks = TermQuery(Term("category", "/philosophy/eastern"))

        enlightenmentBooks = BooleanQuery()
        enlightenmentBooks.add(methodologyBooks, BooleanClause.Occur.SHOULD)
        enlightenmentBooks.add(easternPhilosophyBooks, BooleanClause.Occur.SHOULD)

        searcher = self.getSearcher()
        scoreDocs = searcher.search(enlightenmentBooks, 50).scoreDocs
        print "or =", enlightenmentBooks

        self.assertHitsIncludeTitle(searcher, scoreDocs, "Extreme Programming Explained")
        self.assertHitsIncludeTitle(searcher, scoreDocs, u"Tao Te Ching \u9053\u5FB7\u7D93")
Esempio n. 15
0
    def testAgainstOR(self):
        """A sloppy 'quick fox' phrase OR an exact 'fast fox' phrase
        matches both test documents."""
        sloppyPhrase = PhraseQuery()
        sloppyPhrase.setSlop(1)
        for word in ("quick", "fox"):
            sloppyPhrase.add(Term("field", word))

        exactPhrase = PhraseQuery()
        for word in ("fast", "fox"):
            exactPhrase.add(Term("field", word))

        either = BooleanQuery()
        either.add(sloppyPhrase, BooleanClause.Occur.SHOULD)
        either.add(exactPhrase, BooleanClause.Occur.SHOULD)

        topDocs = self.searcher.search(either, 10)
        self.assertEqual(2, topDocs.totalHits)
Esempio n. 16
0
def run(command,pageindex=1,pagesize=15):
    """Paged lyric search.

    Builds an AND query from parseCommand(command) ('site:' terms become
    URL wildcard queries), highlights matches in the 'lrc' field and
    returns (rows, total_hits) where each row is
    [url, music_name, artist, album_name, lrc, highlighted_lrc, musicID].
    """
    global searcher, analyzer,old_command,old_res_list
    global STORE_DIR,directory,searcher,analyzer
    if command == '':
        return

    print "Searching for:", command  
    
    querys = BooleanQuery()
    command_dict = parseCommand(command)
    for k,v in command_dict.iteritems():            
        if(k=='site'):
            # Site filter: match the site name anywhere inside the URL.
            t = Term('url','*'+v.strip()+'*')
            query = WildcardQuery(t)

        else:
            query = QueryParser(Version.LUCENE_CURRENT, k,analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    
    scoreDocs = searcher.search(querys, 4000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    
    res_list = []
    simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>", "</font>")
    # Highlight only against the free-text part of the query.
    queryToHigh = QueryParser(Version.LUCENE_CURRENT,"lrc",analyzer).parse(command_dict['content'])
    hlter = Highlighter(simpleHTMLFormatter,QueryScorer(queryToHigh))
    hlter.setTextFragmenter(SimpleFragmenter(200))
    start = (pageindex-1)*pagesize
    end = start+pagesize
    print start,end
    # Over-scan a few extra hits so rows without a highlight fragment can
    # be skipped while still filling a page (capped at 8 usable rows).
    for scoreDoc in scoreDocs[start:end+10]:
        doc = searcher.doc(scoreDoc.doc)
        res = []
        res.append(doc.get('url'))
        res.append(doc.get('music_name'))
        res.append(doc.get('artist'))
        res.append(doc.get('album_name'))
        res.append(doc.get('lrc'))
        output = hlter.getBestFragment(analyzer,"lrc",clear(doc.get('lrc')))
        res.append(output)
        res.append(doc.get('musicID'))
        # res[5] is the highlight fragment; drop rows that produced none.
        if(res[5]!=None):
            res_list.append(res) 
        if(len(res_list)==8):
            break
    return res_list,len(scoreDocs)
Esempio n. 17
0
def run(command, pageindex=1, pagesize=15):
    """Paged web-page search with highlighted 'contents' snippets.

    'site:' terms become URL wildcard queries; all other parsed fields
    are ANDed.  Returns (rows, total_hits); each row is
    [title, url, snippet].
    """
    global searcher, analyzer, old_command, old_res_list
    global STORE_DIR, directory, searcher, analyzer
    if command == '':
        return

    print "Searching for:", command  # e.g. a Chinese movie title

    # final = jieba.cut(command)
    # query = QueryParser(Version.LUCENE_CURRENT, "contents",
    #                     analyzer).parse(' '.join(final))

    querys = BooleanQuery()
    command_dict = parseCommand(command)
    for k, v in command_dict.iteritems():
        if (k == 'site'):
            # Site filter: match the site name anywhere inside the URL.
            t = Term('url', '*' + v.strip() + '*')
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 4000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    res_list = []
    simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>",
                                              "</font>")

    # Highlight only against the free-text part of the query.
    queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                              analyzer).parse(command_dict['contents'])

    hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
    hlter.setTextFragmenter(SimpleFragmenter(200))
    start = (pageindex - 1) * pagesize
    end = start + pagesize
    for scoreDoc in scoreDocs[start:end + 1]:
        doc = searcher.doc(scoreDoc.doc)
        res = []
        res.append(doc.get('title'))
        res.append(doc.get('url'))
        output = hlter.getBestFragment(analyzer, "contents",
                                       clear(doc.get('contents')))
        res.append(output)
        res_list.append(res)
    return res_list, len(scoreDocs)
Esempio n. 18
0
def run(searcher, analyzer):
    """Interactive loop: parse field-qualified input ('site:' becomes a
    URL wildcard), show the top 10 hits with highlighted 'contents'."""
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command  # e.g. a Chinese movie title

        # final = jieba.cut(command)
        # query = QueryParser(Version.LUCENE_CURRENT, "contents",
        #                     analyzer).parse(' '.join(final))

        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            if (k == 'site'):
                # Site filter: match the site anywhere inside the URL.
                t = Term('url', '*' + v.strip() + '*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k,
                                    analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 10).scoreDocs

        print "%s total matching documents." % len(scoreDocs)
        simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>",
                                                  "</font>")

        # Highlight only against the free-text part of the query.
        queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  analyzer).parse(command_dict['contents'])

        hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
        hlter.setTextFragmenter(SimpleFragmenter(500))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site')
            print 'title:', doc.get('title'),
            print 'url:', doc.get('url')
            ori_text = clear(doc.get('contents'))
            output = hlter.getBestFragment(analyzer, "contents", ori_text)
            print output
Esempio n. 19
0
    def testFilteredQuery(self):
        
        isbns = ["0854402624"]  # Steiner

        accessor = TestSpecialsAccessor(isbns)
        filter = SpecialsFilter(accessor)

        educationBooks = WildcardQuery(Term("category", "*education*"))
        edBooksOnSpecial = FilteredQuery(educationBooks, filter)

        logoBooks = TermQuery(Term("subject", "logo"))

        logoOrEdBooks = BooleanQuery()
        logoOrEdBooks.add(logoBooks, BooleanClause.Occur.SHOULD)
        logoOrEdBooks.add(edBooksOnSpecial, BooleanClause.Occur.SHOULD)

        topDocs = self.searcher.search(logoOrEdBooks, 50)
        print logoOrEdBooks
        self.assertEqual(2, topDocs.totalHits, "Papert and Steiner")
Esempio n. 20
0
def run(searcher, analyzer):
    """Interactive loop: parse field-qualified input ('site:' becomes a
    URL wildcard), show the top 10 hits with highlighted 'contents'."""
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command # e.g. a Chinese movie title

        # final = jieba.cut(command)
        # query = QueryParser(Version.LUCENE_CURRENT, "contents",
        #                     analyzer).parse(' '.join(final))
        
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k,v in command_dict.iteritems():            
            if(k=='site'):
                # Site filter: match the site anywhere inside the URL.
                t = Term('url','*'+v.strip()+'*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k,analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 10).scoreDocs
        
        print "%s total matching documents." % len(scoreDocs)
        simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>", "</font>")

        # Highlight only against the free-text part of the query.
        queryToHigh = QueryParser(Version.LUCENE_CURRENT,"contents",analyzer).parse(command_dict['contents'])

        hlter = Highlighter(simpleHTMLFormatter,QueryScorer(queryToHigh))
        hlter.setTextFragmenter(SimpleFragmenter(500))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site')
            print 'title:',doc.get('title'),
            print 'url:',doc.get('url')
            ori_text = clear(doc.get('contents'))
            output = hlter.getBestFragment(analyzer,"contents",ori_text)
            print output
Esempio n. 21
0
def run3(searcher, analyzer,command):
    querys=BooleanQuery()

    if command == '':
        return False

    query = QueryParser(Version.LUCENE_CURRENT,"singername",analyzer).parse(command)
    querys.add(query,BooleanClause.Occur.SHOULD)        
    doc=None
    scoreDocs = searcher.search(querys, 1).scoreDocs
    if len(scoreDocs)==1:
        doc = searcher.doc(scoreDocs[0].doc)
        print 12345,doc.get('singername')
        if doc.get('singername')!=command:
            return False
    else:
        return False

    return doc
def run2(searcher, analyzer,target,distance,comein):
        """Text-only variant of run1: no geo constraint is applied.

        *target* and *distance* are accepted for signature parity but
        unused here.  Returns (scoreDocs, query).
        """
        print [comein[1]]
        flit = kind_fliter.kind_fliter(comein[1])
        #print flit[1].decode('utf8')
        query_s = []
        if (not flit[0]):
            #print 11111111111
            # Free-text query: segment the Chinese input with jieba first.
            segment = jieba.cut(flit[1])
            command = " ".join(segment).replace("\n","")
            #print "Searching for:", command
            query_s = QueryParser(Version.LUCENE_CURRENT, "search",
                            analyzer).parse(command)

        else:
            query_s = QueryParser(Version.LUCENE_CURRENT, "kind",
                            analyzer).parse(flit[1])
        query = BooleanQuery()
        query.add(query_s, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(query, 500).scoreDocs
        return scoreDocs,query
def run3(searcher, analyzer,target,distance,comein):
        """Geo-only variant: bounding-box search around *target*.

        The incoming *distance* is ignored and fixed at 5000; *analyzer*
        and *comein* are unused apart from echoing comein[1].
        Returns (scoreDocs, query).
        """
        #print '&&&&&&&&&&&&&&&&&&&&&&&&&&'
        #print target
        print [comein[1]]
        distance = 5000
        # Coordinates are indexed scaled by 1e5 and truncated to integers.
        cal_tar_x = int(target[0]*100000)
        cal_tar_y = int(target[1]*100000)
        #print cal_tar_x
        # A is a module-level scale factor defined elsewhere; TODO confirm units.
        radius = distance*A*100000
        #print radius
        cal_radius = int(radius)
        query1 = NumericRangeQuery.newFloatRange("calx",cal_tar_x-cal_radius,cal_tar_x+cal_radius,True,True)
        query2 = NumericRangeQuery.newFloatRange("caly",cal_tar_y-cal_radius,cal_tar_y+cal_radius,True,True)

        # Both coordinate ranges must match (hit inside the bounding box).
        query = BooleanQuery()
        query.add(query1, BooleanClause.Occur.MUST)
        query.add(query2, BooleanClause.Occur.MUST)
        #query.add(query_s, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(query, 500).scoreDocs
        return scoreDocs,query
Esempio n. 24
0
    def testToString(self):
        """BooleanQuery.toString() renders a MUST fuzzy clause with '+'
        and a SHOULD term clause with its field prefix."""
        query = BooleanQuery()
        clauses = (
            (FuzzyQuery(Term("field", "kountry")), BooleanClause.Occur.MUST),
            (TermQuery(Term("title", "western")), BooleanClause.Occur.SHOULD),
        )
        for q, occur in clauses:
            query.add(q, occur)

        self.assertEqual("+kountry~0.5 title:western", query.toString("field"),
                         "both kinds")
Esempio n. 25
0
def run(searcher, analyzer, command, prior):
    """Question search: AND all parsed fields, dedupe by qst_num, and
    return rows ordered by storesort(store, prior).

    Each row: [name, detail, topic, browse, follow, answers, qst_num].
    Returns None for an empty command.
    """
    if command == '':
        return

    store = []

    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 500000).scoreDocs

    # qst_num values already emitted; used to drop duplicate questions.
    scored = []

    for scoreDoc in scoreDocs:
        returnfile = []
        doc = searcher.doc(scoreDoc.doc)

        if doc.get("qst_num") in scored:
            continue
        if not doc.get("qst_name"):
            continue
        scored.append(doc.get("qst_num"))

        # Stored text has spaces stripped for display.
        name = doc.get("qst_name").replace(' ', '')
        returnfile.append(name)
        detail = doc.get("qst_detail").replace(' ', '')
        returnfile.append(detail)

        returnfile.append(doc.get("qst_topic_accu"))
        returnfile.append(int(doc.get("qst_browse")))
        returnfile.append(int(doc.get("qst_follow")))
        returnfile.append(int(doc.get("qst_ans")))
        returnfile.append(int(doc.get("qst_num")))

        store.append(returnfile)

    store = storesort(store, prior)
    return store
    def search(self, topic):
        """Expand *topic.title* into a boosted query and re-search.

        Collects every term from the title/heading/text fields of the
        initial top hits, keeps the 25 best-scoring terms, ORs them with
        the heavily boosted original query and returns up to 5000 hits.
        """

        query = self.query_parser.parse(topic.title)
        results = self.searcher.search(query, self.top_n)

        # (field, term) -> ScorePair; counts occurrences across the top hits.
        score_pairs = {}
        for hit in results.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            for field in ["title", "heading", "text"]:
                terms = doc.get(field).split()
                for term in terms:
                    if (field, term) in score_pairs:
                        score_pairs[(field, term)].increment()
                    else:
                        score_pairs[(field, term)] = ScorePair(
                            self.reader, field, term)  # XXX

        # Python 2: dict.values() is a plain list, so in-place sort is fine.
        top_terms = score_pairs.values()
        top_terms.sort(key=lambda x: x.score(), reverse=True)
        top_terms = top_terms[:25]

        # print([term.term for term in top_terms])

        # The huge boost keeps the original query dominant over the
        # 25 expansion terms added below.
        bq = BooleanQuery()
        query.setBoost(float(10000000))
        bq.add(query, BooleanClause.Occur.SHOULD)
        for score_pair in top_terms:
            term = score_pair.to_term()
            bq.add(TermQuery(term), BooleanClause.Occur.SHOULD)

        return self.searcher.search(bq, 5000)
Esempio n. 27
0
    def testToString(self):
        """Render a mixed MUST/SHOULD BooleanQuery against the 'field'
        default field."""
        bq = BooleanQuery()
        bq.add(FuzzyQuery(Term("field", "kountry")),
               BooleanClause.Occur.MUST)
        bq.add(TermQuery(Term("title", "western")),
               BooleanClause.Occur.SHOULD)

        rendered = bq.toString("field")
        self.assertEqual("+kountry~0.5 title:western", rendered, "both kinds")
def run(searcher, analyzer):
    """Interactive loop using the ICTCLAS DLL for Chinese word
    segmentation before a field-qualified AND search (Windows-only)."""
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if len(command) > 0:
            # Segment the query with ICTCLAS; the DLL works on GBK bytes,
            # so the result is transcoded gbk -> utf8 -> unicode below.
            dll=cdll.LoadLibrary("F:\\ICTCLAS50_Windows_32_C\ICTCLAS50.dll")
            dll.ICTCLAS_Init(c_char_p("F:\\ICTCLAS50_Windows_32_C"))
            strlen = len(c_char_p(command).value)
            # Worst-case output buffer: 6 bytes per input byte.
            t =c_buffer(strlen*6)
            bSuccess = dll.ICTCLAS_ParagraphProcess(c_char_p(command),c_int(strlen),t,c_int(0),0)
            command=t.value.decode('gbk').encode('utf8')
            ##list=t.value.split()
            ##print ' '.join(list)
            dll.ICTCLAS_Exit()
            command=command.decode('utf8')
        if command == '':
            return

        print
        print "Searching for:", command

        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k,v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(v)
            # All parsed field clauses must match simultaneously.
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
##            explanation = searcher.explain(query, scoreDoc.doc)
            print "------------------------"
            print 'path:', doc.get("path")
            print 'name:', doc.get("name")
            print 'title:', doc.get('title')
            print 'url:', doc.get('url')
Esempio n. 29
0
File: searcher.py Project: fay/wt
 def search(self, query,category_id=None):
     """Search *query* against both 'summary' and 'title' (AND within a
     field, OR between fields); optionally filter by *category_id*."""
     SHOULD = BooleanClause.Occur.SHOULD
     #MultiFieldQueryParser.setOperator(QueryParser.DEFAULT_OPERATOR_AND);
     parser1 = QueryParser('summary',self.analyzer)
     parser2 = QueryParser('title',self.analyzer)        
     # Require every query term within a single field...
     parser1.setDefaultOperator(QueryParser.AND_OPERATOR)
     parser2.setDefaultOperator(QueryParser.AND_OPERATOR)
     q1 = parser1.parse(query)
     q2 = parser2.parse(query)
     # ...but accept a match in either field.
     boolQuery = BooleanQuery()
     boolQuery.add(q1,SHOULD)
     boolQuery.add(q2,SHOULD)
     
     #camp = CategoryComparatorSource(query)
     #sortfield = SortField("link", camp)
     #sort = Sort(sortfield)
     if category_id:
         self.catfilter.query = query
         self.catfilter.category_id = category_id
         hits = self.searcher.search(boolQuery,self.catfilter)
     else:
         hits = self.searcher.search(boolQuery)
     return hits
Esempio n. 30
0
def run2(searcher, analyzer,command,num):
    """Look up an album by number or name.

    With num > 1 returns the raw scoreDocs; with num == 1 returns the
    Document only when its stored name or number exactly equals
    *command*, otherwise False.
    """
    querys=BooleanQuery()

    if command == '':
        return False

    query = QueryParser(Version.LUCENE_CURRENT,"albumnum",analyzer).parse(command)
    querys.add(query,BooleanClause.Occur.SHOULD)        
    query = QueryParser(Version.LUCENE_CURRENT,"albumname",analyzer).parse(command)
    querys.add(query,BooleanClause.Occur.SHOULD)
    doc=None
    scoreDocs = searcher.search(querys, num).scoreDocs
    if num>1:
        return scoreDocs
    if len(scoreDocs)==1:
        doc = searcher.doc(scoreDocs[0].doc)
        print 555,doc.get('albumname')
        # Reject analyzed/partial matches: require an exact stored value.
        if doc.get('albumname')!=command and doc.get('albumnum')!=command:
            return False
    else:
        return False

    return doc
Esempio n. 31
0
def run(searcher, analyzer, command):
    while True:
        if command == '':
            return
        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 300).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        text = []
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            temptext = [
                doc.get("url"),
                doc.get('title'),
                doc.get("imgurl"),
                doc.get("price"),
                doc.get("kind")
            ]
            text.append(temptext)
        return text
Esempio n. 32
0
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'GBK')
        if command == '':
            return

        command_dict = parseCommand(command)
        sep_command = " ".join(jieba.cut(command_dict['contents']))
        command_dict['contents'] = sep_command
        #print command_dict
        if not command_dict.has_key('site'):
            command = command_dict['contents']
        else:
            command = command_dict['contents'] + " site:" + command_dict['site']
        print
        print "Searching for:", command

        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            print k, v
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print "------------------------"
            #print 'site:', doc.get("site")
            print 'path:', doc.get("path")
            print 'title:', doc.get("title")
            print 'url:', doc.get("url")
            print 'name:', doc.get("name")
Esempio n. 33
0
def run(searcher, analyzer,command):
    """OR-search *command* over songartist, songname and songalbum.

    Returns False for an empty command, otherwise the scoreDocs list for
    the top 20 hits.
    """
    if command == '':
        return False

    combined = BooleanQuery()
    for field in ("songartist", "songname", "songalbum"):
        parsed = QueryParser(Version.LUCENE_CURRENT, field, analyzer).parse(command)
        combined.add(parsed, BooleanClause.Occur.SHOULD)

    return searcher.search(combined, 20).scoreDocs
Esempio n. 34
0
    def testAnd(self):
        """MUST + MUST: books about 'search' published during 2004."""
        subject_query = TermQuery(Term("subject", "search"))
        pub_2004 = NumericRangeQuery.newIntRange("pubmonth", Integer(200401),
                                                 Integer(200412), True, True)

        conjunction = BooleanQuery()
        conjunction.add(subject_query, BooleanClause.Occur.MUST)
        conjunction.add(pub_2004, BooleanClause.Occur.MUST)

        searcher = IndexSearcher(self.directory, True)
        hits = searcher.search(conjunction, 50).scoreDocs

        self.assertHitsIncludeTitle(searcher, hits, "Lucene in Action")
Esempio n. 35
0
    def testAgainstOR(self):
        """Two phrase queries (one with slop 1) OR-ed together hit both docs."""
        quick_phrase = PhraseQuery()
        quick_phrase.setSlop(1)
        for word in ("quick", "fox"):
            quick_phrase.add(Term("field", word))

        fast_phrase = PhraseQuery()
        for word in ("fast", "fox"):
            fast_phrase.add(Term("field", word))

        either = BooleanQuery()
        either.add(quick_phrase, BooleanClause.Occur.SHOULD)
        either.add(fast_phrase, BooleanClause.Occur.SHOULD)

        self.assertEqual(2, self.searcher.search(either, 10).totalHits)
Esempio n. 36
0
    def testFilteredQuery(self):

        isbns = ["0854402624"]  # Steiner

        accessor = TestSpecialsAccessor(isbns)
        filter = SpecialsFilter(accessor)

        educationBooks = WildcardQuery(Term("category", "*education*"))
        edBooksOnSpecial = FilteredQuery(educationBooks, filter)

        logoBooks = TermQuery(Term("subject", "logo"))

        logoOrEdBooks = BooleanQuery()
        logoOrEdBooks.add(logoBooks, BooleanClause.Occur.SHOULD)
        logoOrEdBooks.add(edBooksOnSpecial, BooleanClause.Occur.SHOULD)

        topDocs = self.searcher.search(logoOrEdBooks, 50)
        print logoOrEdBooks
        self.assertEqual(2, topDocs.totalHits, "Papert and Steiner")
Esempio n. 37
0
    def testOr(self):

        methodologyBooks = TermQuery(
            Term("category", "/technology/computers/programming/methodology"))
        easternPhilosophyBooks = TermQuery(
            Term("category", "/philosophy/eastern"))

        enlightenmentBooks = BooleanQuery()
        enlightenmentBooks.add(methodologyBooks, BooleanClause.Occur.SHOULD)
        enlightenmentBooks.add(easternPhilosophyBooks,
                               BooleanClause.Occur.SHOULD)

        searcher = IndexSearcher(self.directory, True)
        scoreDocs = searcher.search(enlightenmentBooks, 50).scoreDocs
        print "or =", enlightenmentBooks

        self.assertHitsIncludeTitle(searcher, scoreDocs,
                                    "Extreme Programming Explained")
        self.assertHitsIncludeTitle(searcher, scoreDocs,
                                    u"Tao Te Ching \u9053\u5FB7\u7D93")
Esempio n. 38
0
    def docsLike(self, id, doc, max):

        authors = doc.getValues("author")
        authorQuery = BooleanQuery()
        for author in authors:
            authorQuery.add(TermQuery(Term("author", author)),
                            BooleanClause.Occur.SHOULD)
        authorQuery.setBoost(2.0)

        vector = self.reader.getTermFreqVector(id, "subject")

        subjectQuery = BooleanQuery()
        for term in vector.getTerms():
            tq = TermQuery(Term("subject", term))
            subjectQuery.add(tq, BooleanClause.Occur.SHOULD)

        likeThisQuery = BooleanQuery()
        likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD)
        likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD)

        # exclude myself
        likeThisQuery.add(TermQuery(Term("isbn", doc.get("isbn"))),
                          BooleanClause.Occur.MUST_NOT)

        print "  Query:", likeThisQuery.toString("contents")
        scoreDocs = self.searcher.search(likeThisQuery, 50).scoreDocs

        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            if len(docs) < max:
                docs.append(doc)
            else:
                break

        return docs
Esempio n. 39
0
    def docsLike(self, id, doc, max):
        """Return up to *max* stored documents similar to *doc*.

        Similarity is "shares an author" (boosted 2x) OR "shares a subject
        term", excluding the source document itself by ISBN.  *id* is the
        Lucene doc id used to fetch the subject term vector.
        """
        # one SHOULD clause per author; author overlap counts double
        authors = doc.getValues("author")
        authorQuery = BooleanQuery()
        for author in authors:
            authorQuery.add(TermQuery(Term("author", author)),
                            BooleanClause.Occur.SHOULD)
        authorQuery.setBoost(2.0)

        vector = self.reader.getTermFreqVector(id, "subject")

        # one SHOULD clause per subject term of the source document
        subjectQuery = BooleanQuery()
        for term in vector.getTerms():
            tq = TermQuery(Term("subject", term))
            subjectQuery.add(tq, BooleanClause.Occur.SHOULD)

        likeThisQuery = BooleanQuery()
        likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD)
        likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD)

        # exclude myself
        likeThisQuery.add(TermQuery(Term("isbn", doc.get("isbn"))),
                          BooleanClause.Occur.MUST_NOT)

        print "  Query:", likeThisQuery.toString("contents")
        scoreDocs = self.searcher.search(likeThisQuery, 50).scoreDocs

        # collect at most *max* hits in score order
        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            if len(docs) < max:
                docs.append(doc)
            else:
                break

        return docs
    def searchDocuments(self, view, version, query=None, attribute=None):
        """Search the full-text index, yielding (item UUID, attribute UUID).

        *query* (a string) is parsed against the 'contents' field; None
        matches all documents.  When *attribute* is given, results are
        restricted to that attribute via an extra MUST TermQuery.  Returns
        a lazy iterator object; the actual search runs inside a store
        transaction when iteration starts, and only hits whose indexed
        version is <= *version* and whose value is still current are
        yielded, best score first.
        """
        store = self.store

        if query is None:
            query = MatchAllDocsQuery()
        else:
            query = QueryParser("contents", StandardAnalyzer()).parse(query)

        if attribute:
            # AND the text query with an exact attribute match
            combinedQuery = BooleanQuery()
            combinedQuery.add(query, BooleanClause.Occur.MUST)
            combinedQuery.add(TermQuery(Term("attribute", attribute.str64())),
                              BooleanClause.Occur.MUST)
            query = combinedQuery

        class _collector(PythonHitCollector):
            # accumulates (-score, id) pairs; negated scores make heappop
            # below return the highest-scoring hit first
            def __init__(_self):

                super(_collector, _self).__init__()
                _self.hits = []

            def collect(_self, id, score):

                _self.hits.append((-score, id))

        class _iterator(object):
            def __init__(_self):

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __del__(_self):

                # best-effort cleanup: close the searcher and abort the
                # transaction; the broad except only logs so destruction
                # never raises
                try:
                    if _self.searcher is not None:
                        _self.searcher.close()
                    store.abortTransaction(view, _self.txnStatus)
                except:
                    store.repository.logger.exception("in __del__")

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __iter__(_self):

                # the search runs lazily, inside a store transaction
                _self.txnStatus = store.startTransaction(view)
                _self.searcher = searcher = self.getIndexSearcher()
                _self.collector = _collector()

                searcher.search(query, _self.collector)
                hits = _self.collector.hits

                if hits:
                    heapify(hits)
                    while hits:
                        # pop hits best-score-first (scores were negated)
                        score, id = heappop(hits)
                        doc = searcher.doc(id)
                        uItem = UUID(doc['item'])

                        # skip hits newer than the requested view version or
                        # whose indexed value is no longer current
                        if long(doc['version']) <= version:
                            if store._items.isValue(view, version, uItem,
                                                    UUID(doc['value'])):
                                yield uItem, UUID(doc['attribute'])

        return _iterator()
Esempio n. 41
0
    def searchDocuments(self, view, version, query=None, attribute=None):
        """Search the full-text index, yielding (item UUID, attribute UUID).

        *query* (a string) is parsed against the 'contents' field; None
        matches all documents.  When *attribute* is given, results are
        restricted to that attribute via an extra MUST TermQuery.  Returns
        a lazy iterator object; the actual search runs inside a store
        transaction when iteration starts, and only hits whose indexed
        version is <= *version* and whose value is still current are
        yielded, best score first.
        """
        store = self.store

        if query is None:
            query = MatchAllDocsQuery()
        else:
            query = QueryParser("contents", StandardAnalyzer()).parse(query)
        
        if attribute:
            # AND the text query with an exact attribute match
            combinedQuery = BooleanQuery()
            combinedQuery.add(query, BooleanClause.Occur.MUST)
            combinedQuery.add(TermQuery(Term("attribute", attribute.str64())),
                              BooleanClause.Occur.MUST)
            query = combinedQuery

        class _collector(PythonHitCollector):
            # accumulates (-score, id) pairs; negated scores make heappop
            # below return the highest-scoring hit first

            def __init__(_self):

                super(_collector, _self).__init__()
                _self.hits=[]

            def collect(_self, id, score):

                _self.hits.append((-score, id))
        
        class _iterator(object):

            def __init__(_self):

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __del__(_self):

                # best-effort cleanup: close the searcher and abort the
                # transaction; the broad except only logs so destruction
                # never raises
                try:
                    if _self.searcher is not None:
                        _self.searcher.close()
                    store.abortTransaction(view, _self.txnStatus)
                except:
                    store.repository.logger.exception("in __del__")

                _self.txnStatus = 0
                _self.searcher = None
                _self.collector = None

            def __iter__(_self):

                # the search runs lazily, inside a store transaction
                _self.txnStatus = store.startTransaction(view)
                _self.searcher = searcher = self.getIndexSearcher()
                _self.collector = _collector()

                searcher.search(query, _self.collector)
                hits = _self.collector.hits

                if hits:
                    heapify(hits)
                    while hits:
                        # pop hits best-score-first (scores were negated)
                        score, id = heappop(hits)
                        doc = searcher.doc(id)
                        uItem = UUID(doc['item'])
                        
                        # skip hits newer than the requested view version or
                        # whose indexed value is no longer current
                        if long(doc['version']) <= version:
                            if store._items.isValue(view, version, uItem,
                                                    UUID(doc['value'])):
                                yield uItem, UUID(doc['attribute'])

        return _iterator()
def getBaselineStatistics(searcher, analyzer):
    """Compute per-day tweet counts over 1830 days starting 2005-01-01.

    For each day, counts: all tweets, tweets from the United States, tweets
    from Japan, tweets with any 'emoticons' value, and tweets whose
    'emoticons' field contains http:// or https://.  Results are written to
    a fixed text file and a JSON file on /Volumes/TerraFirma.
    """
    baseline_stats_hash = {}
    # epoch timestamp of 2005-01-01 00:00 local time
    day_one = time.strptime("01 01 2005", "%d %m %Y")
    day_one_ts = int(time.mktime(day_one))
    max_day_ctr = 1830
    day_ctr = 0
    while day_ctr < max_day_ctr:
        if day_ctr%100 == 0: print "on day ctr: ", day_ctr, " at time: ", time.time()
        # [curr_day_ts, next_day_ts] covers one 24h window (86400 s)
        curr_day_ts = day_one_ts + 86400*day_ctr
        next_day_ts = day_one_ts + 86400*(day_ctr+1)
        day_ctr+=1
        # NOTE(review): day_ctr is incremented *before* being used as the
        # stats key below, so the window for day offset N is stored under
        # key N+1 — confirm this 1-based keying is intended.

        range_filter = NumericRangeFilter.newIntRange("timestamp", Integer(curr_day_ts), Integer(next_day_ts), True, True)
        
        #all tweets in day range
        all_docs_query = MatchAllDocsQuery()
        tweets_in_range_search = searcher.search(all_docs_query, range_filter)
        num_tweets_in_range = tweets_in_range_search.length()

        #all tweets in day range US
        US_tweets_base_query = MatchAllDocsQuery()
        #us_escape_one = QueryParser("country", analyzer).escape("United")
        #us_escape_two = 
        us_query = TermQuery(Term("country", "United States"))
        #us_query.add(Term("country","United"))
        #us_query.add(Term("country","States"))
        US_tweets_country_query = us_query
        #US_tweets_country_query = QueryParser("country", analyzer).parse(us_query)
        US_tweets_query_filter = QueryFilter(US_tweets_country_query)
        # intersect the day window with the country restriction
        compound_filter_US_tweets = BooleanFilter()
        compound_filter_US_tweets.add(FilterClause(range_filter, BooleanClause.Occur.MUST))
        compound_filter_US_tweets.add(FilterClause(US_tweets_query_filter, BooleanClause.Occur.MUST))
        US_tweets_in_range_search = searcher.search(US_tweets_base_query, compound_filter_US_tweets)
        num_US_tweets_in_range = US_tweets_in_range_search.length()
        
        #all tweets in day range japan
        JP_tweets_base_query = MatchAllDocsQuery()
        JP_tweets_country_query = QueryParser("country", analyzer).parse("Japan")
        JP_tweets_query_filter = QueryFilter(JP_tweets_country_query)
        compound_filter_JP_tweets = BooleanFilter()
        compound_filter_JP_tweets.add(FilterClause(range_filter, BooleanClause.Occur.MUST))
        compound_filter_JP_tweets.add(FilterClause(JP_tweets_query_filter, BooleanClause.Occur.MUST))
        JP_tweets_in_range_search = searcher.search(JP_tweets_base_query, compound_filter_JP_tweets)
        num_JP_tweets_in_range = JP_tweets_in_range_search.length()
        #day_ctr%10 == 0: print "US tweets: ", num_US_tweets_in_range, " JP tweets: ", num_JP_tweets_in_range
        
        #all tweets containing emoticons
        # a PrefixQuery over an empty term matches any document that has a
        # value in the 'emoticons' field
        empty_term = Term("emoticons")
        empty_term_prefix = PrefixQuery(empty_term)
        all_emoticons_docs_query_filter = QueryFilter(empty_term_prefix)
        compound_filter = BooleanFilter()
        compound_filter.add(FilterClause(range_filter, BooleanClause.Occur.MUST))
        compound_filter.add(FilterClause(all_emoticons_docs_query_filter, BooleanClause.Occur.MUST))
        emoticon_tweets_in_range_search = searcher.search(all_docs_query, compound_filter)
        num_emoticon_tweets_in_range = emoticon_tweets_in_range_search.length()

        #all tweets containing "http" or "https"
        # escape the URL prefixes so '/' and ':' are not parsed as syntax
        bq = BooleanQuery()
        http_str = QueryParser.escape("http://")
        http_query = QueryParser("emoticons", analyzer).parse(http_str)
        https_str = QueryParser.escape("https://")
        https_query = QueryParser("emoticons", analyzer).parse(https_str)
        bq.add(http_query, BooleanClause.Occur.SHOULD)
        bq.add(https_query, BooleanClause.Occur.SHOULD)
        bq_search = searcher.search(bq, range_filter)
        num_http_emoticons = bq_search.length()
        
        baseline_stats_hash[day_ctr] = {'total tweets':num_tweets_in_range, 'emoticons':num_emoticon_tweets_in_range, 'http':num_http_emoticons, 'US tweets':num_US_tweets_in_range, \
                                        'JP tweets':num_JP_tweets_in_range}

    # dump the per-day stats both as whitespace-separated text and as JSON
    baseline_stats_text_file = open("/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_stats.txt","w")
    raw_stats_list = sorted(baseline_stats_hash.items(), key = lambda x: int(x[0]))
    baseline_stats_text_file.write("day total emoticons http US JP\n")
    for rs in raw_stats_list: baseline_stats_text_file.write("%s %s %s %s %s %s\n" %(rs[0], rs[1]["total tweets"], rs[1]["emoticons"], rs[1]["http"], rs[1]['US tweets'], \
                                                             rs[1]['JP tweets']))
    baseline_stats_text_file.close()
    baseline_stats_file = open("/Volumes/TerraFirma/SharedData/vdb5/emoticons_raw_files/emoticon_stats.json","w")
    baseline_stats_file.write(json.dumps(baseline_stats_hash))
    baseline_stats_file.close()