Example 1
    def testLowercasing(self):

        q = QueryParser(Version.LUCENE_CURRENT, "field",
                        self.analyzer).parse("PrefixQuery*")
        self.assertEqual("prefixquery*", q.toString("field"), "lowercased")

        qp = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
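        # by default the parser lowercases expanded terms such as prefix and wildcard queries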
        qp.setLowercaseExpandedTerms(False)
        q = qp.parse("PrefixQuery*")
        self.assertEqual("PrefixQuery*", q.toString("field"), "not lowercased")
Example 2
    def testPhraseQuery(self):

        analyzer = StandardAnalyzer(Version.LUCENE_24)
        q = QueryParser(Version.LUCENE_24, "field",
                        analyzer).parse('"This is Some Phrase*"')
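        # StandardAnalyzer lowercases the terms and drops the stop words "This" and "is"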
        self.assertEqual("\"some phrase\"", q.toString("field"), "analyzed")

        q = QueryParser(Version.LUCENE_CURRENT, "field",
                        self.analyzer).parse('"term"')
        self.assert_(TermQuery.instance_(q), "reduced to TermQuery")
Example 3
    def testSlop(self):

        q = QueryParser(Version.LUCENE_CURRENT, "field",
                        self.analyzer).parse('"exact phrase"')
        self.assertEqual("\"exact phrase\"", q.toString("field"), "zero slop")

        qp = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
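        # setPhraseSlop supplies a default slop for phrase queries lacking an explicit ~n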
        qp.setPhraseSlop(5)
        q = qp.parse('"sloppy phrase"')
        self.assertEqual("\"sloppy phrase\"~5", q.toString("field"),
                         "sloppy, implicitly")
Example 4
    def testWithQueryParser(self):

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            self.synonymAnalyzer).parse('"fox jumps"')
        topDocs = self.searcher.search(query, 50)
        # in Lucene 1.9, position increments are no longer ignored
        self.assertEqual(1, topDocs.totalHits, "!!!! what?!")

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            StandardAnalyzer(
                                Version.LUCENE_CURRENT)).parse('"fox jumps"')
        topDocs = self.searcher.search(query, 50)
        self.assertEqual(1, topDocs.totalHits, "*whew*")
Example 5
    def testStems(self):
        
        searcher = IndexSearcher(self.directory)
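        # the Porter stemmer reduces "laziness" to the stem "lazi", matching stemmed index terms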
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            self.porterAnalyzer).parse("laziness")
        topDocs = searcher.search(query, 50)

        self.assertEqual(1, topDocs.totalHits, "lazi")

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            self.porterAnalyzer).parse('"fox jumped"')
        topDocs = searcher.search(query, 50)

        self.assertEqual(1, topDocs.totalHits, "jump jumps jumped jumping")
Example 6
    def testTermRangeQuery(self):

        query = QueryParser(Version.LUCENE_CURRENT, "subject",
                            self.analyzer).parse("title2:[K TO N]")
        self.assert_(TermRangeQuery.instance_(query))

        scoreDocs = self.searcher.search(query, 10).scoreDocs
        self.assertHitsIncludeTitle(self.searcher, scoreDocs, "Mindstorms")

        # curly braces make the range ends exclusive, so "Mindstorms" itself is excluded
        query = QueryParser(Version.LUCENE_CURRENT, "subject",
                            self.analyzer).parse("title2:{K TO Mindstorms}")
        scoreDocs = self.searcher.search(query, 10).scoreDocs
        self.assertHitsIncludeTitle(self.searcher, scoreDocs, "Mindstorms",
                                    True)
Example 7
    def testQueryParser(self):

        searcher = IndexSearcher(self.directory, True)

        # "+" marks a required term and "-" a prohibited one
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            SimpleAnalyzer()).parse("+JUNIT +ANT -MOCK")
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))
        d = searcher.doc(scoreDocs[0].doc)
        self.assertEqual("Java Development with Ant", d.get("title"))

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            SimpleAnalyzer()).parse("mock OR junit")
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(2, len(scoreDocs), "JDwA and JIA")
Example 8
    def __init__(self, location):
        lucene.initVM()
        directory = SimpleFSDirectory(File(location))
        self.reader = IndexReader.open(directory, True)
        self.searcher = IndexSearcher(self.reader)
        self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                        WhitespaceAnalyzer())
Example 9
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command
        querys = BooleanQuery()
        command_dict = parseCommand(command)
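        # AND the per-field queries together: a document must match every clause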
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------------'
            print 'title:', doc.get('title')
            print 'url:', doc.get('url')
            print 'src:', doc.get('src')
Example 10
def luceneRetriver(query):

    lucene.initVM()

    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)

    my_query = QueryParser(Version.LUCENE_30, "text",
                           lucene_analyzer).parse(query)

    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)

    print "Hits: ", total_hits.totalHits

    for hit in total_hits.scoreDocs:
        print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, \
            "HitString:", hit.toString()
        doc = lucene_searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
Example 11
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        #command = 'Christian author:mark twain title:autobiography language:English'
        command = unicode(command, 'GBK')
        if command == '':
            return

        print
        print "Searching for:", command

        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            # explanation = searcher.explain(query, scoreDoc.doc)
            print "------------------------"
            print 'path:', doc.get("path")
            print 'name:', doc.get("name")
            print 'title:', doc.get('title')
            print 'author:', doc.get('author')
            print 'language:', doc.get('language')
Example 12
def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)
    nonDiverse = []
    docsToScores = {}
    #create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        #print(new_urls[str(hit.doc)])
        result = str(hit.score) + " " + str(hit.doc) + " " + hit.toString()
        if (len(nonDiverse) < 10):
            nonDiverse.append(new_urls[str(hit.doc)])
        #find the document that corresponds to the html website and append to a list for min distance
        website = new_urls[str(hit.doc)]
        #html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse
Example 13
def run(command, pageindex, pagesize):
    global searcher,analyzer

    print "Searching for:", command 
    querys = BooleanQuery()
    command_dict = parseCommand(command)
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k,
                            analyzer).parse(v)

        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 6000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    start = (pageindex - 1) * pagesize
    end = start + pagesize
    res = []
    for scoreDoc in scoreDocs[start:end+1]:
        doc = searcher.doc(scoreDoc.doc)
        r = []
        r.append(doc.get('title'))
        r.append(doc.get('url'))
        r.append(doc.get('src'))
        r.append(doc.get('alt').replace(' ', ''))
        res.append(r)
    return res,len(scoreDocs)
Example 14
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")  # read the query string
        command = unicode(command, 'GBK')
        # decode the query to Unicode (the files were also indexed as Unicode);
        # the console encoding is GBK in Python IDLE and UTF-8 in PyScripter-Portable (see Q.ppt)
        if command == '':
            return

        command = " ".join(jieba.cut(command))
        print
        print "Searching for:", command

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        # the analyzer tokenizes and linguistically processes the query string;
        # QueryParser parses it into a query syntax tree and returns it as a Query
        scoreDocs = searcher.search(query, 50).scoreDocs
        # IndexSearcher runs search() on the Query tree and returns the results
        print "%s total matching documents." % len(scoreDocs), '\n'

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print "------------------------"
            print 'path:', doc.get("path")
            print 'title:', doc.get("title")
            print 'url:', doc.get("url")
            print 'name:', doc.get("name")
Example 15
    def testPrefixQuery(self):

        parser = QueryParser(Version.LUCENE_CURRENT, "category",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        parser.setLowercaseExpandedTerms(False)
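        # preserve the capital letters in the prefix term instead of lowercasing it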

        print parser.parse("/Computers/technology*").toString("category")
Example 16
    def post(self):
      q = self.get_argument("query")

      # self.write(key)

    # def query(query):
      # query = self.get_argument("q")
      lucene.initVM()
      indexDir = "index"
      dir = SimpleFSDirectory(File(indexDir))
      analyzer = StandardAnalyzer(Version.LUCENE_30)
      searcher = IndexSearcher(dir)
      
      query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
      MAX = 10
      hits = searcher.search(query, MAX)
      
      print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
      items = []
      rQ = []
      
      # for key, value in doc_urls.iteritems():
      #     print (key, value)

      for hit in hits.scoreDocs:
          #items.append({'score':hit.score, 'doc':hit.doc, 'blah':hit.toString(), 'url':doc_urls[str(hit.doc)]})
          print hit.score, hit.doc, hit.toString()
          print(len(doc_urls))
          items.append(doc_urls[str(hit.doc)])
          doc = searcher.doc(hit.doc) 
          print(hit.doc)
        
      self.render("index.html", title="Results", items=items, query=q)
Example 17
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command  #朱莉与茱莉娅

        # final = jieba.cut(command)
        # query = QueryParser(Version.LUCENE_CURRENT, "contents",
        #                     analyzer).parse(' '.join(final))

        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            if (k == 'site'):
                t = Term('url', '*' + v.strip() + '*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k,
                                    analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 10).scoreDocs

        print "%s total matching documents." % len(scoreDocs)
        simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>",
                                                  "</font>")

        queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  analyzer).parse(command_dict['contents'])

        hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
        hlter.setTextFragmenter(SimpleFragmenter(500))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site')
            print 'title:', doc.get('title'),
            print 'url:', doc.get('url')
            ori_text = clear(doc.get('contents'))
            output = hlter.getBestFragment(analyzer, "contents", ori_text)
            print output
Example 18
    def testExactPhrase(self):

        searcher = IndexSearcher(self.directory, True)
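        # the analyzer removes the stop word "the" at index time, so the exact phrase cannot match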
        query = QueryParser(Version.LUCENE_24, "contents",
                            self.porterAnalyzer).parse('"over the lazy"')
        topDocs = searcher.search(query, 50)

        self.assertEqual(0, topDocs.totalHits, "exact match not found!")
Example 19
    def testWildcard(self):

        try:
            QueryParser(Version.LUCENE_CURRENT, "field",
                        self.analyzer).parse("*xyz")
            self.fail("Leading wildcard character should not be allowed")
        except:
            self.assert_(True)
Example 20
def run(command, pageindex=1, pagesize=15):
    global searcher, analyzer, old_command, old_res_list
    global STORE_DIR, directory
    if command == '':
        return

    print "Searching for:", command  #朱莉与茱莉娅

    # final = jieba.cut(command)
    # query = QueryParser(Version.LUCENE_CURRENT, "contents",
    #                     analyzer).parse(' '.join(final))

    querys = BooleanQuery()
    command_dict = parseCommand(command)
    for k, v in command_dict.iteritems():
        if (k == 'site'):
            t = Term('url', '*' + v.strip() + '*')
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 4000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    res_list = []
    simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>",
                                              "</font>")

    queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                              analyzer).parse(command_dict['contents'])

    hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
    hlter.setTextFragmenter(SimpleFragmenter(200))
    start = (pageindex - 1) * pagesize
    end = start + pagesize
    for scoreDoc in scoreDocs[start:end + 1]:
        doc = searcher.doc(scoreDoc.doc)
        res = []
        res.append(doc.get('title'))
        res.append(doc.get('url'))
        output = hlter.getBestFragment(analyzer, "contents",
                                       clear(doc.get('contents')))
        res.append(output)
        res_list.append(res)
    return res_list, len(scoreDocs)
Example 21
    def main(cls):

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            cls.synonymAnalyzer).parse('"fox jumps"')
        print "\"fox jumps\" parses to ", query.toString("content")

        print "From AnalyzerUtils.tokensFromAnalysis: "
        AnalyzerUtils.displayTokens(cls.synonymAnalyzer, "\"fox jumps\"")
        print ''
Example 22
    def query(self, title):
        self._th.attachCurrentThread()
        searcher = IndexSearcher(self._dir)
        query = QueryParser(Version.LUCENE_30, "title",
                            self._analyzer).parse(title)
        total_hits = searcher.search(query, 10)
        for hit in total_hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            return doc.get("title") + "\n" + doc.get("content") + \
                "--------------------------------"
        return "None"
Example 23
def extractFeatureQueryWords(query):
    import string
    from lucene import Document, TermQuery, Term

    # create analyzer
    aux_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # split the query up front so the except branch can still return the word list
    querywordlist = query.split()

    try:
        file = open('../features.txt', 'r')

        featurelist = []
        for line in file.readlines():
            words_in_line = line.split()
            featurelist += words_in_line

        featureQueryList = []
        productQueryList = []

        for word in querywordlist:
            if word in featurelist:
                featureQueryList.append(word)
            else:
                # create parser for word
                aux_parser = QueryParser(Version.LUCENE_CURRENT, "title",
                                         aux_analyzer)
                aux_query = aux_parser.parse(word)
                scoreDocs = searcher.search(aux_query, 50).scoreDocs
                if scoreDocs:
                    productQueryList.append(word)

        featureQuery = ""
        if featureQueryList:
            featureQuery = "("
            for i in range(len(featureQueryList)):
                if i == len(featureQueryList) - 1:
                    featureQuery += featureQueryList[i] + ")"
                else:
                    featureQuery += featureQueryList[i] + " AND "

            print featureQuery

        productQuery = ""
        if productQueryList:
            productQuery = "("
            for i in range(len(productQueryList)):
                if i == len(productQueryList) - 1:
                    productQuery += productQueryList[i] + ")"
                else:
                    productQuery += productQueryList[i] + " AND "

        return (featureQuery, productQuery, featureQueryList, productQueryList)
    except Exception, ex:
        print "Could not separate feature query words. Reason: ", ex
        return ("", "(" + query + ")", [], querywordlist)
Example 24
    def testBasicQueryParser(self):

        analyzer = SimpleAnalyzer()
        query = QueryParser(Version.LUCENE_CURRENT, "description",
                            analyzer).parse("partnum:Q36 AND SPACE")

        scoreDocs = self.searcher.search(query, 50).scoreDocs
        self.assertEqual("+partnum:q +space", query.toString("description"),
                         "note Q36 -> q")
        self.assertEqual(0, len(scoreDocs), "doc not found :(")
Example 25
    def testParseException(self):

        try:
            QueryParser(Version.LUCENE_CURRENT, "contents",
                        self.analyzer).parse("^&#")
        except:
            # expression is invalid, as expected
            self.assert_(True)
        else:
            self.fail("ParseException expected, but not thrown")
Example 26
    def testPerFieldAnalyzer(self):

        # route the partnum field to KeywordAnalyzer so "Q36" stays a single untouched token
        analyzer = PerFieldAnalyzerWrapper(SimpleAnalyzer())
        analyzer.addAnalyzer("partnum", KeywordAnalyzer())

        query = QueryParser(Version.LUCENE_CURRENT, "description",
                            analyzer).parse("partnum:Q36 AND SPACE")
        scoreDocs = self.searcher.search(query, 50).scoreDocs

        #self.assertEqual("+partnum:Q36 +space", query.toString("description"))
        self.assertEqual(1, len(scoreDocs), "doc found!")
Example 27
    def getCrowds(self, query, field=CrowdFields.text):
        searcher = IndexSearcher(self.index, True)
        q = QueryParser(Version.LUCENE_CURRENT, field,
                        self.analyzer).parse(query)
        collector = TopScoreDocCollector.create(hitsPerPage, True)
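        # collect the top hitsPerPage scoring documents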
        searcher.search(q, collector)
        hits = collector.topDocs().scoreDocs

        return [
            searcher.doc(scoreDoc.doc).get(CrowdFields.id) for scoreDoc in hits
        ]
Example 28
    def testGrouping(self):

        query = QueryParser(
            Version.LUCENE_CURRENT, "subject",
            self.analyzer).parse("(agile OR extreme) AND methodology")
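        # parentheses group the OR clause so that AND applies to the whole group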
        scoreDocs = self.searcher.search(query, 50).scoreDocs

        self.assertHitsIncludeTitle(self.searcher, scoreDocs,
                                    "Extreme Programming Explained")
        self.assertHitsIncludeTitle(self.searcher, scoreDocs,
                                    "The Pragmatic Programmer")
Example 29
    def testAnalyzer(self):

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryString = "category:/philosophy/eastern"

        parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        parser.setAutoGeneratePhraseQueries(True)
        query = parser.parse(queryString)

        self.assertEqual("category:\"philosophy eastern\"",
                         query.toString("contents"), "path got split, yikes!")

        perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
        perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
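        # WhitespaceAnalyzer keeps "/philosophy/eastern" as one token, so no phrase query is generated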
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            perFieldAnalyzer).parse(queryString)

        self.assertEqual("category:/philosophy/eastern",
                         query.toString("contents"),
                         "leave category field alone")
Example 30
    def TotalSearch(self, keyWord):
        try:
            searcher = IndexSearcher(self.indexDir)
            keyWord = keyWord.encode('utf8')
            query = QueryParser(Version.LUCENE_30, "title",
                                self.analyzer).parse(keyWord)

            hits = searcher.search(query, 1000)
            return self.__MakeResultFormat(hits, searcher)
        except Exception, err:
            sys.stderr.write("ERROR: %s\n" % str(err))