Exemple #1
0
def luceneRetriver(query):

    lucene.initVM()

    indir = SimpleFSDirectory(File(INDEXDIR))

    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)

    lucene_searcher = IndexSearcher(indir)

    my_query= QueryParser(Version.LUCENE_30,"text",\

    lucene_analyzer).parse(query)

    MAX = 1000

    total_hits = lucene_searcher.search(my_query, MAX)

    print "Hits: ", total_hits.totalHits

    for hit in total_hits.scoreDocs:

        print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "HitString:", hit.toString(
        )

        doc = lucene_searcher.doc(hit.doc)

        print doc.get("text").encode("utf-8")
Exemple #2
0
    def post(self):
      q= self.get_argument("query")

      # self.write(key)

    # def query(query):
      # query = self.get_argument("q")
      lucene.initVM()
      indexDir = "index"
      dir = SimpleFSDirectory(File(indexDir))
      analyzer = StandardAnalyzer(Version.LUCENE_30)
      searcher = IndexSearcher(dir)
      
      query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
      MAX = 10
      hits = searcher.search(query, MAX)
      
      print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
      items = []
      rQ = []
      
      #for key, value in doc_urls.iteritems() 
       # print (key, value)

      for hit in hits.scoreDocs:
          #items.append({'score':hit.score, 'doc':hit.doc, 'blah':hit.toString(), 'url':doc_urls[str(hit.doc)]})
          print hit.score, hit.doc, hit.toString()
          print(len(doc_urls))
          items.append(doc_urls[str(hit.doc)])
          doc = searcher.doc(hit.doc) 
          print(hit.doc)
        
      self.render("index.html", title="Results", items=items, query=q)
Exemple #3
0
    def setUp(self):

        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(
            Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))
Exemple #4
0
	def retrieve( self, query, max_res = 10 ):
		lucene.initVM()
		inDir = SimpleFSDirectory( File( self.INDEX_DIR ) )
		lucene_analyzer = StandardAnalyzer( Version.LUCENE_30 )
		lucene_searcher = IndexSearcher( inDir )
		my_query = QueryParser( Version.LUCENE_30, 'content' , lucene_analyzer ).parse( query )
		MAX = max_res
		total_hits = lucene_searcher.search( my_query, MAX )
		res_head = '{"query":"' + query + '","results":['
		res_tail = ']}'
		result = res_head
		hits = total_hits.totalHits
		if ( hits > 0 ):
			res_body = ''
			it = 0
			for hit in total_hits.scoreDocs:
				it += 1
				doc = lucene_searcher.doc( hit.doc )
				res_body += '{"rank":' +\
							str( it ) +\
							',"score":"' +\
							str( hit.score ) +\
							'","title":"' +\
							doc.get( 'title' ).encode('utf-8') +\
							'","id":"' +\
							doc.get( 'id' ).encode('utf-8') +\
							'"}'
				if ( it < hits ):
					res_body += ','
			result += res_body
		result += res_tail
		return result
Exemple #5
0
	def document( self, docId, max_res = 1 ):
		lucene.initVM()
		inDir = SimpleFSDirectory( File( self.INDEX_DIR ) )
		lucene_analyzer = StandardAnalyzer( Version.LUCENE_30 )
		lucene_searcher = IndexSearcher( inDir )
		my_query = QueryParser( Version.LUCENE_30, 'id' , lucene_analyzer ).parse( docId )
		MAX = max_res
		total_hits = lucene_searcher.search( my_query, MAX )
		result = '{'
		hits = total_hits.totalHits
		if ( hits == 1 ):
			for hit in total_hits.scoreDocs:
				doc = lucene_searcher.doc( hit.doc )
				result += '"id":"' +\
						  doc.get( 'id' ) +\
						  '","title":"' +\
						  doc.get( 'title' ) +\
						  '","abstract":"' +\
						  doc.get( 'abstract' ) +\
						  '","keyword":"' +\
						  doc.get( 'keyword' ) +\
						  '","content":"' +\
						  doc.get( 'content' ) +\
						  '","authors":"' +\
						  doc.get( 'authors' ) +\
						  '"'
		result += '}'
		return result
Exemple #6
0
class OccuredCandidates:
	indexDir = 'data/index'
	max_candidates = 30

	def __init__(self):
		lucene.initVM()
		self._lversion = Version.LUCENE_30
		self._analyzer = EnglishAnalyzer(self._lversion)
		self._searcher = IndexSearcher(SimpleFSDirectory(File(self.indexDir)))

		self._translation = loadTranslation()
		self._links = loadLinks()

	def find(self, phrase):
		phrase = phrase.lower().encode('utf8')
		query = ' '.join(['+'+ word for word in phrase.split(' ')]);
		query = QueryParser(self._lversion, 'contents', self._analyzer).parse(query)
		hits = self._searcher.search(query, self.max_candidates)

		# if not hits.totalHits: print "%d documents for '%s'" % (hits.totalHits, str(query)) # potential bug

		# todo put article_id in lucene index instead of translating document title

		links = {}
		for hit in hits.scoreDocs:
			title = quote(self._searcher.doc(hit.doc).get("title").encode('utf-8').replace(' ', '_')).replace('%28', '(').replace('%29', ')')
			if title in self._translation:
				links[self._translation[title]] = hit.score
			# else: print title # potential bug

		return self._links[phrase].get(-1, 0), links

	def clear_links(self, annotations):
		return filter(lambda annotation: annotation['links'] and max(annotation['links'].values()) > 1, annotations)
def lucene_search(index_dir, limit, query_text):
    '''
    lucene_search: Search a built index and return upto limit number of responses 
    Arguments: Input index folder, limit value of results returned, query(as string)
    Returns: paths of responsive files as list
    '''
    
    logging.basicConfig(file=os.path.join(index_dir,"lucene_search.log"))
    logger.info("Initializing search....")
    lucene.initVM()
    logger.info("Reading index from "+index_dir)
    index = SimpleFSDirectory(File(index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_30) #Lucene version used to generate index
    searcher = IndexSearcher(index)
    
    logger.info("Parsing query :"+ query_text)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query_text)
    hits = searcher.search(query, limit)

    logger.info("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
    hit_paths = []

    for hit in hits.scoreDocs:
        # The following code also generates score for responsive/found documents and the 
        # content index which matched
        # print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        hit_paths.append(doc.get("path"))
    
    return hit_paths 
Exemple #8
0
def luceneRetriver(query):

	lucene.initVM()

	indir = SimpleFSDirectory(File(INDEXDIR))

	lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)

	lucene_searcher = IndexSearcher(indir)

	my_query = QueryParser(Version.LUCENE_30,"text",\

	lucene_analyzer).parse(query)

	MAX = 1000

	total_hits = lucene_searcher.search(my_query,MAX)

	print "Hits: ",total_hits.totalHits

	for hit in total_hits.scoreDocs:

		print "Hit Score: ",hit.score, "Hit Doc:",hit.doc, "Hit String:",hit.toString()

		doc = lucene_searcher.doc(hit.doc)

		print doc.get("text").encode("utf-8")
Exemple #9
0
    def searchWithRequestAndQuery(cls, query, indexReader, taxoReader,
                                  indexingParams, facetRequest):
        """
        Search an index with facets for given query and facet requests.
        returns a List<FacetResult>
        """
        # prepare searcher to search against
        searcher = IndexSearcher(indexReader)
        # collect matching documents into a collector
        topDocsCollector = TopScoreDocCollector.create(10, True)
        if not indexingParams:
            indexingParams = DefaultFacetIndexingParams()

        # Faceted search parameters indicate which facets are we interested in
        facetSearchParams = FacetSearchParams(indexingParams)
        # Add the facet request of interest to the search params
        facetSearchParams.addFacetRequest(facetRequest)
        facetsCollector = FacetsCollector(facetSearchParams, indexReader,
                                          taxoReader)
        # perform documents search and facets accumulation
        searcher.search(
            query, MultiCollector.wrap([topDocsCollector, facetsCollector]))
        # Obtain facets results and print them
        res = facetsCollector.getFacetResults()
        i = 0
        for facetResult in res:
            print "Result #%d has %d descendants" % (
                i, facetResult.getNumValidDescendants())
            print "Result #%d : %s" % (i, facetResult)
            i += 1

        return res
def get_doc_details(doc_id, lucene_index_dir):
    '''
    This function gets a file's details from 
    the lucene index. 
    
    Arguments: 
        doc_id - file id
        lucene_index_dir - lucene index directory 
    
    Returns: 
        file details as a list 
    '''
    
    store = SimpleFSDirectory(File(lucene_index_dir))
    searcher = IndexSearcher(store, True)
    
    doc = searcher.doc(doc_id)
    table = dict((field.name(), field.stringValue())
                 for field in doc.getFields())
    row = []
    metadata = MetadataType._types
    for field in metadata:
        if table.get(field,'empty') != 'empty' :
            row.append(table.get(field,'empty'))
        else: 
            row.append('')
    row.append(str(table.get(MetadataType.FILE_ID,'empty')))

    return row 
Exemple #11
0
def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)
    nonDiverse = []
    docsToScores = {}
    #create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        #print(new_urls[str(hit.doc)])
        result = str(hit.score) + " " + str(hit.doc) + " " + hit.toString()
        if (len(nonDiverse) < 10):
            nonDiverse.append(new_urls[str(hit.doc)])
        #find the document that corresponds to the html website and append to a list for min distance
        website = new_urls[str(hit.doc)]
        #html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse
def run(writer, analyzer):
	while True:
		print 
		print "Hit enter with no input to quit."
		command = raw_input("Query:")
		if command == '':
			return

		print "Searching for:", command
		IndexReader = writer.getReader()
		searcher = IndexSearcher(IndexReader)
		#query = QueryParser(Version.LUCENE_CURRENT, "hashtag", analyzer).parse(command)
		#scoreDocs = searcher.search(query, 50).scoreDocs
		wildquery = command + "*"
		term = Term("hashtag", wildquery)
		query = WildcardQuery(term)
		scoreDocs = searcher.search(query, 5).scoreDocs
		print "%s total matching documents." % len(scoreDocs)
		
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			
			score = ( len(command) / len(doc.get("hashtag")) ) * scoreDoc.score
			print 'tweet:', doc.get("contents")
			print 'user_name:', doc.get("user_name")
			print 'when', doc.get("creation_date")
def get_indexed_file_details(ts_results, lucene_index_dir):
    '''
    This function gets each files details from the lucene 
    index. 
    
    Arguments: 
        ts_results - topic search results, each item contains 
                     [file id, root, file name, similarity score]
        lucene_index_dir - lucene index directory 
    
    Returns: 
        file details in a list 
    '''
    
    store = SimpleFSDirectory(File(lucene_index_dir))
    searcher = IndexSearcher(store, True)
    
    rows = []
    for rs in ts_results:
        doc = searcher.doc(rs[0])
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field,'empty') != 'empty' :
                row.append(table.get(field,'empty'))
            else: 
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID,'empty')))
        row.append(str(rs[3])) # similarity score
        
        rows.append(row)
    
    return rows
 def __init__(self, location):
     lucene.initVM()
     directory = SimpleFSDirectory(File(location))
     self.reader = IndexReader.open(directory, True)
     self.searcher = IndexSearcher(self.reader)
     self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                     WhitespaceAnalyzer())
Exemple #15
0
def search(r, keyword=""):
    import logging

    logger = logging.getLogger("search")
    bench = Benchmark(logger)
    from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit
    import lucene, os

    os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17"
    lucene.initVM(lucene.CLASSPATH)

    directory = FSDirectory.open(File(CONFIG.INDEX_PATH))
    ROBOT_INDEX = IndexSearcher(directory, True)
    ROBOT_ANALYZER = StandardAnalyzer()

    keyword = keyword or r.GET["keyword"]
    query = QueryParser("context", ROBOT_ANALYZER)
    query = query.parse('"%s"' % keyword)

    bench.start_mark("search")
    hits = ROBOT_INDEX.search(query)
    count = len(hits)
    result = []
    i = 0
    for hit in hits:
        i += 1
        if i > 100:
            break
        doc = Hit.cast_(hit).getDocument()
        result.append(SearchResult(doc, i, keyword))
    ROBOT_INDEX.close()

    et = bench.stop_mark()

    return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
Exemple #16
0
    def testChinese(self):

        searcher = IndexSearcher(self.directory, True)
        query = TermQuery(Term("contents", "道"))
        scoreDocs = searcher.search(query, 50).scoreDocs

        self.assertEqual(1, len(scoreDocs), "tao")
Exemple #17
0
    def query(indexName, queryString):

        indSearcher = IndexSearcher(SimpleFSDirectory(File(indexName)))
        qp = QueryParser(Version.LUCENE_CURRENT, "content", StandardAnalyzer(Version.LUCENE_CURRENT))
        qp.setDefaultOperator(qp.Operator.AND)
         
        query = qp.parse(queryString.replace("-","_"))
                
        aux = indSearcher.search(query, 100)
        results = aux.scoreDocs
        hits = aux.totalHits
        
        ir = indSearcher.getIndexReader()

        #results = collector.topDocs()
        i = 0

        res = []
    
        for r in results:        
            doc = ir.document(i)
            res.insert(i, doc.get('id'))
            i+=1
            
        return res
    def testKeyword(self):

        searcher = IndexSearcher(self.directory, True)
        t = Term("isbn", "1930110995")
        query = TermQuery(t)
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs), "JUnit in Action")
    def testKeyword(self):

        searcher = IndexSearcher(self.directory, True)
        t = Term("isbn", "1930110995")
        query = TermQuery(t)
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs), "JUnit in Action")
    def testTermRangeQuery(self):

        searcher = IndexSearcher(self.directory, True)
        query = TermRangeQuery("title2", "d", "j", True, True)

        topDocs = searcher.search(query, 100)
        self.assertEqual(3, topDocs.totalHits)
        searcher.close()
Exemple #21
0
    def testExactPhrase(self):

        searcher = IndexSearcher(self.directory, True)
        query = QueryParser(Version.LUCENE_24, "contents",
                            self.porterAnalyzer).parse('"over the lazy"')
        topDocs = searcher.search(query, 50)

        self.assertEqual(0, topDocs.totalHits, "exact match not found!")
Exemple #22
0
    def testTermRangeQuery(self):

        searcher = IndexSearcher(self.directory, True)
        query = TermRangeQuery("title2", "d", "j", True, True)

        topDocs = searcher.search(query, 100)
        self.assertEqual(3, topDocs.totalHits)
        searcher.close()
Exemple #23
0
    def __init__(self):
        lucene.initVM()
        self._lversion = Version.LUCENE_30
        self._analyzer = EnglishAnalyzer(self._lversion)
        self._searcher = IndexSearcher(SimpleFSDirectory(File(self.indexDir)))

        self._translation = loadTranslation()
        self._links = loadLinks()
    def main(cls, argv):

        if len(argv) != 2:
            print "Usage: BerkeleyDbSearcher <index dir>"
            return

        dbHome = argv[1]

        env = DBEnv()
        env.set_flags(DB_LOG_INMEMORY, 1);
        if os.name == 'nt':
            env.set_cachesize(0, 0x4000000, 1)
        elif os.name == 'posix':
            from commands import getstatusoutput
            if getstatusoutput('uname') == (0, 'Linux'):
                env.set_cachesize(0, 0x4000000, 1)

        env.open(dbHome, (DB_THREAD |
                          DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN), 0)

        index = DB(env)
        blocks = DB(env)
        txn = None

        try:
            txn = env.txn_begin(None)
            index.open(filename = '__index__', dbtype = DB_BTREE,
                       flags = DB_THREAD, txn = txn)
            blocks.open(filename = '__blocks__', dbtype = DB_BTREE,
                        flags = DB_THREAD, txn = txn)
        except:
            if txn is not None:
                txn.abort()
                txn = None
            raise
        else:
            txn.commit()
            txn = None

        try:
            txn = env.txn_begin(None)
            directory = DbDirectory(txn, index, blocks, 0)
            searcher = IndexSearcher(directory, True)

            topDocs = searcher.search(TermQuery(Term("contents", "fox")), 50)
            print topDocs.totalHits, "document(s) found"
            searcher.close()
        except:
            if txn is not None:
                txn.abort()
                txn = None
            raise
        else:
            txn.abort()

            index.close()
            blocks.close()
            env.close()
Exemple #25
0
    def __init__(self, user_loc_string, debug=False):
        analyzer = StopAnalyzer()
        fields = ['name', 'alternate_names', 'state']
        directory = FSDirectory.getDirectory("index")

        self.DEBUG = debug
        self.searcher = IndexSearcher(directory)
        self.parser = MultiFieldQueryParser(fields, analyzer)
        self.user_location = self.doSearch(user_loc_string)
 def query(self,title):
     self._th.attachCurrentThread()
     searcher = IndexSearcher(self._dir)
     query=QueryParser(Version.LUCENE_30, "title", self._analyzer).parse(title)
     total_hits = searcher.search(query, 10)
     for hit in total_hits.scoreDocs:
         doc = (searcher.doc(hit.doc))
         return doc.get("title")+"\n"+doc.get("content")+"--------------------------------"
     return "None"
    def getHitCount(self, fieldName, searchString):

        searcher = IndexSearcher(self.dir, True)
        t = Term(fieldName, searchString)
        query = TermQuery(t)
        hitCount = len(searcher.search(query, 50).scoreDocs)
        searcher.close()

        return hitCount
class MultiPhraseQueryTest(TestCase):

    def setUp(self):

        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc1 = Document()
        doc1.add(Field("field", "the quick brown fox jumped over the lazy dog",
                       Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc1)

        doc2 = Document()
        doc2.add(Field("field", "the fast fox hopped over the hound",
                       Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc2)
        writer.close()

        self.searcher = IndexSearcher(directory, True)

    def testBasic(self):
        
        query = MultiPhraseQuery()
        query.add([Term("field", "quick"),
                   Term("field", "fast")])
        query.add(Term("field", "fox"))
        print query

        topDocs = self.searcher.search(query, 10)
        self.assertEqual(1, topDocs.totalHits, "fast fox match")

        query.setSlop(1);
        topDocs = self.searcher.search(query, 10)
        self.assertEqual(2, topDocs.totalHits, "both match");

    def testAgainstOR(self):

        quickFox = PhraseQuery()
        quickFox.setSlop(1)
        quickFox.add(Term("field", "quick"))
        quickFox.add(Term("field", "fox"))

        fastFox = PhraseQuery()
        fastFox.add(Term("field", "fast"))
        fastFox.add(Term("field", "fox"))

        query = BooleanQuery()
        query.add(quickFox, BooleanClause.Occur.SHOULD)
        query.add(fastFox, BooleanClause.Occur.SHOULD)
        topDocs = self.searcher.search(query, 10)
        self.assertEqual(2, topDocs.totalHits)

    def debug(self, hits):

        for i, doc in hits:
            print "%s: %s" %(hits.score(i), doc['field'])
Exemple #29
0
class MultiPhraseQueryTest(TestCase):
    def setUp(self):

        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc1 = Document()
        doc1.add(
            Field("field", "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc1)

        doc2 = Document()
        doc2.add(
            Field("field", "the fast fox hopped over the hound",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc2)
        writer.close()

        self.searcher = IndexSearcher(directory, True)

    def testBasic(self):

        query = MultiPhraseQuery()
        query.add([Term("field", "quick"), Term("field", "fast")])
        query.add(Term("field", "fox"))
        print query

        topDocs = self.searcher.search(query, 10)
        self.assertEqual(1, topDocs.totalHits, "fast fox match")

        query.setSlop(1)
        topDocs = self.searcher.search(query, 10)
        self.assertEqual(2, topDocs.totalHits, "both match")

    def testAgainstOR(self):

        quickFox = PhraseQuery()
        quickFox.setSlop(1)
        quickFox.add(Term("field", "quick"))
        quickFox.add(Term("field", "fox"))

        fastFox = PhraseQuery()
        fastFox.add(Term("field", "fast"))
        fastFox.add(Term("field", "fox"))

        query = BooleanQuery()
        query.add(quickFox, BooleanClause.Occur.SHOULD)
        query.add(fastFox, BooleanClause.Occur.SHOULD)
        topDocs = self.searcher.search(query, 10)
        self.assertEqual(2, topDocs.totalHits)

    def debug(self, hits):

        for i, doc in hits:
            print "%s: %s" % (hits.score(i), doc['field'])
Exemple #30
0
def search(command=command1):
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    res = searcher.search(query, 1000000)
    print 'Total hits:', res.totalHits
#    return searcher, res
    return [searcher.doc(doc.doc) for doc in res.scoreDocs[:20]]
    def getHitCount(self, fieldName, searchString):

        searcher = IndexSearcher(self.dir, True)
        t = Term(fieldName, searchString)
        query = TermQuery(t)
        hitCount = len(searcher.search(query, 50).scoreDocs)
        searcher.close()

        return hitCount
Exemple #32
0
 def getCrowds(self, query, field = CrowdFields.text): 
     searcher = IndexSearcher(self.index, True)
     q = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(query)
     collector = TopScoreDocCollector.create(hitsPerPage, True)
     searcher.search(q, collector)
     hits = collector.topDocs().scoreDocs
     
     return [
         searcher.doc(scoreDoc.doc).get(CrowdFields.id)
         for scoreDoc in hits]
def SearchFiles(command):
    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    # print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    rankedfiles = run(searcher, analyzer, command)
    searcher.close()
    return rankedfiles
Exemple #34
0
 def TotalSearch(self, keyWord):
     try:
         searcher = IndexSearcher(self.indexDir)
         keyWord = keyWord.encode('utf8')
         query = QueryParser(Version.LUCENE_30, "title", self.analyzer).parse(keyWord)
         
         hits = searcher.search(query, 1000)
         return self.__MakeResultFormat(hits, searcher)
     except Exception, err:
         sys.stderr.write("ERROR: %s\n"% str(err))
Exemple #35
0
def search(command):
    STORE_DIR = "index"
    vm_env = initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    result = run(searcher, analyzer, command)
    searcher.close()
    return result
Exemple #36
0
def Searchfile(command, prior, page, RPP):
    STORE_DIR = "index_ans"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    store = run(searcher, analyzer, command, prior)
    searcher.close()
    start = (page - 1) * RPP
    end = start + RPP

    return store[start:end], len(store)
class LuceneSearch(object):
    def __init__(self):
        STORE_DIR = "index"
        initVM()
        print 'lucene', VERSION
        self.directory = SimpleFSDirectory(File(STORE_DIR))
        print self.directory
        self.searcher = IndexSearcher(self.directory, True)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    def close(self):
        self.searcher.close()
    
    def raw_search(self, query_string):
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            self.analyzer).parse(query_string)
        scoreDocs = self.searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        matches = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            #print 'doc matched = ', dir(doc)
            contents = LuceneDoc.load(doc.get('name'))
            matches.append({'contents' : contents, 'doc' : doc})
        return matches
           
    def search(self, query):
        matches = self.raw_search(query)
        results = ''
        if len(matches) > 0:
            results += str(len(matches))+" results <br/>"
            for match in matches:
                results += '<a href='+str(match['contents']['dealUrl'])+'>'+str(match['contents']['merchant'])+'</a><br />'
                results += '<p>'+str(match['contents']['shortAnnouncementTitle'])+','+str(match['contents']['redemptionLocation'])+'</p><br/>'
        else:
            results = "0 results <br/>"
        return results
        
    def cli_search(self):
        while True:
            print
            print "Hit enter with no input to quit."
            command = raw_input("Query:")
            if command == '':
                return
            matches = self.raw_search(command)
            print
            print "Searching for:", command
            
            for match in matches:
                print match['contents']['dealUrl']
                print match['contents']['merchant'], ',', match['contents']['redemptionLocation'], ', ', match['contents']['div']
                print match['contents']['shortAnnouncementTitle']
                print '-'*80
    def testExclusive(self):

        searcher = IndexSearcher(self.directory, True)
        # pub date of TTC was October 1988
        query = NumericRangeQuery.newIntRange("pubmonth",
                                              Integer(198805),
                                              Integer(198810),
                                              False, False)
        topDocs = searcher.search(query, 100)
        self.assertEqual(0, topDocs.totalHits)
        searcher.close()
 def GET(self):
     form1 = login()
     user_data = web.input()
     vm_env.attachCurrentThread()
     STORE_DIR = "F:\\imgindex"
     directory = SimpleFSDirectory(File(STORE_DIR))
     searcher = IndexSearcher(directory, True)
     analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
     a,b,c,d,e = img_func(user_data.keyword,searcher,analyzer)
     searcher.close()
     return render.img_result(form1,a,b,c,d,e)
    def testExclusive(self):

        searcher = IndexSearcher(self.directory, True)
        # pub date of TTC was October 1988
        query = NumericRangeQuery.newIntRange("pubmonth",
                                              198805,
                                              198810,
                                              False, False)
        topDocs = searcher.search(query, 100)
        self.assertEqual(0, topDocs.totalHits)
        searcher.close()
Exemple #41
0
    def getCrowds(self, query, field=CrowdFields.text):
        searcher = IndexSearcher(self.index, True)
        q = QueryParser(Version.LUCENE_CURRENT, field,
                        self.analyzer).parse(query)
        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(q, collector)
        hits = collector.topDocs().scoreDocs

        return [
            searcher.doc(scoreDoc.doc).get(CrowdFields.id) for scoreDoc in hits
        ]
Exemple #42
0
    def TotalSearch(self, keyWord):
        try:
            searcher = IndexSearcher(self.indexDir)
            keyWord = keyWord.encode('utf8')
            query = QueryParser(Version.LUCENE_30, "title",
                                self.analyzer).parse(keyWord)

            hits = searcher.search(query, 1000)
            return self.__MakeResultFormat(hits, searcher)
        except Exception, err:
            sys.stderr.write("ERROR: %s\n" % str(err))
def pesquisar_com_lucene():
    initVM()
    #print 'lucene', VERSION

    # Get handle to index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Creates a searcher searching the provided index.
    ireader  = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)

    for query in querys:
        query_number =  query.query_number
        # Constructs a query parser. We specify what field to search into.
        query.query_text = query.query_text.replace('?','')
        query.query_text = query.query_text.replace('*','')
        queryParser = QueryParser(Version.LUCENE_CURRENT,
                                  FIELD_CONTENTS, analyzer)

        # Create the query
        query = queryParser.parse(query.query_text)

        # Run the query and get top 50 results
        topDocs = searcher.search(query,50000)

        # Get top hits
        scoreDocs = topDocs.scoreDocs

        r = resultado_query(query_number,scoreDocs)
        resultados.append(r)
        #print "%s total matching documents." % len(scoreDocs)
        #for scoreDoc in scoreDocs:
        #    doc = searcher.doc(scoreDoc.doc)
        #    print doc.get(FIELD_PATH)

    with open('resultados_da_busca/resultados.csv', 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in resultados:
            resultados_da_row = []
            i = 1
            for resultado_da_query in row.query_results:
                doc = searcher.doc(resultado_da_query.doc)
                resultados_da_row.append((i,int(doc.get(FIELD_PATH))))
                i = i + 1
            spamwriter.writerow([row.query_number,resultados_da_row])
Exemple #44
0
def begining(command):
    STORE_DIR = "index"
    global vm_env
    vm_env = initVM()
    vm_env.attachCurrentThread()
    #print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = lucene.WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    a = run(command, searcher, analyzer)
    searcher.close()
    return a
Exemple #45
0
    def LatestSearch(self):
        try:
            searcher = IndexSearcher(self.indexDir)
            today = time.strftime('%Y%m%d')
            keyWord = today.encode('utf8')
            print keyWord
            query = QueryParser(Version.LUCENE_30, "regDate", self.analyzer).parse(keyWord)

            hits = searcher.search(query, 1000)
            return self.__MakeResultFormat(hits, searcher)
        except:
            print 'BookSearcher TotalSearch Exception'
Exemple #46
0
    def testWithSlop(self):

        searcher = IndexSearcher(self.directory, True)

        parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                             self.porterAnalyzer)
        parser.setPhraseSlop(1)

        query = parser.parse('"over the lazy"')
        topDocs = searcher.search(query, 50)

        self.assertEqual(1, topDocs.totalHits, "hole accounted for")
Exemple #47
0
    def LatestSearch(self):
        try:
            searcher = IndexSearcher(self.indexDir)
            today = time.strftime('%Y%m%d')
            keyWord = today.encode('utf8')
            print keyWord
            query = QueryParser(Version.LUCENE_30, "regDate",
                                self.analyzer).parse(keyWord)

            hits = searcher.search(query, 1000)
            return self.__MakeResultFormat(hits, searcher)
        except:
            print 'BookSearcher TotalSearch Exception'
    def testTerm(self):

        searcher = IndexSearcher(self.directory, True)
        t = Term("subject", "ant")
        query = TermQuery(t)
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs), "JDwA")

        t = Term("subject", "junit")
        scoreDocs = searcher.search(TermQuery(t), 50).scoreDocs
        self.assertEqual(2, len(scoreDocs))

        searcher.close()
Exemple #49
0
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, self.synonymAnalyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("content", "The quick brown fox jumps over the lazy dogs",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
    def testAnd(self):

        searchingBooks = TermQuery(Term("subject", "search"))
        books2004 = NumericRangeQuery.newIntRange("pubmonth", Integer(200401),
                                                  Integer(200412), True, True)

        searchingBooks2004 = BooleanQuery()
        searchingBooks2004.add(searchingBooks, BooleanClause.Occur.MUST)
        searchingBooks2004.add(books2004, BooleanClause.Occur.MUST)

        searcher = IndexSearcher(self.directory, True)
        scoreDocs = searcher.search(searchingBooks2004, 50).scoreDocs

        self.assertHitsIncludeTitle(searcher, scoreDocs, "Lucene in Action")
Exemple #51
0
    def testSpecifiedOperator(self):

        MUST = BooleanClause.Occur.MUST
        query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,
                                            "development",
                                            ["title", "subject"], [MUST, MUST],
                                            SimpleAnalyzer())

        searcher = IndexSearcher(self.directory, True)
        scoreDocs = searcher.search(query, 50).scoreDocs

        self.assertHitsIncludeTitle(searcher, scoreDocs,
                                    "Java Development with Ant")
        self.assertEqual(1, len(scoreDocs), "one and only one")
    def testIdRangeQuery(self):

        parser = CustomQueryParser("field", self.analyzer)

        query = parser.parse("id:[37 TO 346]")
        self.assertEqual("id:[0000000037 TO 0000000346]",
                         query.toString("field"), "padded")

        searcher = IndexSearcher(self.directory, True)
        scoreDocs = searcher.search(query, 1000).scoreDocs
        self.assertEqual(310, len(scoreDocs))

        print parser.parse("special:[term TO *]")
        print parser.parse("special:[* TO term]")
    def setUp(self):

        # set up sample document
        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(
            Field("field", "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(directory)
Exemple #54
0
    def testStems(self):
        
        searcher = IndexSearcher(self.directory)
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            self.porterAnalyzer).parse("laziness")
        topDocs = searcher.search(query, 50)

        self.assertEqual(1, topDocs.totalHits, "lazi")

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            self.porterAnalyzer).parse('"fox jumped"')
        topDocs = searcher.search(query, 50)

        self.assertEqual(1, topDocs.totalHits, "jump jumps jumped jumping")