def displayResults(self, query, sort):

        searcher = IndexSearcher(self.directory, True)

        fillFields = False
        computeMaxScore = False
        docsScoredInOrder = False
        computeScores = True

        collector = TopFieldCollector.create(sort, 20,
                                             fillFields,
                                             computeScores,
                                             computeMaxScore,
                                             docsScoredInOrder)

        searcher.search(query, None, collector)
        scoreDocs = collector.topDocs().scoreDocs

        print "\nResults for:", query, "sorted by", sort
        print "Title".rjust(30), "pubmonth".rjust(10), \
              "id".center(4), "score".center(15)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            title = doc["title"]
            if len(title) > 30:
                title = title[:30]
            print title.encode('ascii', 'replace').rjust(30), \
                  doc["pubmonth"].rjust(10), \
                  str(scoreDoc.doc).center(4), \
                  ("%06f" % (scoreDoc.score)).rjust(12)
            print "  ", doc["category"]
            # print searcher.explain(query, scoreDoc.doc)

        searcher.close()
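A minimal usage sketch (not part of the original listing): the sort argument above is an ordinary Lucene Sort object built from a SortField. The instance name demo and the query below are hypothetical; the old PyLucene (Lucene 2.x/3.x) API used throughout these examples is assumed.

# Hypothetical caller for displayResults(); 'demo' stands in for an instance
# of whatever class defines the method above.
from lucene import Sort, SortField, Term, TermQuery

query = TermQuery(Term("contents", "java"))
demo.displayResults(query, Sort(SortField("pubmonth", SortField.INT)))
# Sort.RELEVANCE or Sort.INDEXORDER could be passed instead of a field sort.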
Example #2
def search(r, keyword=""):
    import logging

    logger = logging.getLogger("search")
    bench = Benchmark(logger)
    from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit
    import lucene, os

    os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17"
    lucene.initVM(lucene.CLASSPATH)

    directory = FSDirectory.open(File(CONFIG.INDEX_PATH))
    ROBOT_INDEX = IndexSearcher(directory, True)
    ROBOT_ANALYZER = StandardAnalyzer()

    keyword = keyword or r.GET["keyword"]
    query = QueryParser("context", ROBOT_ANALYZER)
    query = query.parse('"%s"' % keyword)

    bench.start_mark("search")
    hits = ROBOT_INDEX.search(query)
    count = len(hits)
    result = []
    i = 0
    for hit in hits:
        i += 1
        if i > 100:
            break
        doc = Hit.cast_(hit).getDocument()
        result.append(SearchResult(doc, i, keyword))
    ROBOT_INDEX.close()

    et = bench.stop_mark()

    return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
Example #3
    def testTermRangeQuery(self):

        searcher = IndexSearcher(self.directory, True)
        query = TermRangeQuery("title2", "d", "j", True, True)

        topDocs = searcher.search(query, 100)
        self.assertEqual(3, topDocs.totalHits)
        searcher.close()
Example #4
    def testTermRangeQuery(self):

        searcher = IndexSearcher(self.directory, True)
        query = TermRangeQuery("title2", "d", "j", True, True)

        topDocs = searcher.search(query, 100)
        self.assertEqual(3, topDocs.totalHits)
        searcher.close()
Example #5
    def main(cls, argv):

        if len(argv) != 2:
            print "Usage: BerkeleyDbSearcher <index dir>"
            return

        dbHome = argv[1]

        env = DBEnv()
        env.set_flags(DB_LOG_INMEMORY, 1)
        if os.name == 'nt':
            env.set_cachesize(0, 0x4000000, 1)
        elif os.name == 'posix':
            from commands import getstatusoutput
            if getstatusoutput('uname') == (0, 'Linux'):
                env.set_cachesize(0, 0x4000000, 1)

        env.open(dbHome, (DB_THREAD |
                          DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN), 0)

        index = DB(env)
        blocks = DB(env)
        txn = None

        try:
            txn = env.txn_begin(None)
            index.open(filename = '__index__', dbtype = DB_BTREE,
                       flags = DB_THREAD, txn = txn)
            blocks.open(filename = '__blocks__', dbtype = DB_BTREE,
                        flags = DB_THREAD, txn = txn)
        except:
            if txn is not None:
                txn.abort()
                txn = None
            raise
        else:
            txn.commit()
            txn = None

        try:
            txn = env.txn_begin(None)
            directory = DbDirectory(txn, index, blocks, 0)
            searcher = IndexSearcher(directory, True)

            topDocs = searcher.search(TermQuery(Term("contents", "fox")), 50)
            print topDocs.totalHits, "document(s) found"
            searcher.close()
        except:
            if txn is not None:
                txn.abort()
                txn = None
            raise
        else:
            txn.abort()

            index.close()
            blocks.close()
            env.close()
Example #6
    def getHitCount(self, fieldName, searchString):

        searcher = IndexSearcher(self.dir, True)
        t = Term(fieldName, searchString)
        query = TermQuery(t)
        hitCount = len(searcher.search(query, 50).scoreDocs)
        searcher.close()

        return hitCount
Example #7
    def getHitCount(self, fieldName, searchString):

        searcher = IndexSearcher(self.dir, True)
        t = Term(fieldName, searchString)
        query = TermQuery(t)
        hitCount = len(searcher.search(query, 50).scoreDocs)
        searcher.close()

        return hitCount
def SearchFiles(command):
    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    # print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    rankedfiles = run(searcher, analyzer, command)
    searcher.close()
    return rankedfiles
Example #9
def search(command):
    STORE_DIR = "index"
    vm_env = initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    result = run(searcher, analyzer, command)
    searcher.close()
    return result
Example #10
def Searchfile(command, prior, page, RPP):
    STORE_DIR = "index_ans"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    store = run(searcher, analyzer, command, prior)
    searcher.close()
    start = (page - 1) * RPP
    end = start + RPP

    return store[start:end], len(store)
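A short, hypothetical call illustrating the paging arithmetic above (the prior argument and the underlying run() helper are taken as given from the snippet):

# page = 2 with RPP = 10 returns ranked hits 10..19 plus the total hit count,
# matching start = (page - 1) * RPP above.
page_hits, total = Searchfile("lucene ranking", None, page=2, RPP=10)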
    def testExclusive(self):

        searcher = IndexSearcher(self.directory, True)
        # pub date of TTC was October 1988
        query = NumericRangeQuery.newIntRange("pubmonth",
                                              198805,
                                              198810,
                                              False, False)
        topDocs = searcher.search(query, 100)
        self.assertEqual(0, topDocs.totalHits)
        searcher.close()
    def GET(self):
        form1 = login()
        user_data = web.input()
        vm_env.attachCurrentThread()
        STORE_DIR = "F:\\imgindex"
        directory = SimpleFSDirectory(File(STORE_DIR))
        searcher = IndexSearcher(directory, True)
        analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
        a, b, c, d, e = img_func(user_data.keyword, searcher, analyzer)
        searcher.close()
        return render.img_result(form1, a, b, c, d, e)
Example #13
class LuceneSearch(object):
    def __init__(self):
        STORE_DIR = "index"
        initVM()
        print 'lucene', VERSION
        self.directory = SimpleFSDirectory(File(STORE_DIR))
        print self.directory
        self.searcher = IndexSearcher(self.directory, True)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    def close(self):
        self.searcher.close()
    
    def raw_search(self, query_string):
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            self.analyzer).parse(query_string)
        scoreDocs = self.searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        matches = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            #print 'doc matched = ', dir(doc)
            contents = LuceneDoc.load(doc.get('name'))
            matches.append({'contents' : contents, 'doc' : doc})
        return matches
           
    def search(self, query):
        matches = self.raw_search(query)
        results = ''
        if len(matches) > 0:
            results += str(len(matches))+" results <br/>"
            for match in matches:
                results += '<a href='+str(match['contents']['dealUrl'])+'>'+str(match['contents']['merchant'])+'</a><br />'
                results += '<p>'+str(match['contents']['shortAnnouncementTitle'])+','+str(match['contents']['redemptionLocation'])+'</p><br/>'
        else:
            results = "0 results <br/>"
        return results
        
    def cli_search(self):
        while True:
            print
            print "Hit enter with no input to quit."
            command = raw_input("Query:")
            if command == '':
                return
            matches = self.raw_search(command)
            print
            print "Searching for:", command
            
            for match in matches:
                print match['contents']['dealUrl']
                print match['contents']['merchant'], ',', match['contents']['redemptionLocation'], ', ', match['contents']['div']
                print match['contents']['shortAnnouncementTitle']
                print '-'*80
    def testExclusive(self):

        searcher = IndexSearcher(self.directory, True)
        # pub date of TTC was October 1988
        query = NumericRangeQuery.newIntRange("pubmonth",
                                              Integer(198805),
                                              Integer(198810),
                                              False, False)
        topDocs = searcher.search(query, 100)
        self.assertEqual(0, topDocs.totalHits)
        searcher.close()
Example #15
def begining(command):
    STORE_DIR = "index"
    global vm_env
    vm_env = initVM()
    vm_env.attachCurrentThread()
    #print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = lucene.WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    a = run(command, searcher, analyzer)
    searcher.close()
    return a
Example #16
    def testTerm(self):

        searcher = IndexSearcher(self.directory, True)
        t = Term("subject", "ant")
        query = TermQuery(t)
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs), "JDwA")

        t = Term("subject", "junit")
        scoreDocs = searcher.search(TermQuery(t), 50).scoreDocs
        self.assertEqual(2, len(scoreDocs))

        searcher.close()
Example #17
    def testTerm(self):

        searcher = IndexSearcher(self.directory, True)
        t = Term("subject", "ant")
        query = TermQuery(t)
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs), "JDwA")

        t = Term("subject", "junit")
        scoreDocs = searcher.search(TermQuery(t), 50).scoreDocs
        self.assertEqual(2, len(scoreDocs))

        searcher.close()
Example #18
def search_lucene_index(search_params, index_dir, index_metadata,
                        records_per_page):
    """
    Uses the query term provided to search the disease ontology lucene index
    """
    results = []

    index_dir = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(index_dir)
    index_fields = index_metadata.keys()

    # Since we are paging results we want to grab what page we are on
    page = (int(search_params.get('page', 1))) - 1

    # Doing something pretty hacky here since we are trying to move from
    # 0-based to 1-based indexing to match our pagination display
    offset = int(page) * records_per_page

    # If we are executing an advanced search we will be building a BooleanQuery
    # in parts as opposed to the one MultiFieldQueryParser when doing a basic
    # search
    query = None

    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(search_params,
                                            search_params.get('operator'),
                                            analyzer)
    else:
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, index_fields,
                                       analyzer)
        query = MultiFieldQueryParser.parse(
            parser, process_query_param((search_params.get('q'))))

    # Parse through our hits
    hits = searcher.search(query, 10000)
    total_hits = hits.totalHits
    count = min(hits.totalHits - offset, records_per_page)

    for i in xrange(0, count):
        score_doc = hits.scoreDocs[offset + i]
        doc = searcher.doc(score_doc.doc)
        term_id = doc.get('term id')
        name = doc.get('name')
        explain = searcher.explain(query, score_doc.doc)
        match_fields = get_field_matches(explain.toString(), index_fields)

        results.append((term_id, name, list(match_fields)))

    searcher.close()
    return (results, total_hits)
Example #19
def search_image(command):
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt,
                                                 '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        ts = analyzer.tokenStream(doc.get("contents"), StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
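A hypothetical query string for the little parser above: plain terms are segmented with jieba and go into the contents clause, while a site: token becomes a WildcardQuery on the site field; all clauses are combined with MUST.

# Hypothetical values; restricts the image search to pages from one site.
docs = search_image(u'校园 风景 site:news.tsinghua.edu.cn')
for d in docs[:3]:
    print d["urltitle"], d["imgurl"]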
Example #20
    def _create_pairs(self, inpDF, colname, idd):
        """
        Create Cartesian pairs of matched/similar text records.
        First builds the index (_createIndex), then searches it row by row
        over the pandas DataFrame.
        """

        lookup_dir = {}
        for i, row in inpDF.iterrows():
            if row[colname] not in lookup_dir:
                lookup_dir[row[colname]] = []
            else:
                lookup_dir[row[colname]].append(row[idd])

        pairs = []
        directory = self._createIndex(inpDF, colname)

        searcher = IndexSearcher(directory, True)
        matches = inpDF.apply(
            lambda x: self._searchIndex(searcher, x, colname, idd), axis=1)

        captured_candidates = {}
        for match_pair in matches:
            for matched in match_pair:
                # value_index = inpDF[inpDF[colname] == matched[2]].index.tolist()
                value_index = lookup_dir[matched[2]]
                for cell_index in value_index:
                    if matched[0] != cell_index:

                        rstring = "-".join(
                            sorted([str(matched[0]),
                                    str(cell_index)]))
                        if rstring not in captured_candidates:
                            captured_candidates[rstring] = 1

                            row = []
                            row.append(matched[0])
                            row.append(matched[1])
                            row.append(cell_index)
                            row.append(matched[2])
                            pairs.append(row)

        searcher.close()
        directory.close()

        header = [idd, colname, idd + "_", colname + "_"]
        pairDF = pd.DataFrame(pairs, columns=header)
        return pairDF
Example #21
def run(command):
    if command == '':
        return None
    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()
    
    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parser.setFuzzyMinSim(0.2)
    query = parser.parse(command)
    hits = map(transform, searcher.search(query))
    searcher.close()
    return hits
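Two hypothetical calls showing what the parser configuration above implies (AND as the default operator, fuzzy matching via setFuzzyMinSim):

# Both terms are required because the default operator is AND.
hits = run("apache lucene")
# A trailing '~' makes a term fuzzy; with a minimum similarity of 0.2 even
# loose matches such as misspellings are accepted.
hits = run("lucane~")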
Example #22
    def testCollecting(self):

        query = TermQuery(Term("contents", "junit"))
        searcher = IndexSearcher(self.directory, True)

        collector = BookLinkCollector(searcher)
        searcher.search(query, collector)

        links = collector.getLinks()
        self.assertEqual("java development with ant",
                         links["http://www.manning.com/antbook"])

        scoreDocs = searcher.search(query, 10).scoreDocs
        self.dumpHits(searcher, scoreDocs)

        searcher.close()
Example #23
def search_lucene_index(search_params, index_dir, index_metadata, records_per_page):
    """
    Uses the query term provided to search the disease ontology lucene index
    """
    results = []

    index_dir = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(index_dir)
    index_fields = index_metadata.keys()

    # Since we are paging results we want to grab what page we are on   
    page = (int(search_params.get('page', 1))) - 1

    # Doing something pretty hacky here since we are trying to move from 0-based
    # to 1-based indexing to match our pagination display
    offset = int(page) * records_per_page

    # If we are executing an advanced search we will be building a BooleanQuery
    # in parts as opposed to the one MultiFieldQueryParser when doing a basic
    # search
    query = None
    
    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(search_params, search_params.get('operator'), analyzer)
    else:
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, index_fields, analyzer)
        query = MultiFieldQueryParser.parse(parser, process_query_param((search_params.get('q'))))

    # Parse through our hits
    hits = searcher.search(query, 10000)
    total_hits = hits.totalHits
    count = min(hits.totalHits - offset, records_per_page)

    for i in xrange(0, count):
        score_doc = hits.scoreDocs[offset+i]
        doc = searcher.doc(score_doc.doc)
        term_id = doc.get('term id')
        name = doc.get('name')
        explain = searcher.explain(query, score_doc.doc)
        match_fields = get_field_matches( explain.toString(), index_fields )

        results.append( (term_id, name, list(match_fields)) )

    searcher.close()
    return (results, total_hits)
Example #24
	def doSearch(self,searchString,fieldToSearch,luceneDir):
		searchResult =[]
		store =	SimpleFSDirectory(File(luceneDir))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		searcher = IndexSearcher(store)
		query = QueryParser(Version.LUCENE_CURRENT,fieldToSearch,analyzer).parse(searchString)
		hits = searcher.search(query,self.MAX)
		
		print "Found %d documents that matched the query '%s'" %(hits.totalHits,searchString)
		for hit in hits.scoreDocs:
			doc = searcher.doc(hit.doc)
			#docdict['score'] = hit.score
			#docdict['docid'] = hit.doc
			#docdict['content'] = doc.get("contents").encode("utf-8")
			searchResult.append([doc.get("title").encode("utf-8"),doc.get("contents").encode("utf-8")])
		searcher.close()	
		return searchResult
Example #25
    def delete_old(self, index):
        existing_ids = set([book.id for book in Book.objects.all()])

        reader = IndexReader.open(index.index, False)
        searcher = IndexSearcher(reader)
        try:
            num = searcher.docFreq(Term('is_book', 'true'))
            docs = searcher.search(Search.make_term_query(['true'], 'is_book'), num)
            for result in docs.scoreDocs:
                stored = searcher.doc(result.doc)
                book_id = int(stored.get('book_id'))
                if not book_id in existing_ids:
                    print "book id %d doesn't exist." % book_id
                    index.remove_book(book_id)
        finally:
            searcher.close()
            reader.close()
Example #26
    def testSimple(self):

        class SimpleSimilarity(PythonSimilarity):

            def lengthNorm(_self, field, numTerms):
                return 1.0

            def queryNorm(_self, sumOfSquaredWeights):
                return 1.0

            def tf(_self, freq):
                return freq

            def sloppyFreq(_self, distance):
                return 2.0

            def idfTerms(_self, terms, searcher):
                return 1.0

            def idf(_self, docFreq, numDocs):
                return 1.0

            def coord(_self, overlap, maxOverlap):
                return 1.0

            def scorePayload(_self, docId, fieldName, start, end, payload,
                             offset, length):
                return 1.0

        self.indexSingleFieldDocs([Field("contents", "x", Field.Store.YES,
                                         Field.Index.ANALYZED)])
        searcher = IndexSearcher(self.directory)
        searcher.setSimilarity(SimpleSimilarity())

        query = TermQuery(Term("contents", "x"))
        explanation = searcher.explain(query, 0)
        print explanation

        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))

        self.assertEqual(scoreDocs[0].score, 1.0)
        searcher.close()
Example #27
def a():
    import os
    from lucene import CJKAnalyzer,Hit
    dire = os.path.dirname(__file__) + '/index'

    analyzer = CJKAnalyzer()
    searcher = IndexSearcher(dire)
    query = QueryParser('summary',analyzer).parse('java')#TermQuery(Term("type", "restaurant"))
    sort = Sort(SortField("locatisdon", CategoryComparatorSource('java')))

    hits = searcher.search(query,sort)
    print len(hits)
    i = 0
    for hit in hits:
        i += 1
        if i == 10:
            break
        doc = Hit.cast_(hit).getDocument()
        print 'title:', doc.get("author"), 'name:', doc.get("link")
        print Hit.cast_(hit).getScore()
    searcher.close()
Example #28
    def calculateWeight(self,context):
        #try:
            self.termList = context.termList            
            ramreader = IndexReader.open(context.ramIndex,True)
            store = SimpleFSDirectory(File(context.luceneDir))
            storereader = IndexReader.open(store)
            searcher = IndexSearcher(store)
            ramsearcher = IndexSearcher(context.ramIndex)
            # Number of documents in the collection    
            N = storereader.numDocs()
            # Number of relevant documents            
            R = ramreader.numDocs()
            analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)

            
            for w in self.termList:       
                searchString= "'" + w + "'"
                query = QueryParser(Version.LUCENE_CURRENT,"contents",analyzer).parse(searchString)
                # Number of relevant document having the term
                #r = ramsearcher.docFreq(Term("contents",w))
                hits = ramsearcher.search(query,self.MAX)
                r = hits.totalHits    
                # Number of documents having the term
                #n = searcher.docFreq(Term("contents",w))
                query = QueryParser(Version.LUCENE_CURRENT,context.searchField,analyzer).parse(searchString)
                hits = searcher.search(query,self.MAX)
                n = hits.totalHits
                if (R - r) > 0 and (n - r) > 0 and (N - n - R + r) > 0:
                    # use float division; with Python 2 ints '/' would truncate the ratio
                    weight = (float(r) / (R - r)) / (float(n - r) / (N - n - R + r))
                else:
                    weight = 0
                if weight > self.weightThreshold:            
                    self.gramList.append([w,weight])
            searcher.close()
            ramsearcher.close()
            storereader.close()
            ramreader.close()
        #except Exception,e:
        #    print 'error',e
            
Example #29
def similar(command, docno):
    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()
    
    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parser.setFuzzyMinSim(0.2)
    query = parser.parse(command)
    hits = searcher.search(query)
    document = hits.id(docno)

    ir = IndexReader.open(STORE_DIR)
    mlt = MoreLikeThis(ir)
    mlt.setFieldNames(['name', 'contents'])
    mlt.setMinWordLen(2)
    mlt.setBoost(True)
    query = mlt.like(document)
    hits = map(transform, searcher.search(query))
    searcher.close()
    return hits
Example #30
File: searcher.py Project: fay/wt
class Searcher(object):
    def __init__(self):
        self.searcher = IndexSearcher(STORE_DIR)
        self.analyzer = CJKAnalyzer()
        self.catfilter = CatFilter()
        
    def __del__(self):
        self.searcher.close()
        
    def search(self, query,category_id=None):
        SHOULD = BooleanClause.Occur.SHOULD
        #MultiFieldQueryParser.setOperator(QueryParser.DEFAULT_OPERATOR_AND);
        parser1 = QueryParser('summary',self.analyzer)
        parser2 = QueryParser('title',self.analyzer)        
        parser1.setDefaultOperator(QueryParser.AND_OPERATOR)
        parser2.setDefaultOperator(QueryParser.AND_OPERATOR)
        q1 = parser1.parse(query)
        q2 = parser2.parse(query)
        boolQuery = BooleanQuery()
        boolQuery.add(q1,SHOULD)
        boolQuery.add(q2,SHOULD)
        
        #camp = CategoryComparatorSource(query)
        #sortfield = SortField("link", camp)
        #sort = Sort(sortfield)
        if category_id:
            self.catfilter.query = query
            self.catfilter.category_id = category_id
            hits = self.searcher.search(boolQuery,self.catfilter)
        else:
            hits = self.searcher.search(boolQuery)
        return hits
    def search_by_field(self,query,field='summary'):
        parser = QueryParser(field,self.analyzer)
        parser.setDefaultOperator(QueryParser.AND_OPERATOR)
        q = parser.parse(query)
        return self.searcher.search(q)
Example #31
class tfidf(Command):
    """
    """
    def __init__(self):
        """
        """
        # Number of documents
        self.N = 0 
        # Number of terms
        self.m = 0
        # Individual terms
        self.unigramList = None
        
    def process(self,context):
        self.unigramList = context.termList
        self.ramreader = IndexReader.open(context.ramIndex,True)
        self.ramsearcher = IndexSearcher(context.ramIndex)
        self.N = self.ramreader.numDocs()
        self.m = len(self.unigramList)
        self.createTermDocumentMatrix()
        self.ramsearcher.close()
        self.ramreader.close()
        context.termdocumentmatrix = self.termdocumentMatrix
        print 'finished creating term document matrix'
        self.context = context
        
        
    def createTermDocumentMatrix(self):
        self.termdocumentMatrix = np.zeros((self.m, self.N), dtype=int)
        analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
        for index, word in enumerate(self.unigramList):
            searchString = "'" + word[0] + "'"
            query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(searchString)
            hits = self.ramsearcher.search(query, self.N)
            for hit in hits.scoreDocs:
                self.termdocumentMatrix[index, hit.doc] = hits.totalHits
def retrieve(string, tweetID):
    global eventNum
    global eventDict
    global eventList
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        searcher = IndexSearcher(dir)
    except lucene.JavaError:
        # print 'Inside First Except'
        eventDict[tweetID] = eventNum
        eventNum = eventNum + 1
        analyzer.close()
        return
    try:
        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(string)
        # e = sys.exc_info()[0]
        # print e
        MAX = 2
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print 'total hits'
        # print hits.totalHits
        if hits.totalHits > 0:
            eventDict[tweetID] = eventDict[hits.scoreDocs[0].doc]
            analyzer.close()
            searcher.close()
            return
        else:
            # print '-----------'
            # print tweetID
            eventDict[tweetID] = eventNum
            eventNum = eventNum + 1
            analyzer.close()
            searcher.close()
            return

        # for hit in hits.scoreDocs:
        #     print hit.score, hit.doc, hit.toString()
        #     doc = searcher.doc(hit.doc)
        #     print doc.get("text").encode("utf-8")
    except lucene.JavaError:
        eventDict[tweetID] = eventNum
        eventNum = eventNum + 1
        analyzer.close()
        searcher.close()
        return
Example #33
def do_invite(keywords):
    print "invite started!>>>>>>"
    initVM()
    indexDir  = "/tmp/luceneindex/doc"
    version   = Version.LUCENE_CURRENT
    idxDir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(version)
    searcher = IndexSearcher(idxDir)
    query = QueryParser(version, "description", analyzer).parse(keywords)
    hits = searcher.search(query, 1000)
    indentCandidates = []

    #print len(hits.scoreDocs)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
        #print doc.get("description")

        intents = doc.get("intent")
        #print doc.get("url")
        if intents == None:
            continue
        intents = eval(intents)

        for intent in intents:
            indentCandidates.append(intent)

    searcher.close()
    inviteEmails = []
    #patterns = ["[^A-Za-z0-9_-]*(?P<buzz>([A-Za-z0-9_-]+(\.\w+)*@(\w+\.)+\w{2,3}))", '''qq[^\d]*(?P<buzz>[1-9][0-9]{4,})''']
    for indentCandidate in indentCandidates:
        #print repr(indentCandidate[0])

        emailCandidate = indentCandidate[0]
        if emailCandidate.find("@") == -1:
            qqMail = emailCandidate+"@qq.com"
            inviteEmails.append(qqMail)
        else:
            inviteEmails.append(emailCandidate)
        # remove useless intent
        #indentCandidate = indentCandidate.strip()

        #probability = bayes.checkneedprobability((indentCandidate).decode("ascii","ignore").encode("utf8"))
        #print probability
        #if (probability>0.5):
        #    continue
        #emailPattern = patterns[0]
        #qqPattern = patterns[1]
        #qqMatches =re.finditer(qqPattern,indentCandidate, re.IGNORECASE| re.DOTALL)
        #emailMatches = re.finditer(emailPattern,indentCandidate, re.IGNORECASE| re.DOTALL)

        #for qqMatch in qqMatches:
        #    qq = qqMatch.group("buzz").strip()
        #print qq
        #    qqMail = qq+"@qq.com"
        #    inviteEmails.append(qqMail)

        #for emailMatch in emailMatches:
        #    email = emailMatch.group("buzz").strip()
            #print email

        #    inviteEmails.append(email)


    #add haiming and rex mail

    #remove multipule emails

    toInviteEmails = set(inviteEmails)
    toInviteEmails.add("*****@*****.**")
    toInviteEmails.add("*****@*****.**")
    toInviteEmails.add("*****@*****.**")
    print "invite total" +str(len(toInviteEmails))+" emails >>>>>>>>>>>"
    #for mail_address in toInviteEmails:
       # print mail_address
    return toInviteEmails
Example #34
        print "Searching for:", command

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        # The analyzer performs tokenization and linguistic processing of the query string.
        # QueryParser parses the query string into a query syntax tree and wraps it in a Query.
        scoreDocs = searcher.search(query, 50).scoreDocs
        # IndexSearcher.search runs the Query against the index and returns the results.
        print "%s total matching documents." % len(scoreDocs), '\n'

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print "------------------------"
            print 'path:', doc.get("path")
            print 'title:', doc.get("title")
            print 'url:', doc.get("url")
            print 'name:', doc.get("name")


if __name__ == '__main__':
    STORE_DIR = "index"
    initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))  # location of the index files on disk
    searcher = IndexSearcher(directory, True)  # open the index and create an IndexSearcher ready to search
    analyzer = lucene.WhitespaceAnalyzer(
        Version.LUCENE_CURRENT
    )  # the analyzer tokenizes and processes the query string; same analyzer as in IndexFiles.py
    run(searcher, analyzer)
    searcher.close()
def queryenhancement(query, swords, k, fList, path, TotalN):
    if not os.path.isdir(path):
        print("No directory named %s" % os.path.abspath(path))
        return
    if len(fList) == 0:
        return ""
    wordcount = {}
    N = len(fList)
    # def length of docs
    doclen = np.zeros(N)
    # def docs query
    docquery = np.zeros(N)
    # proximity to query word
    proxword = {}
    # tokenize query
    queryword = re.findall('[0-9a-zA-Z]+', query)
    for i in range(N):
        fname = fList[i]
        fp = open(path + "/" + str(fname))
        # tokenize doc
        docword = [s.lower() for s in re.findall("[0-9a-zA-Z]+", fp.read())]
        # length of docs
        doclen[i] = len(docword)  # l(D)
        # store the location of query words
        queryloc = []
        for j in range(len(docword)):
            # occurence of words
            if docword[j] not in wordcount:
                wordcount[docword[j]] = np.zeros(N)
                wordcount[docword[j]][i] = 1
            else:
                wordcount[docword[j]][i] += 1
            # record token location
            if docword[j].lower() in queryword:
                docquery[i] += 1  # m(Q,D)
                queryloc.append(j)
        # keep only in-bounds positions within +/-5 of each query-word occurrence
        proxanc = [
            filter(lambda y: 0 <= y < len(docword), range(x - 5, x + 6))
            for x in queryloc
        ]
        for ancL in proxanc:
            for anc in ancL:
                if docword[anc] not in proxword:
                    proxword[docword[anc]] = np.zeros(N)
                    proxword[docword[anc]][i] = 1
                else:
                    proxword[docword[anc]][i] += 1
        fp.close()
    # exclude stopwords and query words
    for w in wordcount.keys():
        if w in queryword or w in swords:
            wordcount.pop(w)  # c(W,D)
    for w in proxword.keys():
        if w in queryword or w in swords:
            proxword.pop(w)
    # align proximity measurement and word frequency
    for w in wordcount.keys():
        if w not in proxword:
            proxword[w] = np.zeros(N)  # f(W,Q,D)
    rwordocc = {w: np.sum(wordcount[w] != 0) for w in wordcount}  # z(W,Q)

    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    ireader = IndexSearcher(directory, True)
    docfreq = {
        w: ireader.docFreq(Term("contents", w.lower()))
        for w in wordcount
    }  # g(W)
    ireader.close()
    # label the words
    wordtag = np.array(wordcount.keys())
    wordlabel = {x: word for x, word in enumerate(wordtag)}

    score1 = np.zeros([len(wordcount), N])
    score2 = np.zeros([len(wordcount), N])
    y = np.zeros(len(wordcount))
    for i in range(len(wordcount)):
        w = wordlabel[i]
        score1[i] = proxword[w]
        score2[i] = wordcount[w]
        y[i] = np.max([0, (rwordocc[w] * 1. / N - 2. * docfreq[w] / TotalN)])
    score1 *= 1. / docquery
    score2 *= 1. / doclen
    score1 = np.sqrt(score1)
    score2 = np.sqrt(score2)
    vals = np.sum(score1 + (y * score2.T).T, axis=1)
    idx = np.argsort(vals)[::-1]
    print vals[idx[:100]]
    print docquery
    ret = ""
    count = 0
    for i in range(len(idx)):
        # if wordtag[idx[i]] not in query and wordtag[idx[i]] not in swords:
        ret += " " + wordtag[idx[i]]
        count += 1
        if count < k:
            continue
        else:
            break
    # print ret
    return ret
Example #36
class SynonymAnalyzerTest(TestCase):

    synonymAnalyzer = SynonymAnalyzer(MockSynonymEngine())

    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, self.synonymAnalyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("content", "The quick brown fox jumps over the lazy dogs",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(self.directory, True)

    def tearDown(self):

        self.searcher.close()

    def testJumps(self):

        stream = self.synonymAnalyzer.tokenStream("contents",
                                                  StringReader("jumps"))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)

        i = 0
        expected = ["jumps", "hops", "leaps"]
        while stream.incrementToken():
            self.assertEqual(expected[i], term.term())
            if i == 0:
                expectedPos = 1
            else:
                expectedPos = 0

            self.assertEqual(expectedPos, posIncr.getPositionIncrement())
            i += 1

        self.assertEqual(3, i)

    def testSearchByAPI(self):

        tq = TermQuery(Term("content", "hops"))
        topDocs = self.searcher.search(tq, 50)
        self.assertEqual(1, topDocs.totalHits)

        pq = PhraseQuery()
        pq.add(Term("content", "fox"))
        pq.add(Term("content", "hops"))
        topDocs = self.searcher.search(pq, 50)
        self.assertEquals(1, topDocs.totalHits)

    def testWithQueryParser(self):

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            self.synonymAnalyzer).parse('"fox jumps"')
        topDocs = self.searcher.search(query, 50)
        # in Lucene 1.9, position increments are no longer ignored
        self.assertEqual(1, topDocs.totalHits, "!!!! what?!")

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            StandardAnalyzer(
                                Version.LUCENE_CURRENT)).parse('"fox jumps"')
        topDocs = self.searcher.search(query, 50)
        self.assertEqual(1, topDocs.totalHits, "*whew*")

    def main(cls):

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            cls.synonymAnalyzer).parse('"fox jumps"')
        print "\"fox jumps\" parses to ", query.toString("content")

        print "From AnalyzerUtils.tokensFromAnalysis: "
        AnalyzerUtils.displayTokens(cls.synonymAnalyzer, "\"fox jumps\"")
        print ''

    main = classmethod(main)
Example #37
File: plush.py Project: jjguy/plush
class PlushBase:
    """Base class."""

    def __init__(self, store_dir=None, verbose=False):
        self.verbose = verbose
        self.store_path = None
        self.searcher = None
        self.index_reader = None
        self.directory = None
        self.analyzers = {}
        self.initAnalyzers()
        self.default_analyzer_id = 'Simple'
        self.fields = []
        self._connected = False
        if store_dir:
            self.openStore(store_dir)

    def __del__(self):
        self.closeStore()
        self._connected = False

    def initDummyStore(self, directory):
        """Open a dummy ramdirectory for testing."""
        writer = IndexWriter(directory, SimpleAnalyzer(), True)
        doc = Document()
        doc.add(Field("name", 'dummy.txt', Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
        doc.add(Field("path", '/path/to/dummy.txt', Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
        doc.add(Field("path", '/path/to/another/dummy.txt', Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
        doc.add(Field("contents", "foo dummy bar", Field.Store.YES,
                      Field.Index.TOKENIZED))
        writer.addDocument(doc)
        writer.optimize()
        writer.close()

    def openStore(self, store_dir):
        """Open a lucene store."""
        if self._connected:
            self.closeStore()
        if store_dir == 'dummy':
            directory = RAMDirectory()
            self.initDummyStore(directory)
            store_path = store_dir
        else:
            store_path = os.path.abspath(store_dir)
            try:
                directory = SimpleFSDirectory(File(store_path)) #TODO , False)
            except JavaError:
                print "Error: %s Not found." % store_path
                return
        try:
            self.searcher = IndexSearcher(directory)
        except JavaError:
            print "Error: '%s' is not a valid lucene store." % store_path
            return
        print 'Opening store: %s' % store_path
        self.directory = directory
        self.store_path = store_path
        # TODO - TaxonomyReader??
        self.index_reader = IndexReader.open(directory)
        self.fields = self.getFieldNames()
        self.fields.sort()
        self._connected = True

    def closeStore(self):
        """Close a lucene store."""
        if self.searcher is not None:
            if self.verbose:
                print "Close searcher."
            self.searcher.close()
            self.directory = None
            self.searcher = None
            self.index_reader = None
            self.fields = []
            self.store_path = None
        self._connected = False

    def maxDoc(self):
        """Maximum doc number."""
        return self.index_reader.maxDoc()

    def numDocs(self):
        """Number of docs in the store."""
        return self.index_reader.numDocs()

    def getFieldNames(self):
        """Return a unique list of field names that exist in this index."""
        fields = {}
        terms  = self.index_reader.terms()
        while terms.next():
            fields[terms.term().field()] = True
        return fields.keys()

        # TODO
        #if VERSION.startswith('1.9'):
        #    return self.index_reader.getFieldNames()
        #return self.index_reader.getFieldNames(IndexReader.FieldOption.ALL)

    def getFields(self, doc_num=None):
        """Return fields of a doc."""
        if doc_num is None:
            doc_num = self.maxDoc() - 1
        doc = self.index_reader.document(doc_num)
        return doc.fields()

    def getDoc(self, doc_num=None):
        """Return a lucene doc."""
        if doc_num is None:
            doc_num = self.maxDoc() - 1
        return self.index_reader.document(doc_num)

    def getFieldInfos(self, doc_num=None):
        """Return fields description.

        [(name, stored, index, token, binary, compressed), ...]"""
        fields = []
        doc = self.getDoc(doc_num)
        for name in self.fields:
            # TODO - this form of getFields() is deprecated 
            mfields= doc.getFields(name)
            if not mfields:
                fields.append((name, False, False, False, False, False,
                               'N/A'))
                continue
            for field in mfields:
                fields.append((field.name(), field.isStored(),
                               field.isIndexed(),
                               field.isTokenized(), field.isBinary(),
                               False, field.stringValue()))
                               #TODO
                               #field.isCompressed(), field.stringValue()))

        return fields

    def search(self, command, field_id="contents", sort_on=None,
               sort_order=False, analyzer_id=None):
        """Do the lucene search."""
        analyzer = self.getAnalyzer(analyzer_id)
        try:
            if VERSION.startswith('1.9'):
                query = QueryParser.parse(command, field_id, analyzer)
            else:
                query = QueryParser(field_id, analyzer).parse(command)
        except JavaError:
            print "Error: Lucene cannot parse this query."
            return None
        if sort_on:
            return self.searcher.search(query, Sort(sort_on, sort_order))
        return self.searcher.search(query)

    def getTermFreqs(self, field=None, max_term=None, pattern=None):
        """Return a list ([(num occurence, term), ..], number of term)"""
        item = self.index_reader.terms()
        min_freq = 0
        freqs = []
        if max_term:
            limit = max_term
        else:
            limit = 1000
        if pattern is not None:
            try:
                pat = re.compile(pattern)
            except re.error:  # re.compile raises re.error for an invalid pattern
                print "Error: '%s' is an invalid regex" % pattern
                return [], 0
        count = 0
        while(item.next()):
            term = item.term()
            if field and term.field() != field:
                continue
            count += 1
            word = term.text()
            freq = item.docFreq()
            if pattern is not None and not pat.search(word):
                continue
            if len(freqs) >= limit and freq < min_freq:
                continue
            freqs.append((-1 * freq, word))
            freqs.sort()
            if len(freqs) > limit:
                freqs.pop()
            min_freq = freqs[0][0]
        item.close()
        freqs = [(-1*freq, word) for freq, word in freqs]
        return freqs, count

    def initAnalyzers(self):
        """Init all analyzer."""
        # TODO
        #self.analyzers['French'] = FrenchAnalyzer()
        #self.analyzers['German'] = GermanAnalyzer()
        self.analyzers['Keyword'] = KeywordAnalyzer(Version.LUCENE_CURRENT)
        self.analyzers['Simple'] = SimpleAnalyzer(Version.LUCENE_CURRENT)
        self.analyzers['Stop'] = StopAnalyzer(Version.LUCENE_CURRENT)
        self.analyzers['Standard'] = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzers['Whitespace'] = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        nxlucene_home = os.getenv('NXLUCENE_HOME', None)
        if nxlucene_home:
            # point to http://svn.nuxeo.org/pub/NXLucene/trunk/src/nxlucene
            nxlucene_home = os.path.normpath(nxlucene_home)
            sys.path.append(nxlucene_home)
            try:
                from analysis import analyzers_map
            except ImportError:
                print "Error: Invalid NXLUCENE_HOME %s" % nxlucene_home
                return
            for key, value in analyzers_map.items():
                self.analyzers['nx' + key] = value
            print "NXLucene analyzers loaded."

    def getAnalyzer(self, analyzer_id=None):
        """Return an analyzer or default."""
        if analyzer_id is None:
            analyzer_id = self.default_analyzer_id
        return self.analyzers.get(analyzer_id)

    def displayAnalyzedQuery(self, text, field_name, analyzer_id=None):
        """Print analyzed tokens."""
        analyzer = self.getAnalyzer(analyzer_id)
        tokens = [token.termText() for token in analyzer.tokenStream(field_name, StringReader(text))]
        print "  %s analyzer tokens: %s" % (analyzer_id or self.default_analyzer_id, ", ".join(tokens) )
Example #38
def search2():
    results0=[]
    results2=[]
    loc=[]
    sr=''
    tmp='' 
    mark=False
    if request.method == 'POST':
        try:                        
            initVM()       
            directory = SimpleFSDirectory(File('albumIndex'))
            searcher2 = IndexSearcher(directory, True)
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

            if "Search" in request.form.values():
                sr=request.form['text']
                
            elif "Shuffle" in request.form.values():
                mark=True
                while len(loc)<20:
                    tmp=random.randint(0,searcher2.maxDoc()-1)
                    if tmp not in loc:
                        loc+=[tmp]
                

            if mark:
                print 'loc=',loc
                ct=0
                for i in loc:
                    doc = searcher2.doc(i)
                    songs=doc.get('albumsongs')
                    songs=songs.split('!@#$%')
                    urls=doc.get("albumsongURLs")
                    urls=urls.split('!@#$%')
                    results2+=[{'albumnum': doc.get("albumnum"),\
                               'albumname':doc.get('albumname'),\
                                'albumartist':doc.get('albumartist'),\
                               'albumintro': doc.get("albumintro"),\
                               'albumsongs':songs,\
                               'albumsongURLs': urls,\
                               'albumpicURL':doc.get('albumpicURL'),\
                                'albumartistURL':doc.get('albumartistURL'),\
                                'albumURL':doc.get('albumURL'),\
                               'rank':100}]
            else:
                print  request.form.values()
                print 'sr=',sr
                if sr=='':
                    return results0,results2,""
##                for i in sr:
##                    tmp+=i+" "
##                print tmp
                scoreDocs=run2(searcher2, analyzer,sr,1) #search exact album

                if scoreDocs!=False:
                    doc=scoreDocs
                    songs=doc.get('albumsongs')
                    songs=songs.split('!@#$%')
                    urls=doc.get("albumsongURLs")
                    urls=urls.split('!@#$%')
                    results2+=[{'albumnum': doc.get("albumnum"),\
                               'albumname':doc.get('albumname'),\
                                'albumartist':doc.get('albumartist'),\
                               'albumintro': doc.get("albumintro"),\
                               'albumsongs':songs,\
                               'albumsongURLs': urls,\
                               'albumpicURL':doc.get('albumpicURL'),\
                                'albumartistURL':doc.get('albumartistURL'),\
                                'albumURL':doc.get('albumURL'),\
                               'rank':100}] 
                    results0=results2
                else:
                    scoreDocs=run2(searcher2, analyzer,sr,20) #search 20 albums
                    rank=100
                    for scoreDoc in scoreDocs:
                        doc = searcher2.doc(scoreDoc.doc)
                        songs=doc.get('albumsongs')
                        songs=songs.split('!@#$%')
                        urls=doc.get("albumsongURLs")
                        urls=urls.split('!@#$%')
                        results2+=[{'albumnum': doc.get("albumnum"),\
                               'albumname':doc.get('albumname'),\
                                'albumartist':doc.get('albumartist'),\
                               'albumintro': doc.get("albumintro"),\
                               'albumsongs':songs,\
                               'albumsongURLs': urls,\
                               'albumpicURL':doc.get('albumpicURL'),\
                                'albumartistURL':doc.get('albumartistURL'),\
                                'albumURL':doc.get('albumURL'),\
                                    'rank':rank}]
                        rank-=5
            conn = MySQLdb.connect(host='localhost', user='******',passwd='1234',charset="utf8") 
            # conn = MySQLdb.connect(host='localhost', user='******',passwd='ee208',charset="utf8")
            conn.select_db('coversearch');
            cursor = conn.cursor()
            
            for i in results2:
                try:
                    cursor.execute("select zan from albums where id="+i['albumnum'])
                    zan=cursor.fetchone()[0]
                    i['zan']=zan
                    i['rank']+=int(zan)
                except:
                    i['zan']=0
            results2.sort(key=lambda x:x['rank'],reverse=True)
            results2=resortalbum(results2,sr)
            conn.commit()
            cursor.close() 
            conn.close()
            searcher2.close()
        except Exception,e:
            print 2,e
Example #39
def search():
    results=[]
    results3=[]
    loc=[]
    sr=''
    tmp='' 
    mark=False
    if request.method == 'POST':
        try:                        
            initVM()       
            directory = SimpleFSDirectory(File("songIndex"))
            searcher = IndexSearcher(directory, True)
            directory = SimpleFSDirectory(File("artistIndex"))
            searcher3 = IndexSearcher(directory, True)
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

            if "Search" in request.form.values():
                sr=request.form['text']
                
            elif "Shuffle" in request.form.values():
                mark=True
                while len(loc)<20:
                    tmp=random.randint(0,searcher.maxDoc()-1)
                    if tmp not in loc:
                        loc+=[tmp]
                
##            if request.form['action']=="Search":
##                sr=request.form['text']
##            elif request.form['action']=="Shuffle":
##                sr='1'


            if mark:
                print 'loc=',loc
                for i in loc:
                    doc = searcher.doc(i)
                    results+=[{'songname':doc.get("songname"),\
                               'songurl':doc.get('songurl'),\
                               'albumname':doc.get('songalbum'),\
                               'songartist':doc.get('songartist'),\
                               'albumurl': doc.get("songalbumURL"),\
                               'picPath':doc.get('songpicURL'),\
                               }]                   
            else:
                print  request.form.values()
                print 'sr=',sr
                if sr=='':
                    return results,results3,""
                for i in sr:
                   tmp+=i+" "
##                print tmp
##                scoreDocs=run2(searcher2, analyzer,sr)
##                if len(scoreDocs)!=0:
##                    doc=searcher2.doc(scoreDocs[0].doc)
##                    results2+=[{'albumnum:', doc.get("albumnum"),\
##                               'albumname:',doc.get('albumname'),\
##                                'albumartist:',doc.get('albumartist'),\
##                               'albumintro:', doc.get("albumintro"),\
##                               'albumsongs:',doc.get('albumsongs'),\
##                               'albumsongURLs:', doc.get("albumsongURLs"),\
##                               'albumpicURL:',doc.get('albumpicURL')}]
##                else:
                scoreDocs=run3(searcher3,analyzer,sr)
                if scoreDocs == False:
                    scoreDocs=run(searcher, analyzer,sr)
                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        results+=[{'songname':doc.get("songname"),\
                                   'songurl':doc.get('songurl'),\
                                   'albumname':doc.get('songalbum'),\
                                   'songartist':doc.get('songartist'),\
                                   'albumurl': doc.get("songalbumURL"),\
                                   'picPath':doc.get('songpicURL')\
                                   }]
                else:
                    doc=scoreDocs
                    singeralbums=doc.get('singeralbums')
                    singeralbums=singeralbums.split('!@#$%')
                    singeralbumURLs=doc.get("singeralbumURLs")
                    singeralbumURLs=singeralbumURLs.split('!@#$%')
                    results3+=[{'singername': doc.get("singername"),\
                                'singerplace':doc.get('singerplace'),\
                                'singerintro':doc.get('singerintro'),\
                                'singeralbums': singeralbums,\
                               'singeralbumURLs':singeralbumURLs,\
                                'singerpicURL': doc.get("singerpicURL")\
                                }]
            searcher.close()
        except Exception,e:
            print 1,e
Example #40
            t =c_buffer(strlen*6)
            bSuccess = dll.ICTCLAS_ParagraphProcess(c_char_p(command),c_int(strlen),t,c_int(0),0)
            command=t.value.decode('gbk').encode('utf8')
            ##list=t.value.split()
            ##print ' '.join(list)
            dll.ICTCLAS_Exit()
            command=command.decode('utf8')
        if command == '':
            return
        print
        print "Searching for:", command
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'title:', doc.get("title"), 'url:', doc.get("url"), 'name:', doc.get("name")


if __name__ == '__main__':
    STORE_DIR = "F:\\index"
    initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer)
    searcher.close()
Example #41
class SynonymAnalyzerTest(TestCase):

    synonymAnalyzer = SynonymAnalyzer(MockSynonymEngine())

    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, self.synonymAnalyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(Field("content",
                      "The quick brown fox jumps over the lazy dogs",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(self.directory, True)

    def tearDown(self):

        self.searcher.close()

    def testJumps(self):

        stream = self.synonymAnalyzer.tokenStream("contents",
                                                  StringReader("jumps"))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)

        i = 0
        expected = ["jumps", "hops", "leaps"]
        while stream.incrementToken():
            self.assertEqual(expected[i], term.term())
            if i == 0:
                expectedPos = 1
            else:
                expectedPos = 0

            self.assertEqual(expectedPos, posIncr.getPositionIncrement())
            i += 1

        self.assertEqual(3, i)

    def testSearchByAPI(self):

        tq = TermQuery(Term("content", "hops"))
        topDocs = self.searcher.search(tq, 50)
        self.assertEqual(1, topDocs.totalHits)

        pq = PhraseQuery()
        pq.add(Term("content", "fox"))
        pq.add(Term("content", "hops"))
        topDocs = self.searcher.search(pq, 50)
        self.assertEquals(1, topDocs.totalHits)

    def testWithQueryParser(self):

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            self.synonymAnalyzer).parse('"fox jumps"')
        topDocs = self.searcher.search(query, 50)
        # in Lucene 1.9, position increments are no longer ignored
        self.assertEqual(1, topDocs.totalHits, "!!!! what?!")

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            StandardAnalyzer(Version.LUCENE_CURRENT)).parse('"fox jumps"')
        topDocs = self.searcher.search(query, 50)
        self.assertEqual(1, topDocs.totalHits, "*whew*")

    def main(cls):

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            cls.synonymAnalyzer).parse('"fox jumps"')
        print "\"fox jumps\" parses to ", query.toString("content")

        print "From AnalyzerUtils.tokensFromAnalysis: "
        AnalyzerUtils.displayTokens(cls.synonymAnalyzer, "\"fox jumps\"")
        print ''
        
    main = classmethod(main)