Example #1
    def __init__(self, directory):
        self.directory = directory
        # create Directories for the search index and for the taxonomy index,
        # either in RAM or on disk
        #indexDir = RAMDirectory()
        #taxoDir = RAMDirectory()
        self.indexDir = FSDirectory.open(
            File(os.path.join(self.directory, 'Index')))
        self.taxoDir = FSDirectory.open(
            File(os.path.join(self.directory, 'Taxonomy')))
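The commented-out lines point at the in-memory alternative; a minimal sketch of that variant, assuming PyLucene's flat lucene namespace and that initVM() has already been called (both assumptions, not shown in the snippet):

from lucene import RAMDirectory

# keep both the search index and the taxonomy index purely in memory,
# e.g. for tests, instead of the on-disk FSDirectory used above
indexDir = RAMDirectory()
taxoDir = RAMDirectory()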
Example #2
def search(r, keyword=""):
    import logging

    logger = logging.getLogger("search")
    bench = Benchmark(logger)
    from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit
    import lucene, os

    os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17"
    lucene.initVM(lucene.CLASSPATH)

    directory = FSDirectory.open(File(CONFIG.INDEX_PATH))
    ROBOT_INDEX = IndexSearcher(directory, True)
    ROBOT_ANALYZER = StandardAnalyzer()

    keyword = keyword or r.GET["keyword"]
    query = QueryParser("context", ROBOT_ANALYZER)
    query = query.parse('"%s"' % keyword)

    bench.start_mark("search")
    hits = ROBOT_INDEX.search(query)
    count = len(hits)
    result = []
    i = 0
    for hit in hits:
        i += 1
        if i > 100:
            break
        doc = Hit.cast_(hit).getDocument()
        result.append(SearchResult(doc, i, keyword))
    ROBOT_INDEX.close()

    et = bench.stop_mark()

    return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
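The view above leans on project-level helpers that are not part of this listing (CONFIG, Benchmark, SearchResult, and Django's render_to_response). A hypothetical stand-in for SearchResult, just to make the result loop concrete:

class SearchResult(object):
    # hypothetical placeholder: wraps one matching document for the template
    def __init__(self, doc, rank, keyword):
        self.rank = rank                     # 1-based position in the hit list
        self.keyword = keyword               # the term that was searched for
        self.context = doc.get("context")    # the stored field queried above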
Example #3
    def main(cls, argv):

        if len(argv) < 5:
            print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
            return
            
        docsInIndex = int(argv[1])

        # create an index called 'index-dir' in a temp directory
        indexDir = os.path.join(tempfile.gettempdir(),
                                'index-dir')
        dir = FSDirectory.open(indexDir)
        analyzer = SimpleAnalyzer()
        writer = IndexWriter(dir, analyzer, True)

        # set variables that affect speed of indexing
        writer.setMergeFactor(int(argv[2]))
        writer.setMaxMergeDocs(int(argv[3]))
        writer.setMaxBufferedDocs(int(argv[4]))
        # writer.infoStream = tempfile.out

        print "Merge factor:  ", writer.getMergeFactor()
        print "Max merge docs:", writer.getMaxMergeDocs()
        print "Max buffered docs:", writer.getMaxBufferedDocs()

        start = time()
        for i in xrange(docsInIndex):
            doc = Document()
            doc.add(Field("fieldname", "Bibamus",
                          Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)

        writer.close()
        print "Time: ", timedelta(seconds=time() - start)
Example #4
    def main(cls, argv):

        if len(argv) < 5:
            print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
            return

        docsInIndex = int(argv[1])

        # create an index called 'index-dir' in a temp directory
        indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                                'index-dir')
        dir = FSDirectory.getDirectory(indexDir, True)
        analyzer = SimpleAnalyzer()
        writer = IndexWriter(dir, analyzer, True)

        # set variables that affect speed of indexing
        writer.setMergeFactor(int(argv[2]))
        writer.setMaxMergeDocs(int(argv[3]))
        writer.setMaxBufferedDocs(int(argv[4]))
        # writer.infoStream = System.out

        print "Merge factor:  ", writer.getMergeFactor()
        print "Max merge docs:", writer.getMaxMergeDocs()
        print "Max buffered docs:", writer.getMaxBufferedDocs()

        start = time()
        for i in xrange(docsInIndex):
            doc = Document()
            doc.add(
                Field("fieldname", "Bibamus", Field.Store.YES,
                      Field.Index.TOKENIZED))
            writer.addDocument(doc)

        writer.close()
        print "Time: ", timedelta(seconds=time() - start)
Example #5
    def __init__(self, user_loc_string, debug=False):
        analyzer = StopAnalyzer()
        fields = ["name", "alternate_names", "state"]
        directory = FSDirectory.getDirectory("index")

        self.DEBUG = debug
        self.searcher = IndexSearcher(directory)
        self.parser = MultiFieldQueryParser(fields, analyzer)
        self.user_location = self.doSearch(user_loc_string)
Example #6
    def __init__(self, user_loc_string, debug=False):
        analyzer = StopAnalyzer()
        fields = ['name', 'alternate_names', 'state']
        directory = FSDirectory.getDirectory("index")

        self.DEBUG = debug
        self.searcher = IndexSearcher(directory)
        self.parser = MultiFieldQueryParser(fields, analyzer)
        self.user_location = self.doSearch(user_loc_string)
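Both of the previous two snippets defer to a doSearch helper that the listing does not include; a hypothetical sketch of it, assuming it should return the top-scoring Document for the user's location string (or None when nothing matches):

    def doSearch(self, query_string):
        # hypothetical helper: parse the location string against all three
        # fields and return the best matching Document, if any
        query = self.parser.parse(query_string)
        hits = self.searcher.search(query)
        if hits.length() == 0:
            return None
        return hits.doc(0)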
Example #7
    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        store = FSDirectory.getDirectory(storeDir, True)
        writer = IndexWriter(store, analyzer, True)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(root, writer)
        print 'optimizing index',
        writer.optimize()
        writer.close()
        print 'done'
Example #8
    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        store = FSDirectory.getDirectory(storeDir, True)
        writer = IndexWriter(store, analyzer, True)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(root, writer)
        print 'optimizing index',
        writer.optimize()
        writer.close()
        print 'done'
Example #9
    def index(cls, indexDir, dataDir):

        if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
            raise IOError, "%s does not exist or is not a directory" % dataDir

        writer = IndexWriter(FSDirectory.open(indexDir), StandardAnalyzer(Version.LUCENE_CURRENT), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(False)

        numIndexed = cls.indexDirectory(writer, dataDir)
        writer.optimize()
        writer.close()

        return numIndexed
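The classmethod above delegates to cls.indexDirectory, which is not shown; a hypothetical sketch of it, assuming the usual one-Document-per-*.txt-file pattern and that Document and Field are imported from lucene (both assumptions):

    @classmethod
    def indexDirectory(cls, writer, dataDir):
        # hypothetical helper: index every *.txt file in dataDir, return the count
        count = 0
        for name in os.listdir(dataDir):
            if not name.endswith('.txt'):
                continue
            path = os.path.join(dataDir, name)
            doc = Document()
            doc.add(Field("filename", name,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("contents", open(path).read(),
                          Field.Store.NO, Field.Index.ANALYZED))
            writer.addDocument(doc)
            count += 1
        return count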
Example #10
def search(q):
    
    initLucene()
    fsDir = FSDirectory.getDirectory(INDEX_PATH, False)
    searcher = IndexSearcher(fsDir)
    query = QueryParser("contents", StandardAnalyzer()).parse(q)
    start = time()
    hits = searcher.search(query)
    duration = timedelta(seconds=time() - start)
    matchpages = []
    for hit in hits:
        doc = Hit.cast_(hit).getDocument()
        temp = {"title": doc["title"], "url": doc["url"]}
        matchpages.append(temp)
    result = {"no_of_hits": hits.length(), "duration": duration,
              "query": q, "result": matchpages}
    return result
Example #11
def run(command):
    if command == '':
        return None
    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()
    
    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parser.setFuzzyMinSim(0.2)
    query = parser.parse(command)
    hits = map(transform, searcher.search(query))
    searcher.close()
    return hits
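Both this example and Example #16 map an undefined transform helper over the search hits; a hypothetical version, assuming it simply unwraps each Hit into a dict of its stored fields ("name" and "contents" are assumed field names, taken from Example #16):

def transform(hit):
    # hypothetical helper: unwrap the Hit and expose its stored fields
    doc = Hit.cast_(hit).getDocument()
    return {"name": doc.get("name"), "contents": doc.get("contents")}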
Example #12
    def index(self):

        dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                               "verbose-index")
        dir = FSDirectory.getDirectory(dirPath, True)
        writer = IndexWriter(dir, SimpleAnalyzer(), True)

        writer.setInfoStream(System.out)

        for i in xrange(100):
            doc = Document()
            doc.add(Field("keyword", "goober",
                             Field.Store.YES, Field.Index.UN_TOKENIZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
Example #13
    def search(cls, indexDir, q):

        fsDir = FSDirectory.getDirectory(indexDir, False)
        searcher = IndexSearcher(fsDir)

        query = QueryParser("contents", StandardAnalyzer()).parse(q)
        start = time()
        hits = searcher.search(query)
        duration = timedelta(seconds=time() - start)

        # result = {"no_of_hits":hits.length(),"duration":duration, "query":q,}
        # return
        print "Found %d document(s) (in %s) that matched query '%s':" % (hits.length(), duration, q)

        for hit in hits:
            doc = Hit.cast_(hit).getDocument()
            print doc
Example #14
    def __init__(self):
        self.save = Field.Store.YES
        self.ddict = dict()

        lvm.attachCurrentThread()

        if not WorkflowIndexer.store and not WorkflowIndexer.writer:
            try:
                # open the lucene index directory
                WorkflowIndexer.store = FSDirectory.open(File(location))
                # create a fresh index if the directory is empty,
                # otherwise append to the existing one
                WorkflowIndexer.writer = IndexWriter(WorkflowIndexer.store,
                    vistrailAnalyzer(), not len(WorkflowIndexer.store.list()))
            except Exception, e:
                print "EXCEPTION", e
                self.close()
                raise
Example #15
    def index(self):

        dirPath = os.path.join(tempfile.gettempdir(),
                               "verbose-index")
        dir = FSDirectory.open(dirPath)
        writer = IndexWriter(dir, SimpleAnalyzer(), True)

        writer.setInfoStream(InfoStreamOut())

        for i in xrange(100):
            doc = Document()
            doc.add(Field("keyword", "goober",
                             Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
Example #16
def similar(command, docno):
    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()
    
    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parser.setFuzzyMinSim(0.2)
    query = parser.parse(command)
    hits = searcher.search(query)
    document = hits.id(docno)

    ir = IndexReader.open(STORE_DIR)
    mlt = MoreLikeThis(ir)
    mlt.setFieldNames(['name', 'contents'])
    mlt.setMinWordLen(2)
    mlt.setBoost(True)
    query = mlt.like(document)
    hits = map(transform, searcher.search(query))
    searcher.close()
    return hits
Example #17
    emo_propagation_by_time = sorted(emoticon_propagation_hash.items(), key=itemgetter(0))
    emoticon_file = open(emoticon_file_name,'w')
    emoticon_file.write("day,"+",".join(countrylist)+",total,totalinreplies,alltweets,emoticontweets,httpemoticons,UStweets,JPtweets\n")        
    for emo_day_entry in emo_propagation_by_time:
        emoticon_file.write(str(emo_day_entry[0])+","+",".join([str(emo_day_entry[1].get(ctry,0)) for ctry in countrylist]) + "," + \
                            str(emo_day_entry[1]["total"]) + "," + str(emo_day_entry[1]["total_in_replies"]) + "," + str(emo_day_entry[1]['total tweets']) + "," + \
                            str(emo_day_entry[1]["total emoticon tweets"]) + "," + str(emo_day_entry[1]["total http emoticons"]) + "," + \
                            str(emo_day_entry[1]["total US tweets"]) + "," + str(emo_day_entry[1]["total JP tweets"]) + "\n")
    emoticon_file.close()
    print "done at: ", time.time()

if __name__ == '__main__':
    STORE_DIR =  "/Volumes/TerraFirma/SharedData/vdb5/lucene_index"
    initVM(CLASSPATH, maxheap='1024m')
    print 'lucene', VERSION
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = WhitespaceAnalyzer()
    getBaselineStatistics(searcher, analyzer)
    #emoticon_list = [":)", ":(", ";)", ":P", ":0", "^^", "TT", ":p", ":/", "^_^", "T_T"]
    #emoticon_list = [":)", ":(", ":'(", ":-|", "^^", "+_+", "-_-", "T_T"]
    emoticon_list = [":)",":D",":(",";)",":P","^^","^_^","-_-","T_T",":o","@_@"]
    for prop_emoticon in emoticon_list: getEmoticonPropagationCurves(prop_emoticon, searcher, analyzer)
    #getEmoticonPropagationCurves(":)", searcher, analyzer)
    #getEmoticonPropagationCurves(":(", searcher, analyzer)
    #getEmoticonPropagationCurves("^_^", searcher, analyzer)
    #getEmoticonPropagationCurves(";)", searcher, analyzer)
    #getEmoticonPropagationCurves("TT", searcher, analyzer)
    #getEmoticonPropagationCurves("=^", searcher, analyzer)
    searcher.close()
Example #18
def run(searcher, parser):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query: ")
        if command == '':
            return

        print "Searching for:", command
        query = parser.parse(command)
        hits = searcher.search(query, Sort("population", True))
        print "%s total matching documents." % hits.length()

        for hit in hits:
            doc = Hit.cast_(hit).getDocument()
            print 'name:', doc.get("name"), ' state:', doc.get("state")


if __name__ == '__main__':
    STORE_DIR = "index"
    initVM(CLASSPATH)
    print 'lucene', VERSION
    directory = FSDirectory.getDirectory(STORE_DIR)
    searcher = IndexSearcher(directory)
    analyzer = StopAnalyzer()
    parser = QueryParser("all_names", analyzer)
    parser.setDefaultOperator(parser.AND_OPERATOR)
    run(searcher, parser)
    searcher.close()

Example #19
import sys, shutil
from itertools import count

from lucene import (IndexWriter, StandardAnalyzer, Document, Field,
        MultiFieldQueryParser, IndexSearcher, initVM, CLASSPATH, Hit,
        FSDirectory, BooleanClause)
initVM(CLASSPATH)

DIRECTORY = 'xxindex'
STORE = FSDirectory.getDirectory(DIRECTORY, True)

def indexDoc(writer, d):
    doc = Document()
    name_ = Field("name_", d.name,
                  Field.Store.YES, Field.Index.TOKENIZED)
    name_.setBoost(2.0)
    full_text = Field("full_text", d.full,
                  Field.Store.YES, Field.Index.TOKENIZED)
    id = Field("id", str(d.id),
                  Field.Store.YES, Field.Index.UN_TOKENIZED)
    doc.add(name_)
    doc.add(full_text)
    doc.add(id)

    writer.addDocument(doc)
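
A hypothetical call to indexDoc, assuming a simple record object that carries the three attributes the function reads (name, full, id):

class Record(object):
    # hypothetical stand-in for whatever objects get passed to indexDoc()
    def __init__(self, id, name, full):
        self.id, self.name, self.full = id, name, full

writer = IndexWriter(STORE, StandardAnalyzer(), True)
indexDoc(writer, Record(1, "first monster", "full text for the first monster"))
writer.close()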


class MyHit(object):
    """One monster search result
    Essentially an adapter for Lucene's hits
    """