Example 1
def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)
    nonDiverse = []
    docsToScores = {}
    #create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        #print(new_urls[str(hit.doc)])
        result = str(hit.score) + " " + str(hit.doc) + " " + hit.toString()
        if (len(nonDiverse) < 10):
            nonDiverse.append(new_urls[str(hit.doc)])
        #find the document that corresponds to the html website and append to a list for min distance
        website = new_urls[str(hit.doc)]
        #html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse
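retrieveDocs leans on two module-level mappings that the snippet does not show. A minimal sketch of their assumed shape (the names come from the code above; the values are made up):

# Assumed module-level state, inferred from the lookups above:
# new_urls: Lucene doc id (as a str) -> URL of the crawled page
# inv_map:  URL -> html_files number of that page
new_urls = {"0": "http://example.com/page0"}
inv_map = {"http://example.com/page0": "17"}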
Example 2
def luceneIndexer(contents):
    lucene.initVM()

    INDEXIDR = settings.INDEX_DIR

    indexdir = SimpleFSDirectory(File(INDEXIDR))

    analyzer = StandardAnalyzer(Version.LUCENE_30)

    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile

        document = Document()

        content = tfile.getvalue()

        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    # optimize once, after all documents are added, not per document
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
Example 3
 def fn():
     env.attachCurrentThread()
     start = datetime.now()
     IndexFiles(sys.argv[1], "index",
                StandardAnalyzer(Version.LUCENE_CURRENT))
     end = datetime.now()
     print end - start
Example 4
    def post(self):
        q = self.get_argument("query")

        lucene.initVM()
        indexDir = "index"
        dir = SimpleFSDirectory(File(indexDir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        searcher = IndexSearcher(dir)

        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
        MAX = 10
        hits = searcher.search(query, MAX)

        print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        items = []

        # collect the crawl URL for each hit to feed the template
        for hit in hits.scoreDocs:
            print hit.score, hit.doc, hit.toString()
            items.append(doc_urls[str(hit.doc)])

        self.render("index.html", title="Results", items=items, query=q)
Example 5
def createIndex():
    #initialize lucene and jvm
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    #get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)

    #get index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))

    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data=myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
Example 6
    def testPrefixQuery(self):

        parser = QueryParser(Version.LUCENE_CURRENT, "category",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        parser.setLowercaseExpandedTerms(False)

        print parser.parse("/Computers/technology*").toString("category")
Example 7
    def main(cls, argv):

        query = TermQuery(Term("f", "ipsum"))
        scorer = QueryScorer(query)
        formatter = SimpleHTMLFormatter("<span class=\"highlight\">",
                                        "</span>")
        highlighter = Highlighter(formatter, scorer)
        fragmenter = SimpleFragmenter(50)
        highlighter.setTextFragmenter(fragmenter)

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        tokenStream = analyzer.tokenStream("f", StringReader(cls.text))
        result = highlighter.getBestFragments(tokenStream, cls.text, 5, "...")

        stdout.write("<html>")
        stdout.write("<style>\n")
        stdout.write(".highlight {\n")
        stdout.write(" background: yellow\n")
        stdout.write("}\n")
        stdout.write("</style>")

        stdout.write("<body>")
        stdout.write(result)
        stdout.write("</body></html>\n")
        stdout.flush()
Example 8
 def __init__(self):
     self.__dict__ = self.__shared_state
     if not self.__shared_state:
         self.jccvm = lucene.initVM()
         self.index = SimpleFSDirectory(
             lucene.File(settings.lucene_index_dir))
         self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
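The one-space indent shows this __init__ was lifted from a class body, and it relies on a class-level __shared_state dict the snippet omits. A minimal skeleton of the enclosing class, assuming the usual Borg recipe (the class name is invented; settings is presumed to be a Django-style settings module, as in the snippet):

import lucene
from lucene import SimpleFSDirectory, StandardAnalyzer, Version

class LuceneState(object):
    # Borg pattern: every instance adopts this shared dict as its
    # __dict__, so the JVM is started and the index directory and
    # analyzer are set up exactly once, by the first instance.
    __shared_state = {}

    def __init__(self):
        self.__dict__ = self.__shared_state
        if not self.__shared_state:
            self.jccvm = lucene.initVM()
            self.index = SimpleFSDirectory(
                lucene.File(settings.lucene_index_dir))
            self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)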
Example 9
def luceneRetriver(query):

    lucene.initVM()

    indir = SimpleFSDirectory(File(INDEXDIR))

    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)

    lucene_searcher = IndexSearcher(indir)

    my_query = QueryParser(Version.LUCENE_30, "text",
                           lucene_analyzer).parse(query)

    MAX = 1000

    total_hits = lucene_searcher.search(my_query, MAX)

    print "Hits: ", total_hits.totalHits

    for hit in total_hits.scoreDocs:

        print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, \
            "HitString:", hit.toString()

        doc = lucene_searcher.doc(hit.doc)

        print doc.get("text").encode("utf-8")
Example 10
def configure_lucene():

    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    print >> sys.stderr, "Reading lines from clique.txt..."
    for line in f:
        # strip tabs, carriage returns, newlines and carets before indexing
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from clique.txt (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Example 11
def names():
    lst = []

    search = "spax"  #request.form['product']
    lucene.initVM()

    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text",
                        analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)

    for hit in hits.scoreDocs:
        if hit.score >= 1:
            print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            print doc.get("text").encode("utf-8")
            items = doc.get("text").encode("utf-8").split(',')
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
                    lst.append(item)
    data = {"products": lst}
    # GET and POST return the same JSON payload
    return jsonify(data)
Example 12
def index_files(files, index_directory):
    lucene.initVM()
    d = SimpleFSDirectory(File(index_directory))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(d, analyzer, True, IndexWriter.MaxFieldLength(512))
    for f in files:
        parse_file(f, writer)
    writer.optimize()
    writer.close()
Example 13
    def open(self, name, txn, **kwds):

        super(IndexContainer, self).open(name, txn, **kwds)

        if kwds.get('create', False):
            directory = self.getDirectory()
            indexWriter = IndexWriter(directory, StandardAnalyzer(), True)
            indexWriter.close()
            directory.close()
Example 14
def extractFeatureQueryWords(query):
    import string
    from lucene import Document, TermQuery, Term

    # create analyzer
    aux_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    try:
        file = open('../features.txt', 'r')

        featurelist = []
        for line in file.readlines():
            words_in_line = line.split()
            featurelist += words_in_line

        querywordlist = query.split()

        featureQueryList = []
        productQueryList = []

        for word in querywordlist:
            if word in featurelist:
                featureQueryList.append(word)
            else:
                # create parser for word
                aux_parser = QueryParser(Version.LUCENE_CURRENT, "title",
                                         aux_analyzer)
                aux_query = aux_parser.parse(word)
                scoreDocs = searcher.search(aux_query, 50).scoreDocs
                if scoreDocs:
                    productQueryList.append(word)

        featureQuery = ""
        if featureQueryList:
            featureQuery = "("
            for i in range(len(featureQueryList)):
                if i == len(featureQueryList) - 1:
                    featureQuery += featureQueryList[i] + ")"
                else:
                    featureQuery += featureQueryList[i] + " AND "

            print featureQuery

        productQuery = ""
        if productQueryList:
            productQuery = "("
            for i in range(len(productQueryList)):
                if i == len(productQueryList) - 1:
                    productQuery += productQueryList[i] + ")"
                else:
                    productQuery += productQueryList[i] + " AND "

        return (featureQuery, productQuery, featureQueryList, productQueryList)
    except Exception, ex:
        print "Could not separate feature query words. Reason: ", ex
        # use query.split() here: querywordlist may not be bound yet if
        # the exception was raised while reading features.txt
        return ("", "(" + query + ")", [], query.split())
def SearchFiles(command):
    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    # print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    rankedfiles = run(searcher, analyzer, command)
    searcher.close()
    return rankedfiles
Example 16
def search(command):
    STORE_DIR = "index"
    vm_env = initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    result = run(searcher, analyzer, command)
    searcher.close()
    return result
Example 17
    def testPhraseQuery(self):

        analyzer = StandardAnalyzer(Version.LUCENE_24)
        q = QueryParser(Version.LUCENE_24, "field",
                        analyzer).parse('"This is Some Phrase*"')
        self.assertEqual("\"some phrase\"", q.toString("field"), "analyzed")

        q = QueryParser(Version.LUCENE_CURRENT, "field",
                        self.analyzer).parse('"term"')
        self.assert_(TermQuery.instance_(q), "reduced to TermQuery")
Example 18
def getResultScoreDocs(query):
    # create analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # create parser for user submitted query
    parser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    formatted_query = parser.parse(query)
    scoreDocs = searcher.search(formatted_query, 50).scoreDocs

    return scoreDocs
Example 19
    def commitIndexWriter(self, writer):

        directory = writer.getDirectory()
        writer.close()
        dbDirectory = self.getDirectory()
        dbWriter = IndexWriter(dbDirectory, StandardAnalyzer(), False)
        dbWriter.setUseCompoundFile(False)
        dbWriter.addIndexes([directory])
        directory.close()
        dbWriter.close()
        dbDirectory.close()
Example 20
    def main(cls, argv):

        print "SimpleAnalyzer"
        cls.displayTokensWithFullDetails(SimpleAnalyzer(),
                                         "The quick brown fox....")

        print "\n----"
        print "StandardAnalyzer"
        cls.displayTokensWithFullDetails(
            StandardAnalyzer(Version.LUCENE_CURRENT),
            "I'll e-mail you at xyz@example.com")
Example 21
    def testWithQueryParser(self):

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            self.synonymAnalyzer).parse('"fox jumps"')
        topDocs = self.searcher.search(query, 50)
        # in Lucene 1.9, position increments are no longer ignored
        self.assertEqual(1, topDocs.totalHits, "!!!! what?!")

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            StandardAnalyzer(
                                Version.LUCENE_CURRENT)).parse('"fox jumps"')
        topDocs = self.searcher.search(query, 50)
        self.assertEqual(1, topDocs.totalHits, "*whew*")
Example 22
    def createIndex(cls, dataDir, indexDir, useCompound):

        indexDir = SimpleFSDirectory(File(indexDir))
        writer = IndexWriter(indexDir,
                             StandardAnalyzer(Version.LUCENE_CURRENT), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(useCompound)

        for dir, dirnames, filenames in os.walk(dataDir):
            for filename in filenames:
                if filename.endswith('.properties'):
                    cls.indexFile(writer, os.path.join(dir, filename), dataDir)

        writer.optimize()
        writer.close()
Example 23
def query(query):
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)]
        doc = searcher.doc(hit.doc)
Example 24
def build_perfield_analyzer(index_fields):
    """
    This function creates a PerFieldAnalyzerWrapper which allows us 
    to associate different Lucene Analyzers to specific fields in our 
    Lucene index. 
    """
    analyzer = PerFieldAnalyzerWrapper(StandardAnalyzer(
        Version.LUCENE_CURRENT))

    for (index_name, analyzer_type) in index_fields.iteritems():
        if analyzer_type == "standard":
            continue

        analyzer.addAnalyzer(index_name, ANALYZER_LOOKUP.get(analyzer_type))

    return analyzer
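A sketch of how this might be driven, assuming PyLucene's flat namespace and an ANALYZER_LOOKUP table of the shape the function expects (the table contents and field names below are made up):

from lucene import KeywordAnalyzer, WhitespaceAnalyzer

# hypothetical lookup table; keys match the analyzer_type strings
ANALYZER_LOOKUP = {
    "keyword": KeywordAnalyzer(),
    "whitespace": WhitespaceAnalyzer(),
}

# fields marked "standard" fall through to the wrapped StandardAnalyzer
analyzer = build_perfield_analyzer({
    "id": "keyword",        # exact-match identifier field
    "tags": "whitespace",   # split on whitespace only
    "body": "standard",     # default full-text analysis
})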
Example 25
    def search(cls, indexDir, q):

        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(fsDir, True)

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
        start = time()
        hits = searcher.search(query, 50).scoreDocs
        duration = timedelta(seconds=time() - start)

        print "Found %d document(s) (in %s) that matched query '%s':" % (
            len(hits), duration, q)

        for hit in hits:
            doc = searcher.doc(hit.doc)
            print 'path:', doc.get("path")
Example 26
    def find(self, query, indir):
        lucene.initVM()
        INDEXDIR = indir

        indir = SimpleFSDirectory(File(INDEXDIR))
        lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
        lucene_searcher = IndexSearcher(indir)
        my_query = QueryParser(Version.LUCENE_30,"<default field>",\
        lucene_analyzer).parse("text:" + query + " OR title:" + query)
        MAX = 1000
        total_hits = lucene_searcher.search(my_query, MAX)
        print "\nHits: ", total_hits.totalHits, "\n"

        for hit in total_hits.scoreDocs:
            doc = lucene_searcher.doc(hit.doc)
            print "Hit Score:", "%.4f" % hit.score, \
                "Department:", doc.get("department").encode("utf-8"), \
                "Title:", doc.get("title").encode("utf-8")
            print doc.get("url").encode("utf-8"), '\n'
Example 27
    def index(cls, indexDir, dataDir):

        if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
            raise IOError, "%s does not exist or is not a directory" % (
                dataDir)

        dir = SimpleFSDirectory(File(indexDir))
        writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                             True, IndexWriter.MaxFieldLength.LIMITED)
        writer.setUseCompoundFile(False)

        cls.indexDirectory(writer, dataDir)

        numIndexed = writer.numDocs()
        writer.optimize()
        writer.close()
        dir.close()

        return numIndexed
Example 28
def luceneIndexer(docdir, indir):
    """Index the .txt documents under a directory."""

    lucene.initVM()

    DIRTOINDEX = docdir
    INDEXIDR = indir

    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))

    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile

    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
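A minimal invocation sketch (both paths are made up; the function indexes every .txt file found directly under docdir):

# hypothetical paths: read docs/*.txt, write the index into index/
luceneIndexer("docs", "index")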
Example 29
    def someMethod(self):

        directory = RAMDirectory()

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        writer = IndexWriter(directory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        # stored and analyzed: the 3.x spelling of the old Field.Text
        doc.add(Field("title", "This is the title",
                      Field.Store.YES, Field.Index.ANALYZED))
        # analyzed but not stored: the 3.x spelling of Field.UnStored
        doc.add(Field("contents", "...document contents...",
                      Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)

        # an analyzer can also be supplied per document
        writer.addDocument(doc, analyzer)

        expression = "some query"

        parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        query = parser.parse(expression)
Example 30
    def index(self, doc, title, department, url):
        indexdir = SimpleFSDirectory(File(self.indir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        index_writer = IndexWriter(indexdir, analyzer, self.init,
                                   IndexWriter.MaxFieldLength(512))
        self.init = False

        # Initialize document and index it
        document = Document()
        document.add(
            Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("url", url, Field.Store.YES, Field.Index.ANALYZED))
        document.add(
            Field("department", department, Field.Store.YES,
                  Field.Index.ANALYZED))
        document.add(Field("text", doc, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)

        index_writer.optimize()
        index_writer.close()
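The self.init flag means the first call creates the index and every later call appends to it. A skeleton of the owning class under that assumption (the class name is invented; the method above only needs indir and the flag):

class DepartmentIndexer(object):
    def __init__(self, indir):
        self.indir = indir  # directory wrapped by SimpleFSDirectory
        self.init = True    # True: the first IndexWriter call creates the index

    # the index() method shown above belongs here; it sets
    # self.init = False so every subsequent call appends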