Example #1
def configure_lucene():
    
    f = open('clique.txt','r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t','')
        line = line.replace('\r','')
        line = line.replace('\n','')
        line = line.replace('^','')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Example #2
	def index( self ):
		lucene.initVM()
		indexdir = SimpleFSDirectory( File( self.INDEX_DIR ) )
		analyzer = StandardAnalyzer( Version.LUCENE_30 )
		index_writer = IndexWriter( indexdir, analyzer, True, IndexWriter.MaxFieldLength( 512 ) )
		# read input files (.xml)
		for in_file in glob.glob( os.path.join( self.DOC_DIR, '*.xml' ) ):
			corpus = codecs.open( in_file, encoding='utf-8' ).read()
			d = pq( corpus, parser='html' )
			for text in d( 'Article' ).items():
				document = Document()
				# find ID
				art_id = str( text.attr( 'articleid' ).encode( 'utf-8' ) ).replace( '+', '-' )
				# find Title
				art_title = self.stem( str( text.attr( 'title' ).encode( 'utf-8' ) ) )
				# find Abstract
				art_abstract = self.stem( str( text.find( 'Abstract' ).html().encode('utf-8') ) )
				# find Keyword
				art_keyword = text.find( 'Keyword' ).html().encode('utf-8')
				# find Content
				art_content = self.stem( str( text.find( 'Content' ).html().encode('utf-8') ) )
				# find Authors
				art_authors = text.find( 'Authors' ).html().encode('utf-8')
				document.add( Field( 'id', art_id, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'title', art_title, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'abstract', art_abstract, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'keyword', art_keyword, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'content', art_content, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'authors', art_authors, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'article', art_title + art_abstract + art_keyword + art_content,\
									 Field.Store.YES,\
									 Field.Index.ANALYZED ) )
				index_writer.addDocument( document )
		index_writer.optimize()
		index_writer.close()
def index(string):
 lucene.initVM()
 indexDir = "REMOVEME.index-dir"
 dir = SimpleFSDirectory(File(indexDir))
 analyzer = StandardAnalyzer(Version.LUCENE_30)
 try:
  writer = IndexWriter(dir, analyzer, False, IndexWriter.MaxFieldLength(512))
 except lucene.JavaError:
  #print 'Inside Index Except'
  writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
#e = sys.exc_info()[0]
#print e
 #print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

 doc = Document()
 doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
 writer.addDocument(doc)
 #print 'In the index function'
 #print writer.numDocs()

#print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
#print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
 writer.optimize()
#print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
#print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
 #print 'ending Indexing'
 #print string 
 #print 'Total indexes'
 #print writer.numDocs() 
 writer.close()
    def main(cls, argv):

        if len(argv) < 5:
            print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
            return
            
        docsInIndex  = int(argv[1])

        # create an index called 'index-dir' in a temp directory
        indexDir = os.path.join(tempfile.gettempdir(),
                                'index-dir')
        dir = FSDirectory.open(indexDir)
        analyzer = SimpleAnalyzer()
        writer = IndexWriter(dir, analyzer, True)

        # set variables that affect speed of indexing
        writer.setMergeFactor(int(argv[2]))
        writer.setMaxMergeDocs(int(argv[3]))
        writer.setMaxBufferedDocs(int(argv[4]))
        # writer.infoStream = tempfile.out

        print "Merge factor:  ", writer.getMergeFactor()
        print "Max merge docs:", writer.getMaxMergeDocs()
        print "Max buffered docs:", writer.getMaxBufferedDocs()

        start = time()
        for i in xrange(docsInIndex):
            doc = Document()
            doc.add(Field("fieldname", "Bibamus",
                          Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)

        writer.close()
        print "Time: ", timedelta(seconds=time() - start)
Example #5
def indexDocuments():
    # empty index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)

    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)

    for article in Wikipedia():
        doc = Document()
        doc.add(
            Field('id', str(article['id'][0]), Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        doc.add(
            Field('title', article['url'], Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        doc.add(
            Field('content', article['text'], Field.Store.NO,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)

    print 'Optimization'
    writer.optimize()
    writer.close()
Example #6
    def main(cls, argv):

        if len(argv) < 5:
            print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
            return

        docsInIndex = int(argv[1])

        # create an index called 'index-dir' in a temp directory
        indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                                'index-dir')
        dir = FSDirectory.getDirectory(indexDir, True)
        analyzer = SimpleAnalyzer()
        writer = IndexWriter(dir, analyzer, True)

        # set variables that affect speed of indexing
        writer.setMergeFactor(int(argv[2]))
        writer.setMaxMergeDocs(int(argv[3]))
        writer.setMaxBufferedDocs(int(argv[4]))
        # writer.infoStream = System.out

        print "Merge factor:  ", writer.getMergeFactor()
        print "Max merge docs:", writer.getMaxMergeDocs()
        print "Max buffered docs:", writer.getMaxBufferedDocs()

        start = time()
        for i in xrange(docsInIndex):
            doc = Document()
            doc.add(
                Field("fieldname", "Bibamus", Field.Store.YES,
                      Field.Index.TOKENIZED))
            writer.addDocument(doc)

        writer.close()
        print "Time: ", timedelta(seconds=time() - start)
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        # Elwood
        document = Document()
        document.add(
            Field("owner", "elwood", Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        document.add(
            Field("keywords", "elwoods sensitive info", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(document)

        # Jake
        document = Document()
        document.add(
            Field("owner", "jake", Field.Store.YES, Field.Index.NOT_ANALYZED))
        document.add(
            Field("keywords", "jakes sensitive info", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(document)

        writer.close()
Example #8
    def index(source, indexName):

        if(not os.path.exists(indexName)):
            os.mkdir(indexName)
            
        indexDir = File(indexName)

        writer = IndexWriter(SimpleFSDirectory(File(indexName)),
                             StandardAnalyzer(Version.LUCENE_CURRENT), True,
                             IndexWriter.MaxFieldLength.LIMITED)
      
        p = re.compile("(GH\d+\-\d+)\n(.*?)\n+", re.DOTALL)
        res = p.findall(source)
        
        i = 0
        for pair in res:
            i += 1
            doc = Document()
            doc.add(Field("id", pair[0], Field.Store.YES, Field.Index.NO))
            for t in pair[1].split():
                doc.add(Field("content", t.replace("-","_"), Field.Store.NO, Field.Index.NOT_ANALYZED));
                #doc.add(Field("content", pair[1], Field.Store.NO, Field.Index.ANALYZED));
                
            writer.addDocument(doc)
            
        writer.close()
        print str(i)+ " docs indexed"
Example #9
def luceneIndexer(contents):
    lucene.initVM()

    INDEXIDR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
        index_writer.optimize()
        print index_writer.numDocs()
    index_writer.close()
    def testUpdate(self):

        self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

        reader = IndexReader.open(self.dir, False)
        reader.deleteDocuments(Term("city", "Amsterdam"))
        reader.close()

        writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
        doc.add(
            Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
        doc.add(
            Field("city", "St. Petersburg", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.optimize()
        writer.close()

        self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
        self.assertEqual(1, self.getHitCount("city", "Petersburg"))
 def index(self,path_to_index,path_files):
     'indexes anchor texts from a given folder'
     #lucene.initVM()
     indexDir = path_to_index
     directory_index = SimpleFSDirectory(File(indexDir))
     analyzer = StandardAnalyzer(Version.LUCENE_35)
     writer = IndexWriter(directory_index, analyzer, True, IndexWriter.MaxFieldLength(512))
     listOfPathes = []
     listOfPathes.extend(glob.glob(path_files+"*.txt"))
     counter = 0
     for path_to_file in listOfPathes:
         print path_to_file
         f = open(path_to_file,"r")
         for line in f:
             entry = line.split("\t")
             counter+=1
             """
             optimizes index after a certain amount of added documents
             """
             if counter%500000==0:
                 print counter
                 writer.optimize()
             doc = Document()
             doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED))
             doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED))
             doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED))
             doc.add(Field("number", entry[3].replace("\n",""), Field.Store.YES, Field.Index.ANALYZED))
             writer.addDocument(doc)
         writer.optimize()
      
         f.close()
         
     writer.close()
     print counter
     print "done"
Example #12
    def addDocuments(self, dir):

        writer = IndexWriter(dir, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        #
        # change to adjust performance of indexing with FSDirectory
        # writer.mergeFactor = writer.mergeFactor
        # writer.maxMergeDocs = writer.maxMergeDocs
        # writer.minMergeDocs = writer.minMergeDocs
        #

        for word in self.docs:
            doc = Document()
            doc.add(
                Field("keyword", word, Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(Field("unindexed", word, Field.Store.YES, Field.Index.NO))
            doc.add(
                Field("unstored", word, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("text", word, Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
Example #13
class BuildIndex:
    def __init__(self, indir):
        lucene.initVM()
        indexdir = SimpleFSDirectory(File(indir))
        self.index_writer = IndexWriter(indexdir, self.getAnalyzer(), True,
                                        IndexWriter.MaxFieldLength(512))

    def getAnalyzer(self):
        return ChineseAnalyzer(lucene.Version.LUCENE_CURRENT)

    def addDocuments(self, _id, title, content):
        doc = Document()
        doc.add(Field("id", _id, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if title is not None and len(title) > 0:
            doc.add(
                Field("titleKeyword", title, Field.Store.NO,
                      Field.Index.ANALYZED))
        if content is not None and len(content) > 0:
            doc.add(
                Field("contentKeyword", content, Field.Store.NO,
                      Field.Index.ANALYZED))
        self.index_writer.addDocument(doc)

    def close(self):
        self.index_writer.optimize()
        self.index_writer.close()
Example #14
def configure_lucene():

    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs(
    )

    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (
        writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs(
    )
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs(
    )
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Example #15
    def setUp(self):
        
        animals = [ "aardvark", "beaver", "coati",
                    "dog", "elephant", "frog", "gila monster",
                    "horse", "iguana", "javelina", "kangaroo",
                    "lemur", "moose", "nematode", "orca",
                    "python", "quokka", "rat", "scorpion",
                    "tarantula", "uromastyx", "vicuna",
                    "walrus", "xiphias", "yak", "zebra" ]

        analyzer = WhitespaceAnalyzer()

        aTOmDirectory = RAMDirectory()
        nTOzDirectory = RAMDirectory()

        aTOmWriter = IndexWriter(aTOmDirectory, analyzer, True,
                                 IndexWriter.MaxFieldLength.UNLIMITED)
        nTOzWriter = IndexWriter(nTOzDirectory, analyzer, True,
                                 IndexWriter.MaxFieldLength.UNLIMITED)

        for animal in animals:
            doc = Document()
            doc.add(Field("animal", animal,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))

            if animal[0].lower() < "n":
                aTOmWriter.addDocument(doc)
            else:
                nTOzWriter.addDocument(doc)

        aTOmWriter.close()
        nTOzWriter.close()

        self.searchers = [ IndexSearcher(aTOmDirectory),
                           IndexSearcher(nTOzDirectory) ]
    def addDocuments(self, dir, isCompound):

        writer = IndexWriter(dir, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)
        writer.setUseCompoundFile(isCompound)

        # change to adjust performance of indexing with FSDirectory
        # writer.mergeFactor = writer.mergeFactor
        # writer.maxMergeDocs = writer.maxMergeDocs
        # writer.minMergeDocs = writer.minMergeDocs

        for word in self.docs:
            doc = Document()
            doc.add(Field("keyword", word,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("unindexed", word,
                          Field.Store.YES, Field.Index.NO))
            doc.add(Field("unstored", word,
                          Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("text", word,
                          Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
    def update_index_withLineArray(self,array):
        """
        Parsed sentences (given in an array) are added to the index, with the corresponding two entities (x,y) and the DBpedia URI
        """
        print "start adding sentences"
        writer = IndexWriter(index_directory, analyzer, False, IndexWriter.MaxFieldLength(512))
        for item in array:
            line = item[0]
            x = item[1]
            y = item[2]
            uri = item[3]
            line=line.replace("\t"," ")
            line = line.replace("\n","  ")
            line = line.replace("   ","  ")
            try:
                
                doc = Document()
                doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
                doc.add(Field("X", x, Field.Store.YES, Field.Index.ANALYZED))
                doc.add(Field("Y", y, Field.Store.YES, Field.Index.ANALYZED))
                doc.add(Field("URI", uri, Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
                
            except Exception:
                print "Unexpected error:", sys.exc_info()[0]
                raw_input("Error in updating the Sentences")
        try:
            writer.optimize()
        except:
            print "Unexpected error:", sys.exc_info()[0]
            print ("could not optimize index")

        writer.close()
        print "all sentences added"
Example #18
    def setUp(self):

        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True, IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(Field("f", "the quick brown fox jumps over the lazy dog", Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("f", "the quick red fox jumps over the sleepy cat", Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))
Example #19
def do_index():
    initVM()
    indexDir = "/home/william/woyaoo/luceneindex"
    version = Version.LUCENE_CURRENT
    standardAnalyzer = StandardAnalyzer(version)
    # chineseAnalyzer = CJKAnalyzer(version)
    engine = data.engine_from_config("indexdb.config")
    # engine = data.engine_from_config()
    db = data.init_datafactory(engine)
    docs = dbfactory.Session().query(doc_model.Doc).filter(doc_model.Doc.dateCreated > "20121220").all()
    print len(docs)
    idxDir = SimpleFSDirectory(File(indexDir))
    perIndexCount = 5000
    writer = IndexWriter(idxDir, standardAnalyzer, True, IndexWriter.MaxFieldLength(512))

    # add field
    for doc in docs:
        # print repr(doc.description)
        lucenedoc = Document()
        descriptionValue = doc.description.strip("\r\n").encode("UTF-8")
        # descriptionValue ='中国 abc'
        print repr(descriptionValue)
        lucenedoc.add(Field("url", doc.url, Field.Store.YES, Field.Index.NOT_ANALYZED))
        lucenedoc.add(Field("intent", doc.intent, Field.Store.YES, Field.Index.NOT_ANALYZED))
        # lucenedoc.add(Field('description', doc.description, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("description", descriptionValue, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("title", doc.title, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(lucenedoc)
        writer.optimize()
    writer.close()
    print "index finished"
Example #20
def createIndex():
    #initialize lucene and jvm
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    
    #get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    
    #get index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
   
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data=myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
Example #21
    def setUp(self):

        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(
            Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))
Example #22
 def addContents(self,contents):
      try:
           #iwconfig = IndexWriterConfig(SimpleAnalyzer(),IndexWriter.MaxFieldLength.LIMITED)
           writer = IndexWriter(self.ramIndex,SimpleAnalyzer(Version.LUCENE_CURRENT),True,IndexWriter.MaxFieldLength.LIMITED)
           for content in contents:
               doc = Document()
               doc.add(Field("contents",content[1],Field.Store.NO,Field.Index.ANALYZED,Field.TermVector.YES))
               writer.addDocument(doc)
           writer.close()
      except Exception,e:
           print 'Unable to add content to RAM index'        
Example #23
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(Field("contents",
                      "The quick brown fox jumps over the lazy dogs",
                       Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()
Example #24
    def indexSingleFieldDocs(self, fields):

        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        for field in fields:
            doc = Document()
            doc.add(field)
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
Example #25
    def indexSingleFieldDocs(self, fields):

        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        for field in fields:
            doc = Document()
            doc.add(field)
            writer.addDocument(doc)

        writer.commit()
        writer.close()
Example #26
    def setUp(self):

        # set up sample document
        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(Field("field", "the quick brown fox jumped over the lazy dog",
                       Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(directory)
Example #27
    def setUp(self):

        # set up sample document
        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(
            Field("field", "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(directory)
Example #28
def luceneIndexer(docdir, indir):
    """ Index documents from a directory.
    Args:
        docdir: folder containing the documents
        indir: folder in which the index is stored
    Returns:
        nothing
    Notes:
    FieldType().setStored = as-is value stored in the Lucene index
    FieldType().setTokenized = field is analyzed using the specified Analyzer - the tokens emitted are indexed
    FieldType().Indexed = the text (either as-is with keyword fields, or the tokens from tokenized fields) is made searchable (aka inverted)
    FieldType().Vectored = term frequency per document is stored in the index in an easily retrievable fashion.
    """

    """# Type 1 attributes: for fields that must be searchable and also returned for display, hence setStored(True)
    type1 = FieldType()
    type1.setIndexed(True)
    type1.setStored(True)
    type1.setTokenized(False)
    type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # Type 2 attributes: for fields that are searched but never returned for display. The body text (content) is of this kind, as is typical file META information.
    type2 = FieldType()
    type2.setIndexed(True)
    type2.setStored(False)
    type2.setTokenized(True)
    type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)"""

    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # construct a new index writer with the given analyzer
    index_writer = IndexWriter(indexdir, analyzer, True, IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing:", tfile
        document = Document()
        content = open(tfile, 'r').read()
        # how the FieldTypes above would be used:
        # doc.add(Field("path", tfile, type1))

        # add fields to the document: {name: "text", store: YES, index: ANALYZED}
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("path", tfile, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
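
The docstring above describes the newer FieldType-based field API, and the string-commented block inside the function already sketches how the two field types would be configured. Pulled out for readability (this mirrors the author's commented code and assumes the Lucene 4.x FieldType/FieldInfo classes are available, which the 3.x example itself does not use):

    # stored + indexed, not tokenized: fields that are searched and also returned for display
    type1 = FieldType()
    type1.setIndexed(True)
    type1.setStored(True)
    type1.setTokenized(False)
    type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # indexed + tokenized, not stored: full-text content that is searched but not displayed
    type2 = FieldType()
    type2.setIndexed(True)
    type2.setStored(False)
    type2.setTokenized(True)
    type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    # usage: doc.add(Field("path", tfile, type1))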
    def setUp(self):

        self.analyzer = WhitespaceAnalyzer()
        self.directory = RAMDirectory()

        writer = IndexWriter(self.directory, self.analyzer, True, 
                             IndexWriter.MaxFieldLength.LIMITED)

        for i in xrange(1, 501):
            doc = Document()
            doc.add(Field("id", NumberUtils.pad(i),
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

        writer.close()
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(Field("partnum", "Q36",
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("description", "Illidium Space Modulator",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = self.getSearcher()
Example #31
File: plush.py Project: jjguy/plush
 def initDummyStore(self, directory):
     """Open a dummy ramdirectory for testing."""
     writer = IndexWriter(directory, SimpleAnalyzer(), True)
     doc = Document()
     doc.add(Field("name", 'dummy.txt', Field.Store.YES,
                   Field.Index.UN_TOKENIZED))
     doc.add(Field("path", '/path/to/dummy.txt', Field.Store.YES,
                   Field.Index.UN_TOKENIZED))
     doc.add(Field("path", '/path/to/another/dummy.txt', Field.Store.YES,
                   Field.Index.UN_TOKENIZED))
     doc.add(Field("contents", "foo dummy bar", Field.Store.YES,
                   Field.Index.TOKENIZED))
     writer.addDocument(doc)
     writer.optimize()
     writer.close()
 def create_index(self,path_to_index):
     """
     Creates new Index
     """
     print "Create new Index"
     path = SimpleFSDirectory(File(path_to_index))
     analyzer = StandardAnalyzer(Version.LUCENE_35)
     writer = IndexWriter(path, analyzer, True, IndexWriter.MaxFieldLength(512))
     doc = Document()
     doc.add(Field("Sentence", "Hello World", Field.Store.YES, Field.Index.ANALYZED))
     doc.add(Field("X", "x", Field.Store.YES, Field.Index.ANALYZED))
     doc.add(Field("Y", "y", Field.Store.YES, Field.Index.ANALYZED))
     doc.add(Field("URI", "uri", Field.Store.YES, Field.Index.ANALYZED))
     writer.addDocument(doc)
     writer.close() 
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("partnum", "Q36", Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(
            Field("description", "Illidium Space Modulator", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
Example #34
def Indexer(docdir,indir):
	lucene.initVM()
	DIRTOINDEX   = docdir
	INDEXDIR     = indir
	indexdir     = SimpleFSDirectory(File(INDEXDIR))
	analyzer     = StandardAnalyzer(Version.LUCENE_30)
	index_writer = IndexWriter(indexdir,analyzer,True,IndexWriter.MaxFieldLength(512))
	for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')):
		print "Indexing ",tfile
		document=Document()
		content = open(tfile,'r').read()
		document.add(Field("text",content,Field.Store.YES,Field.Index.ANALYZED))
		index_writer.addDocument(document)
		print "Done"
	index_writer.optimize()
	print index_writer.numDocs()
	index_writer.close()
Example #35
    def main(cls, argv):

        if len(argv) != 3:
            print "Usage: T9er <WordNet index dir> <t9 index>"
            return

        for key in cls.keys:
            c = key[0]
            k = key[1:]
            for kc in k:
                cls.keyMap[kc] = c
                print kc, "=", c

        indexDir = argv[1]
        t9dir = argv[2]

        reader = IndexReader.open(indexDir)

        numDocs = reader.maxDoc()
        print "Processing", numDocs, "words"

        writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

        for id in xrange(reader.maxDoc()):
            origDoc = reader.document(id)
            word = origDoc.get("word")
            if word is None or len(word) == 0:
                continue

            newDoc = Document()
            newDoc.add(
                Field("word", word, Field.Store.YES, Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("t9", cls.t9(word), Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("length", str(len(word)), Field.Store.NO,
                      Field.Index.UN_TOKENIZED))
            writer.addDocument(newDoc)
            if id % 100 == 0:
                print "Document", id

        writer.optimize()
        writer.close()

        reader.close()
Example #36
    def index(self):

        dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                               "verbose-index")
        dir = FSDirectory.getDirectory(dirPath, True)
        writer = IndexWriter(dir, SimpleAnalyzer(), True)

        writer.setInfoStream(System.out)

        for i in xrange(100):
            doc = Document()
            doc.add(Field("keyword", "goober",
                             Field.Store.YES, Field.Index.UN_TOKENIZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
Example #37
def lucene_index(input_folder,output_folder):
    '''
    Indexes fresh text data using lucene 3.6.
    Doesn't support incremental generation of index as of now.
    Currently crashes on neo by running out of heap space.
    Arguments: Input folder for text files. output folder for index location 
    Returns: void. The index is stored if generated.
    
    
    '''
    
    # Setting up log file
    logging.basicConfig(filename=os.path.join(output_folder, "lucene_index.log"))
    logger = logging.getLogger(__name__)
    logging.info("Input directory for logging: "+input_folder)
    logging.info("Output directory of index: "+output_folder)
    if  not os.path.isdir(output_folder):
        logger.debug("Making output directory for index: "+ output_folder)
        os.makedirs(output_folder)
    
    # Setting up lucene's heap size for index and version of indexer
    lucene.initVM(initialheap='1024m',maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True, IndexWriter.MaxFieldLength.UNLIMITED)
    
    # Optimization to reduce heap space usage for generation of index. Merges buffer with
    # current index after 15 docs.
    writer.setMergeFactor(15) 
    writer.setRAMBufferSizeMB(32.0)
    
    # Search to find the files to index
    files_to_index = find_files_in_folder(input_folder) 
    for input_file in files_to_index:
        doc = Document()
        content = open(input_file, 'r').read()
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED)) # Do not store text.Only index.
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO)) # Store path to assist in retreiving the file
        writer.addDocument(doc) # Index

    logger.info("Indexed lines from " +input_folder+" (%d documents in index)" % (writer.numDocs()))
    logger.info( "About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize() # Compress index
    logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logger.info("Closed index")
    def index(self):

        dirPath = os.path.join(tempfile.gettempdir(),
                               "verbose-index")
        dir = FSDirectory.open(dirPath)
        writer = IndexWriter(dir, SimpleAnalyzer(), True)

        writer.setInfoStream(InfoStreamOut())

        for i in xrange(100):
            doc = Document()
            doc.add(Field("keyword", "goober",
                             Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
    def setUp(self):

        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc1 = Document()
        doc1.add(Field("field", "the quick brown fox jumped over the lazy dog",
                       Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc1)

        doc2 = Document()
        doc2.add(Field("field", "the fast fox hopped over the hound",
                       Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc2)
        writer.close()

        self.searcher = IndexSearcher(directory, True)
Example #40
    def main(cls, argv):
        
        if len(argv) != 3:
            print "Usage: T9er <WordNet index dir> <t9 index>"
            return
        
        for key in cls.keys:
            c = key[0]
            k = key[1:]
            for kc in k:
                cls.keyMap[kc] = c
                print kc, "=", c

        indexDir = argv[1]
        t9dir = argv[2]

        reader = IndexReader.open(indexDir)

        numDocs = reader.maxDoc()
        print "Processing", numDocs, "words"

        writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

        for id in xrange(reader.maxDoc()):
            origDoc = reader.document(id)
            word = origDoc.get("word")
            if word is None or len(word) == 0:
                continue

            newDoc = Document()
            newDoc.add(Field("word", word,
                             Field.Store.YES, Field.Index.UN_TOKENIZED))
            newDoc.add(Field("t9", cls.t9(word),
                             Field.Store.YES, Field.Index.UN_TOKENIZED))
            newDoc.add(Field("length", str(len(word)),
                             Field.Store.NO, Field.Index.UN_TOKENIZED))
            writer.addDocument(newDoc)
            if id % 100 == 0:
                print "Document", id

        writer.commit()
        writer.close()

        reader.close()
Example #41
def index(string):
 lucene.initVM()
 indexDir = "REMOVEME.index-dir"
 dir = SimpleFSDirectory(File(indexDir))
 analyzer = StandardAnalyzer(Version.LUCENE_30)
 writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

 print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

 doc = Document()
 doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
 writer.addDocument(doc)

 print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
 print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
 writer.optimize()
 print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
 print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
 writer.close()
Example #42
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory.
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
Example #43
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory.
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
Example #44
    def setUp(self):

        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc1 = Document()
        doc1.add(
            Field("field", "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc1)

        doc2 = Document()
        doc2.add(
            Field("field", "the fast fox hopped over the hound",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc2)
        writer.close()

        self.searcher = IndexSearcher(directory, True)
 def update_index_withLine(self,line,x,y,uri):
     """
     Parsed sentence is added to the index, with the corresponding two entities (x,y) and the DBpedia URI
     """
     line=line.replace("\t"," ")
     line = line.replace("\n","  ")
     line = line.replace("   ","  ")
     try:
         writer = IndexWriter(index_directory, analyzer, False, IndexWriter.MaxFieldLength(512))
         doc = Document()
         doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
         doc.add(Field("X", x, Field.Store.YES, Field.Index.ANALYZED))
         doc.add(Field("Y", y, Field.Store.YES, Field.Index.ANALYZED))
         doc.add(Field("URI", uri, Field.Store.YES, Field.Index.ANALYZED))
         writer.addDocument(doc)
         writer.optimize()
         writer.close() 
     except Exception:
         print "Unexpected error:", sys.exc_info()[0]
         raw_input("Error in updating the Sentences")
Example #46
    def addDocuments(self, dir, maxFieldLength):

        writer = IndexWriter(dir, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength(maxFieldLength))
        
        for keyword, unindexed, unstored, text in \
                izip(self.keywords, self.unindexed, self.unstored, self.text):
            doc = Document()
            doc.add(Field("id", keyword,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("country", unindexed,
                          Field.Store.YES, Field.Index.NO))
            doc.add(Field("contents", unstored,
                          Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("city", text,
                          Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
Example #47
    def index(self, doc, title, department, url):
        indexdir = SimpleFSDirectory(File(self.indir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        index_writer = IndexWriter(indexdir, analyzer, self.init,
                                   IndexWriter.MaxFieldLength(512))
        self.init = False

        # Initialize document and index it
        document = Document()
        document.add(
            Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("url", url, Field.Store.YES, Field.Index.ANALYZED))
        document.add(
            Field("department", department, Field.Store.YES,
                  Field.Index.ANALYZED))
        document.add(Field("text", doc, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)

        index_writer.optimize()
        index_writer.close()
Example #48
    def someMethod(self):

        directory = RAMDirectory()

        analyzer = StandardAnalyzer()
        writer = IndexWriter(directory, analyzer, True)

        doc = Document()
        doc.add(Field.Text("title", "This is the title"))
        doc.add(Field.UnStored("contents", "...document contents..."))
        writer.addDocument(doc)

        writer.addDocument(doc, analyzer)

        expression = "some query"

        query = QueryParser.parse(expression, "contents", analyzer)

        parser = QueryParser("contents", analyzer)
        query = parser.parseQuery(expression)
Example #49
    def setUp(self):

        animals = [
            "aardvark", "beaver", "coati", "dog", "elephant", "frog",
            "gila monster", "horse", "iguana", "javelina", "kangaroo", "lemur",
            "moose", "nematode", "orca", "python", "quokka", "rat", "scorpion",
            "tarantula", "uromastyx", "vicuna", "walrus", "xiphias", "yak",
            "zebra"
        ]

        analyzer = WhitespaceAnalyzer()

        aTOmDirectory = RAMDirectory()
        nTOzDirectory = RAMDirectory()

        aTOmWriter = IndexWriter(aTOmDirectory, analyzer, True,
                                 IndexWriter.MaxFieldLength.UNLIMITED)
        nTOzWriter = IndexWriter(nTOzDirectory, analyzer, True,
                                 IndexWriter.MaxFieldLength.UNLIMITED)

        for animal in animals:
            doc = Document()
            doc.add(
                Field("animal", animal, Field.Store.YES,
                      Field.Index.NOT_ANALYZED))

            if animal[0].lower() < "n":
                aTOmWriter.addDocument(doc)
            else:
                nTOzWriter.addDocument(doc)

        aTOmWriter.close()
        nTOzWriter.close()

        self.searchers = [
            IndexSearcher(aTOmDirectory),
            IndexSearcher(nTOzDirectory)
        ]
    def addDocuments(self, dir):

        writer = IndexWriter(dir, self.getAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(self.isCompound())

        for i in xrange(len(self.keywords)):
            doc = Document()
            doc.add(
                Field("id", self.keywords[i], Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(
                Field("country", self.unindexed[i], Field.Store.YES,
                      Field.Index.NO))
            doc.add(
                Field("contents", self.unstored[i], Field.Store.NO,
                      Field.Index.ANALYZED))
            doc.add(
                Field("city", self.text[i], Field.Store.YES,
                      Field.Index.ANALYZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
Example #51
def luceneIndexer(docdir, indir):
    """frpFile
    IndexDocuments from a directory

    para:{
        docdir: the path of the txt file
        indir: the path of the index file which is generated by the following code
        }
    """

    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True, \
                               IndexWriter.MaxFieldLength(512))
    #for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
    list = os.listdir(DIRTOINDEX)
    for i in range(len(list)):
        tfile = os.path.join(DIRTOINDEX, list[i])
        if os.path.isfile(tfile):
            print ("Indexing: ", tfile)
            print ('okokokook')
            document = Document()
            content = open(tfile, 'r').read()
            document.add(Field("text", content, Field.Store.YES, \
                               Field.Index.ANALYZED))
            document.add(Field("title",str(tfile.strip('.txt')),Field.Store.YES,\
                               Field.Index.ANALYZED))
            index_writer.addDocument(document)
            #print (document)
            print ("Done: ", tfile)
    index_writer.optimize()
    print (index_writer.numDocs())
    index_writer.close()
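
A minimal, hypothetical driver for the function above; the two paths are placeholders, and luceneIndexer() calls lucene.initVM() itself:

    if __name__ == '__main__':
        luceneIndexer('/path/to/txt/files', '/path/to/index')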
Example #52
class StatisticsFromCats:
    def __init__(self, pages_xml, cats_xml):
        self._pages = pages_xml
        self._cats = cats_xml
        self._out_xml = None
        self._unique = 0
        self._total_cats = 0
        self._total = 0
        self._total_art = 0
        self._titles = []
        self._analyzer = None
        self._dir = None
        self._writer = None
        self._th=None
        self._dick=None

    # opening our parsed data
    def do_job(self):
         with open('temp_output', 'w') as output:
             print('<?xml version="1.0" ?>\n<root>', file=output)
             with open(self._pages) as page:
                 tree_articles = etree.iterparse(page, tag='data', remove_comments=True, remove_blank_text=True)
                 with open(self._cats) as cats:
                     print('working on it')


                     tree_link = etree.iterparse(cats, tag='category_trunc', remove_blank_text=True)
                     dict_link = {}
                     self._dick = {}
                     full_name_dict_link = {}
                     fast_iter(tree_link, self.gethash, dict_link, full_name_dict_link)
                     self.indexFile()
                     fast_iter(tree_articles, self.findcats, output, dict_link, full_name_dict_link, self._dick)
                     print("<total>" + str(self._total_cats) + "</total>", file=output)
                     print("<unique>" + str(self._unique) + "</unique>", file=output)
                     print("<avg>" + str(self._unique / self._total_art) + "</avg>", file=output)
                     print("<onetimeunique>" + str(len(self._dick)) + "</onetimeunique>", file=output)

                     print("</root>", file=output)
                     self._writer.close()

         # with open('temp_output','r') as output:
         #    output = etree.iterparse(output,tag='data',remove_comments=True)
         #    fast_iter(output,self.createIndexing)


    #getting total of cats
    def get_total(self):
        return str(self._total_cats)

    #getting number of unique categories
    def get_unique(self):
        return str(self._unique)

    #getting average number of unique categories per article
    def get_avg(self):
        return str(self._unique / self._total_art)

    #getting number of one-time-unique categories
    def get_one_time_unique(self):
        return str(len(self._dick))


    #search function
    def find_spec_cat(self, title):
        title = title.replace('\n', '')
        return self.query(title)

    #finding categories in SQL based on id from xml
    def findcats(self, elem, output, dict, dict_all, dict_unique):
        id_atr = elem.get('article_id')
        self._total_art += 1
        unique = 0
        content='XML\n'
        try:
            found = dict[id_atr]
        except KeyError:
            found = []
        self._total += 1
        data = etree.Element('data')
        data.set('article_id', id_atr)
        data.set('title', elem.findtext('title'))
        self._titles.append(elem.findtext('title'))
        for catinarticle in elem.findall('category'):
            self._total_cats += 1
            if catinarticle is not None and catinarticle.text is not None:
                try:
                    content+=catinarticle.text+"\n"
                    trimCat = catinarticle.text.replace(' ', '').replace('\'', '')
                    if trimCat not in found:
                        unique += 1
                        dict_unique[trimCat] = 1
                except etree.XPathEvalError:
                    print(catinarticle.text.replace(' ', '').replace('\'', ''))
            sub = etree.SubElement(data, 'category')
            sub.text = catinarticle.text

        content+='\nSQL\n'
        if found:
            for text in dict_all[id_atr]:
                subdat = etree.SubElement(data, 'categoryDat')
                subdat.text = text
                content+=text+"\n"
            del dict[id_atr]
            del dict_all[id_atr]
        print(etree.tostring(data, encoding='utf-8'), file=output)
        self._unique += unique
        self.addElement(elem.findtext('title'),content)


    def gethash(self, elem, dict, full_name_dict_link):
        id_atr = elem.get('article_id')
        try:
            ret = dict[id_atr]
            retfull = dict[id_atr]
        except KeyError:
            ret = []
            retfull = []
        ret.append(elem.text)
        dict[id_atr] = ret
        full_name_dict_link[id_atr] = retfull
        self._total += 1

    def createIndexing(self,elem):
        title=elem.get('title')
        content='XML\n'
        for catinarticle in elem.findall('category'):
            if catinarticle.text is not None:
                content+=catinarticle.text+'\n'
        content+='\nSQL\n'
        for catinsql in elem.findall('categoryDat'):
            if catinsql.text is not None:
                content+=catinsql.text+'\n'
        self.addElement(title,content)

    def indexFile(self):
        self._th=lucene.initVM()
        self._analyzer = StandardAnalyzer(Version.LUCENE_36)
        self._dir = RAMDirectory()
        self._writer = IndexWriter(self._dir, self._analyzer, True, IndexWriter.MaxFieldLength(25000))



    def addElement(self,title,content):
        self._total_art += 1
        doc = Document()
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("content", content, Field.Store.YES, Field.Index.ANALYZED))
        self._writer.addDocument(doc)



    def query(self,title):
        self._th.attachCurrentThread()
        searcher = IndexSearcher(self._dir)
        query=QueryParser(Version.LUCENE_30, "title", self._analyzer).parse(title)
        total_hits = searcher.search(query, 10)
        for hit in total_hits.scoreDocs:
            doc = (searcher.doc(hit.doc))
            return doc.get("title")+"\n"+doc.get("content")+"--------------------------------"
        return "None"
Example #53
    def index(cls, indexDir, taxoDir):
        """Create an index, and adds to it sample documents and facets.
        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.
        """
        # create and open an index writer
        ver = lucene.Version.LUCENE_35
        config = IndexWriterConfig(ver, WhitespaceAnalyzer(ver))
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        # create and open a taxonomy writer
        taxo = DirectoryTaxonomyWriter(taxoDir,
                                       IndexWriterConfig.OpenMode.CREATE)
        # loop over sample documents
        nDocsAdded = 0
        nFacetsAdded = 0
        for docNum in range(len(docTexts)):
            # obtain the sample facets for current document
            facets = categories[docNum]
            facetList = [createCategoryPath(f) for f in facets]
            # NOTE: setCategoryPaths() requires an Iterable, so need to convert the
            #       Python list in order to pass a proper argument to setCategoryPaths.
            #       We use java.util.Arrays (via JCC) to create a Java List.
            # see http://docs.oracle.com/javase/1.5.0/docs/api/java/util/Arrays.html#asList(T...)
            facetList = lucene.Arrays.asList(facetList)
            # NOTE: we could use lucene.collections here as well in order to convert our
            # Python list to a Java based list using the JavaList class (JavaList implements
            # java.util.List around a Python list instance it wraps):
            #  from lucene.collections import JavaList
            #  facetList = JavaList(facetList)

            # we do not alter indexing parameters
            # a category document builder will add the categories to a document once build() is called
            categoryDocBuilder = CategoryDocumentBuilder(
                taxo).setCategoryPaths(facetList)

            # create a plain Lucene document and add some regular Lucene fields to it
            doc = Document()
            doc.add(
                Field(TITLE, docTitles[docNum], Field.Store.YES,
                      Field.Index.ANALYZED))
            doc.add(
                Field(TEXT, docTexts[docNum], Field.Store.NO,
                      Field.Index.ANALYZED))

            # invoke the category document builder for adding categories to the document and,
            # as required, to the taxonomy index
            categoryDocBuilder.build(doc)
            # finally add the document to the index
            iw.addDocument(doc)
            nDocsAdded += 1
            nFacetsAdded += facetList.size()
        # end for

        # commit changes.
        # we commit changes to the taxonomy index prior to committing them to the search index.
        # this is important, so that all facets referred to by documents in the search index
        # will indeed exist in the taxonomy index.
        taxo.commit()
        iw.commit()

        # close the taxonomy index and the index - all modifications are
        # now safely in the provided directories: indexDir and taxoDir.
        taxo.close()
        iw.close()
        print "Indexed %d documents with overall %d facets." % (nDocsAdded,
                                                                nFacetsAdded)
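
A hedged usage sketch for the facet indexing method above, assuming it lives on a class named FacetExampleIndexer (hypothetical) and that in-memory directories are acceptable for both the main index and the taxonomy:

    indexDir = RAMDirectory()
    taxoDir = RAMDirectory()
    FacetExampleIndexer.index(indexDir, taxoDir)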
    #    print "Max merge docs:", writer.getMaxMergeDocs()
    #    print "Max buffered docs:", writer.getMaxBufferedDocs()

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs(
    )

    i = 0
    print >> sys.stderr, "Reading lines from sys.stdin..."
    for l in sys.stdin:
        i += 1

        if string.strip(l) == "": continue

        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        if i % 10000 == 0:
            print >> sys.stderr, "Read %d lines from stdin (%d documents in index)..." % (
                i, writer.numDocs())
            print >> sys.stderr, stats()
#        if i > 100000: break

    print >> sys.stderr, "Indexed a total of %d lines from stdin (%d documents in index)" % (
        i, writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs(
    )
    print >> sys.stderr, stats()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs(
    )
Example #55
    def main(cls, argv):

        if len(argv) < 2:
            print "Usage: BerkeleyDbIndexer <index dir> -create"
            return

        dbHome = argv[1]
        create = len(argv) > 2 and argv[2] == "-create"

        if not os.path.exists(dbHome):
            os.makedirs(dbHome)
        elif create:
            for name in os.listdir(dbHome):
                if name.startswith('__'):
                    os.remove(os.path.join(dbHome, name))

        env = DBEnv()
        env.set_flags(DB_LOG_INMEMORY, 1)
        if os.name == 'nt':
            env.set_cachesize(0, 0x4000000, 1)
        elif os.name == 'posix':
            from commands import getstatusoutput
            if getstatusoutput('uname') == (0, 'Linux'):
                env.set_cachesize(0, 0x4000000, 1)

        env.open(dbHome, (DB_CREATE | DB_THREAD | DB_INIT_MPOOL | DB_INIT_LOCK
                          | DB_INIT_TXN), 0)

        index = DB(env)
        blocks = DB(env)
        txn = None

        try:
            txn = env.txn_begin(None)
            index.open(filename='__index__',
                       dbtype=DB_BTREE,
                       flags=DB_CREATE | DB_THREAD,
                       txn=txn)
            blocks.open(filename='__blocks__',
                        dbtype=DB_BTREE,
                        flags=DB_CREATE | DB_THREAD,
                        txn=txn)
        except:
            if txn is not None:
                txn.abort()
                txn = None
            raise
        else:
            txn.commit()
            txn = None

        try:
            txn = env.txn_begin(None)
            directory = DbDirectory(txn, index, blocks, 0)
            writer = IndexWriter(directory, StandardAnalyzer(), create,
                                 IndexWriter.MaxFieldLength.UNLIMITED)
            writer.setUseCompoundFile(False)

            doc = Document()
            doc.add(
                Field("contents", "The quick brown fox...", Field.Store.YES,
                      Field.Index.ANALYZED))
            writer.addDocument(doc)

            writer.optimize()
            writer.close()
        except:
            if txn is not None:
                txn.abort()
                txn = None
            raise
        else:
            txn.commit()
            index.close()
            blocks.close()
            env.close()

        print "Indexing Complete"