def getIndexWriter(self):
    """Return an IndexWriter opened on this container's index.

    The writer is built over a DbDirectory bound to the store's current
    transaction, is opened with the create flag set to False, and has
    compound-file output switched off before it is handed back.
    """
    directory = DbDirectory(self.store.txn, self._db,
                            self.store._blocks._db, self._flags)

    # Third argument False: open the index without (re)creating it.
    indexWriter = IndexWriter(directory, StandardAnalyzer(), False)
    indexWriter.setUseCompoundFile(False)

    return indexWriter
def open(self, name, txn, **kwds):
    """Open the container and, if asked to create it, set up its index.

    The call is first delegated to the parent class.  When the ``create``
    keyword is present and true, an IndexWriter is opened with its create
    flag set over this container's DbDirectory and closed immediately.
    """
    super(IndexContainer, self).open(name, txn, **kwds)

    if not kwds.get('create', False):
        return

    # Open with create=True and close straight away; nothing is added.
    freshIndex = IndexWriter(
        DbDirectory(txn, self._db, self.store._blocks._db, self._flags),
        StandardAnalyzer(),
        True)
    freshIndex.close()
def __init__(self, directory, shaManager, app='futil'):
    """Set up the index writer, SHA store, logger and URI loader.

    directory  -- location of the Lucene index
    shaManager -- object kept as ``self.shaBBDD``
    app        -- name handed to FutilLogger (defaults to 'futil')
    """
    self._directory = directory

    # Only ask Lucene to create the index when none exists there yet.
    indexMissing = not IndexReader.indexExists(directory)
    self._writer = IndexWriter(directory, StandardAnalyzer(), indexMissing)

    self.shaBBDD = shaManager

    log = FutilLogger(app)
    self.logger = log
    self.uriLoader = UriLoader(logger=log)

    self.resetCounter()
class IndexAppService(Indexer):
    """Index FOAF records with Lucene and record their URI/SHA pairs.

    Every indexed record is added to a Lucene IndexWriter; the SHA values
    of the record and of its friends are passed to the SHA manager.  The
    index is optimized once every 1000 insertions.
    """

    def __init__(self, directory, shaManager, app='futil'):
        """Open (creating it if missing) the index found at ``directory``.

        directory  -- location of the Lucene index
        shaManager -- object offering insertUriSha(uri, sha) and close()
        app        -- name handed to FutilLogger (defaults to 'futil')
        """
        self._directory = directory
        # Create a new index only when none exists at that location yet.
        create = not IndexReader.indexExists(self._directory)
        self._writer = IndexWriter(self._directory, StandardAnalyzer(),
                                   create)
        self.shaBBDD = shaManager
        self.logger = FutilLogger(app)
        self.uriLoader = UriLoader(logger=self.logger)
        self.resetCounter()

    def resetCounter(self):
        """Restart the countdown to the next index optimization."""
        self.counter = 1000

    def countInsertion(self):
        """Account for one insertion; optimize when the countdown hits 0."""
        self.counter -= 1
        if self.counter == 0:
            self.resetCounter()
            self._writer.optimize()

    def indexFOAF(self, foaf):
        """Index one FOAF record and store its URI/SHA associations.

        ``foaf`` is a mapping; the keys read here are 'uri', 'sha' and
        'friends' (the last holding (sha, uri) pairs).

        Returns the URIs of all friends listed in the record, including
        those whose SHA is empty, or [] when there is no 'friends' key.
        """
        document = FoafDocumentFactory.getDocumentFromFOAF(foaf)
        self._writer.addDocument(document)
        self.countInsertion()

        if foaf.has_key('sha'):
            for sha in foaf['sha']:
                self.shaBBDD.insertUriSha(foaf['uri'][0], sha)

        if foaf.has_key('friends'):
            for friend in foaf['friends']:
                # Friends without a SHA are not recorded in the SHA store.
                if friend[0] == '':
                    continue
                friendSha, friendUri = friend
                self.shaBBDD.insertUriSha(friendUri, friendSha)
            return [u for (v, u) in foaf['friends']]

        return []

    def indexFOAFUri(self, foafUri):
        """Fetch the FOAF record at ``foafUri`` and index it.

        Best effort: any ordinary error is logged and [] is returned.
        Otherwise the result of indexFOAF() is returned.
        """
        try:
            f = self.uriLoader.getFoafFrom(foafUri)
            return self.indexFOAF(f)
        except (KeyboardInterrupt, SystemExit):
            # A bare except used to swallow these, so the process could
            # not be interrupted or shut down while indexing.
            raise
        except Exception:
            self.logger.info("Unknow error indexing " + foafUri)
            return []

    def close(self):
        """Close the index writer (if still open) and the SHA manager."""
        try:
            if self._writer:
                self._writer.close()
                self._writer = None
        finally:
            # Release the SHA store even when closing the writer fails.
            self.shaBBDD.close()
try: doc = Document() doc.add(Field("span_id", span_id, Field.Store.YES, Field.Index.UN_TOKENIZED)) doc.add(Field("pmid", pmid, Field.Store.YES, Field.Index.UN_TOKENIZED)) doc.add(Field("text", span_text, Field.Store.YES, Field.Index.TOKENIZED)) addAnnotations(doc, span_id) writer.addDocument(doc) except Exception, e: sys.stderr.write("error: %s pmid: %s span_id: %s\n" % (e, pmid, span_id)) i += 2 if __name__ == '__main__': if len(sys.argv) == 1: print "Usage: python index_spans.py data_norm index_dir annotation_files" else: (data_norm, index_dir, annotation_files) = \ (sys.argv[1], sys.argv[2], sys.argv[3:]) print "Loading annotations ..." load(annotation_files) print "Making the index ..." writer = IndexWriter(index_dir, StandardAnalyzer(), True) writer.setMaxFieldLength(7 * 1000 * 1000 * 10) indexData(data_norm) print "Optimizing index ..." writer.optimize() print "Indexing complete" writer.close()
#import PyLucene from mailbox import UnixMailbox from PyLucene import StandardAnalyzer, FSDirectory, IndexWriter from email_loader import EmailDoc import os,sys,datetime,email,config # determine when (if) the last update was lastUp = datetime.datetime(2000, 1, 1) createNewIndex = True quiet = False if len(sys.argv) > 1 and sys.argv[1] in ('-q', '--quiet'): quiet = True store = FSDirectory.getDirectory( config.DB_PATH, True ) writer = IndexWriter( store, StandardAnalyzer(), True ) """ mailbox = UnixMailbox( open('chipy.mbox') ) while True: msg = mailbox.next() if msg == None: break writer.addDocument( EmailDoc(msg) ) """ source=config.MAILDIR_ROOT_DIR for root, dirs, files in os.walk(source): if not quiet: sys.stdout.write('\nindexing files in %s' % root) sys.stdout.flush()
#!/usr/bin/env python2.4
# Build a Lucene index in 'chipy-index' from the messages in 'chipy.mbox'.
from mailbox import UnixMailbox
from PyLucene import StandardAnalyzer, FSDirectory, IndexWriter
from email import EmailDoc

store = FSDirectory.getDirectory('chipy-index', True)
writer = IndexWriter(store, StandardAnalyzer(), True)
try:
    mbox_file = open('chipy.mbox')
    try:
        mailbox = UnixMailbox(mbox_file)
        while True:
            msg = mailbox.next()
            # next() yields None once the mailbox is exhausted.
            if msg is None:
                break
            writer.addDocument(EmailDoc(msg))
    finally:
        # The mbox file handle used to be left open.
        mbox_file.close()
finally:
    # Close the writer even when reading or indexing a message fails.
    writer.close()
#!/usr/bin/env python2.4
# Build a Lucene index in 'chipy-index' from the messages in 'chipy.mbox'.
from mailbox import UnixMailbox
from PyLucene import StandardAnalyzer, FSDirectory, IndexWriter
from email import EmailDoc

store = FSDirectory.getDirectory( 'chipy-index', True )
writer = IndexWriter( store, StandardAnalyzer(), True )
try:
    mbox_file = open('chipy.mbox')
    try:
        mailbox = UnixMailbox( mbox_file )
        while True:
            msg = mailbox.next()
            # next() yields None once the mailbox is exhausted.
            if msg is None:
                break
            writer.addDocument( EmailDoc(msg) )
    finally:
        # The mbox file handle used to be left open.
        mbox_file.close()
finally:
    # Close the writer even when reading or indexing a message fails.
    writer.close()
Field.Index.UN_TOKENIZED)) doc.add( Field("pmid", pmid, Field.Store.YES, Field.Index.UN_TOKENIZED)) doc.add( Field("text", span_text, Field.Store.YES, Field.Index.TOKENIZED)) addAnnotations(doc, span_id) writer.addDocument(doc) except Exception, e: sys.stderr.write("error: %s pmid: %s span_id: %s\n" % (e, pmid, span_id)) i += 2 if __name__ == '__main__': if len(sys.argv) == 1: print "Usage: python index_spans.py data_norm index_dir annotation_files" else: (data_norm, index_dir, annotation_files) = \ (sys.argv[1], sys.argv[2], sys.argv[3:]) print "Loading annotations ..." load(annotation_files) print "Making the index ..." writer = IndexWriter(index_dir, StandardAnalyzer(), True) writer.setMaxFieldLength(7 * 1000 * 1000 * 10) indexData(data_norm) print "Optimizing index ..." writer.optimize() print "Indexing complete" writer.close()