def addDocuments(self, dir):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    #
    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs
    #

    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def reindex_all(reader, writer, analyzer):
    for i in xrange(reader.maxDoc()):
        if reader.isDeleted(i):
            continue
        doc = reader.document(i)
        p = doc.get("path")
        pkid = doc.get('txtorg_id')
        if p is None:
            # No filepath specified, just use the original document
            writer.updateDocument(Term("txtorg_id", pkid), doc, analyzer)
        else:
            # if a path field is found, try to read the file it points to
            # and add a contents field
            edited_doc = Document()
            for f in doc.getFields():
                edited_doc.add(Field.cast_(f))
            try:
                inf = open(p)
                contents = unicode(inf.read(), 'UTF-8')
                inf.close()
                if len(contents) > 0:
                    edited_doc.add(Field("contents", contents,
                                         Field.Store.NO,
                                         Field.Index.ANALYZED,
                                         Field.TermVector.YES))
                else:
                    print "warning: no content in %s" % p
            except:
                print "Could not read file; skipping"
            writer.updateDocument(Term("txtorg_id", pkid), edited_doc, analyzer)
def luceneIndexer(contents):
    lucene.initVM()
    INDEXIDR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content,
                           Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    #indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True,
                         IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
def indexfeeds(self, writer):
    """ """
    feedlist = ['http://today.reuters.com/rss/topNews',
                'http://today.reuters.com/rss/domesticNews',
                'http://today.reuters.com/rss/wordNews',
                'http://rss.cnn.com/rss/edition.rss',
                'http://rss.cnn.com/rss/edition_word.rss',
                'http://rss.cnn.com/rss/edition_us.rss']
    articletitles = []
    for feed in feedlist:
        f = feedparser.parse(feed)
        for e in f.entries:
            if e.title in articletitles:
                continue
            contents = e.title.encode('utf8') + self.strphtml(e.description.encode('utf8'))
            try:
                doc = Document()
                doc.add(Field("name", e.title,
                              Field.Store.YES, Field.Index.NOT_ANALYZED))
                if len(contents) > 0:
                    doc.add(Field("contents", contents,
                                  Field.Store.YES, Field.Index.ANALYZED,
                                  Field.TermVector.YES))
                writer.addDocument(doc)
            except Exception, e:
                print 'Unable to index'
def run(self):
    env.attachCurrentThread()
    stream = tweetstream.SampleStream("username", "password")
    for tweet in stream:
        try:
            contents = unicode(tweet['text'])
            user_name = tweet['user']['screen_name']
            #print contents
            #print user_name
            doc = Document()
            doc.add(Field("user_name", user_name,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            if len(contents) > 0:
                doc.add(Field("contents", contents,
                              Field.Store.YES, Field.Index.ANALYZED))
            else:
                pass
            self.writer.addDocument(doc)
            # optimize for fast search and commit the changes
            self.writer.optimize()
            self.writer.commit()
        except Exception as e:
            pass
def indexDocs(self, root, writer):
    f = codecs.open('picIndex.txt', 'r', encoding='utf-8')
    picDict = {}
    for line in f.xreadlines():
        ls = line.split('seg^*')
        url = ls[0]
        title = ls[1]
        src = ls[2]
        alt = ls[3]
        picDict[src] = [url, title, alt]
    f.close()
    for src in picDict:
        doc = Document()
        doc.add(Field("src", src,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("url", picDict[src][0],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("title", picDict[src][1],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("alt", picDict[src][2],
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
def index(source, indexName):
    if not os.path.exists(indexName):
        os.mkdir(indexName)
    indexDir = File(indexName)
    writer = IndexWriter(SimpleFSDirectory(File(indexName)),
                         StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    p = re.compile("(GH\d+\-\d+)\n(.*?)\n+", re.DOTALL)
    res = p.findall(source)
    i = 0
    for pair in res:
        i += 1
        doc = Document()
        doc.add(Field("id", pair[0], Field.Store.YES, Field.Index.NO))
        for t in pair[1].split():
            doc.add(Field("content", t.replace("-", "_"),
                          Field.Store.NO, Field.Index.NOT_ANALYZED))
        #doc.add(Field("content", pair[1], Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    print str(i) + " docs indexed"
def main(cls, argv):
    if len(argv) < 5:
        print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
        return

    docsInIndex = int(argv[1])

    # create an index called 'index-dir' in a temp directory
    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                            'index-dir')
    dir = FSDirectory.getDirectory(indexDir, True)
    analyzer = SimpleAnalyzer()
    writer = IndexWriter(dir, analyzer, True)

    # set variables that affect speed of indexing
    writer.setMergeFactor(int(argv[2]))
    writer.setMaxMergeDocs(int(argv[3]))
    writer.setMaxBufferedDocs(int(argv[4]))
    # writer.infoStream = System.out

    print "Merge factor:  ", writer.getMergeFactor()
    print "Max merge docs:", writer.getMaxMergeDocs()
    print "Max buffered docs:", writer.getMaxBufferedDocs()

    start = time()
    for i in xrange(docsInIndex):
        doc = Document()
        doc.add(Field("fieldname", "Bibamus",
                      Field.Store.YES, Field.Index.TOKENIZED))
        writer.addDocument(doc)
    writer.close()
    print "Time: ", timedelta(seconds=time() - start)
def main(cls, argv):
    if len(argv) < 5:
        print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
        return

    docsInIndex = int(argv[1])

    # create an index called 'index-dir' in a temp directory
    indexDir = os.path.join(tempfile.gettempdir(), 'index-dir')
    dir = FSDirectory.open(indexDir)
    analyzer = SimpleAnalyzer()
    writer = IndexWriter(dir, analyzer, True)

    # set variables that affect speed of indexing
    writer.setMergeFactor(int(argv[2]))
    writer.setMaxMergeDocs(int(argv[3]))
    writer.setMaxBufferedDocs(int(argv[4]))
    # writer.infoStream = tempfile.out

    print "Merge factor:  ", writer.getMergeFactor()
    print "Max merge docs:", writer.getMaxMergeDocs()
    print "Max buffered docs:", writer.getMaxBufferedDocs()

    start = time()
    for i in xrange(docsInIndex):
        doc = Document()
        doc.add(Field("fieldname", "Bibamus",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    print "Time: ", timedelta(seconds=time() - start)
def setUp(self):
    animals = ["aardvark", "beaver", "coati",
               "dog", "elephant", "frog", "gila monster",
               "horse", "iguana", "javelina", "kangaroo",
               "lemur", "moose", "nematode", "orca",
               "python", "quokka", "rat", "scorpion",
               "tarantula", "uromastyx", "vicuna",
               "walrus", "xiphias", "yak", "zebra"]

    analyzer = WhitespaceAnalyzer()

    aTOmDirectory = RAMDirectory()
    nTOzDirectory = RAMDirectory()

    aTOmWriter = IndexWriter(aTOmDirectory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
    nTOzWriter = IndexWriter(nTOzDirectory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

    for animal in animals:
        doc = Document()
        doc.add(Field("animal", animal,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        if animal[0].lower() < "n":
            aTOmWriter.addDocument(doc)
        else:
            nTOzWriter.addDocument(doc)

    aTOmWriter.close()
    nTOzWriter.close()

    self.searchers = [IndexSearcher(aTOmDirectory),
                      IndexSearcher(nTOzDirectory)]
def index(string):
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        writer = IndexWriter(dir, analyzer, False,
                             IndexWriter.MaxFieldLength(512))
    except lucene.JavaError:
        #print 'Inside Index Except'
        writer = IndexWriter(dir, analyzer, True,
                             IndexWriter.MaxFieldLength(512))
        #e = sys.exc_info()[0]
        #print e
    #print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    #print 'In the index function'
    #print writer.numDocs()
    #print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    #print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    #print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    #print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    #print 'ending Indexing'
    #print string
    #print 'Total indexes'
    #print writer.numDocs()
    writer.close()
def createIndex():
    # initialize lucene and the jvm
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    # get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)

    # get index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))

    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data = myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def _addDoc(self, text, writer):
    """ function to add documents in the lucene index.
    text fields are indexed by the name "field"
    """
    doc = Document()
    doc.add(Field("field", text, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
def indexDocs(self):
    # from user_location_map
    for root, _, filenames in os.walk(self.directory):
        for filename in filenames:
            try:
                path = os.path.join(root, filename)
                print path
                f = open(path, 'r')
                # every line in the file is a user_location_map entry
                for line in f:
                    data = cjson.decode(line)
                    doc = Document()
                    tw_texts = []
                    num_tweets = {}
                    total_num_tweets = 0.0
                    for i in data['locations']:
                        tweets = [x['tx'] for x in i['tweets']]
                        num_tweets[i['name']] = len(i['tweets'])
                        tw_texts.extend(tweets)
                        total_num_tweets += len(tweets)
                    # tweets indexed as part of the UserMapIndexer
                    doc.add(Field("text", " ".join(tw_texts),
                                  Field.Store.NO, Field.Index.ANALYZED))
                    doc.add(Field("num_tweets", cjson.encode(num_tweets),
                                  Field.Store.YES, Field.Index.NO))
                    doc.add(Field("user", data['user'],
                                  Field.Store.YES, Field.Index.NO))
                    doc.setBoost(total_num_tweets)
                    self.writer.addDocument(doc)
                f.close()
            except Exception, e:
                print "Failed in indexLocations:", e
                print sys.exc_info()[0]
def do_index():
    initVM()
    indexDir = "/home/william/woyaoo/luceneindex"
    version = Version.LUCENE_CURRENT
    standardAnalyzer = StandardAnalyzer(version)
    # chineseAnalyzer = CJKAnalyzer(version)
    engine = data.engine_from_config("indexdb.config")
    # engine = data.engine_from_config()
    db = data.init_datafactory(engine)
    docs = dbfactory.Session().query(doc_model.Doc).filter(doc_model.Doc.dateCreated > "20121220").all()
    print len(docs)
    idxDir = SimpleFSDirectory(File(indexDir))
    perIndexCount = 5000
    writer = IndexWriter(idxDir, standardAnalyzer, True,
                         IndexWriter.MaxFieldLength(512))
    # add fields
    for doc in docs:
        # print repr(doc.description)
        lucenedoc = Document()
        descriptionValue = doc.description.strip("\r\n").encode("UTF-8")
        # descriptionValue = '中国 abc'
        print repr(descriptionValue)
        lucenedoc.add(Field("url", doc.url, Field.Store.YES, Field.Index.NOT_ANALYZED))
        lucenedoc.add(Field("intent", doc.intent, Field.Store.YES, Field.Index.NOT_ANALYZED))
        # lucenedoc.add(Field('description', doc.description, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("description", descriptionValue, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("title", doc.title, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(lucenedoc)
    writer.optimize()
    writer.close()
    print "index finished"
def addDocuments(self, dir, isCompound):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(isCompound)

    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs

    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def index(self, path_to_index, path_files):
    'indexes anchor texts from a given folder'
    #lucene.initVM()
    indexDir = path_to_index
    directory_index = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_35)
    writer = IndexWriter(directory_index, analyzer, True,
                         IndexWriter.MaxFieldLength(512))
    listOfPathes = []
    listOfPathes.extend(glob.glob(path_files + "*.txt"))
    counter = 0
    for path_to_file in listOfPathes:
        print path_to_file
        f = open(path_to_file, "r")
        for line in f:
            entry = line.split("\t")
            counter += 1
            # optimize the index after a certain amount of added documents
            if counter % 500000 == 0:
                print counter
                writer.optimize()
            doc = Document()
            doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("number", entry[3].replace("\n", ""), Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        writer.optimize()
        f.close()
    writer.close()
    print counter
    print "done"
def addContents(self, contents):
    try:
        #iwconfig = IndexWriterConfig(SimpleAnalyzer(), IndexWriter.MaxFieldLength.LIMITED)
        writer = IndexWriter(self.ramIndex,
                             SimpleAnalyzer(Version.LUCENE_CURRENT),
                             True, IndexWriter.MaxFieldLength.LIMITED)
        for content in contents:
            doc = Document()
            doc.add(Field("contents", content[1],
                          Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.YES))
            writer.addDocument(doc)
        writer.close()
    except Exception, e:
        print 'Unable to add content to RAM index'
def setUp(self):
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc = Document()
    doc.add(Field("contents",
                  "The quick brown fox jumps over the lazy dogs",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
def indexSingleFieldDocs(self, fields):
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for field in fields:
        doc = Document()
        doc.add(field)
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def indexSingleFieldDocs(self, fields):
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for field in fields:
        doc = Document()
        doc.add(field)
        writer.addDocument(doc)
    writer.commit()
    writer.close()
def setUp(self):
    # set up sample document
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("field",
                  "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = IndexSearcher(directory)
def indexDocuments():
    # empty index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)

    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer,
                         True, IndexWriter.MaxFieldLength.LIMITED)
    for article in Wikipedia():
        doc = Document()
        doc.add(Field('id', str(article['id'][0]),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('title', article['url'],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', article['text'],
                      Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print 'Optimization'
    writer.optimize()
    writer.close()
def setUp(self):
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()

    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc = Document()
    doc.add(Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    doc = Document()
    doc.add(Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)

    self.quick = SpanTermQuery(Term("f", "quick"))
    self.brown = SpanTermQuery(Term("f", "brown"))
    self.red = SpanTermQuery(Term("f", "red"))
    self.fox = SpanTermQuery(Term("f", "fox"))
    self.lazy = SpanTermQuery(Term("f", "lazy"))
    self.sleepy = SpanTermQuery(Term("f", "sleepy"))
    self.dog = SpanTermQuery(Term("f", "dog"))
    self.cat = SpanTermQuery(Term("f", "cat"))
def luceneIndexer(docdir, indir):
    """
    IndexDocuments from a directory.
    Args:
        docdir: folder containing the documents
        indir: folder where the index is stored
    Returns:
        nothing
    Notes:
        FieldType().setStored = as-is value stored in the Lucene index
        FieldType().setTokenized = field is analyzed using the specified Analyzer - the tokens emitted are indexed
        FieldType().Indexed = the text (either as-is with keyword fields, or the tokens from tokenized fields) is made searchable (aka inverted)
        FieldType().Vectored = term frequency per document is stored in the index in an easily retrievable fashion.
    """
    """# Type 1 attributes: fields that must be searchable and also returned for display, hence setStored(True)
    type1 = FieldType()
    type1.setIndexed(True)
    type1.setStored(True)
    type1.setTokenized(False)
    type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # Type 2 attributes: fields that do not need to be returned for display but still need to be searchable.
    # Here I consider the text content (content) to be of this kind; typical examples are a file's META information.
    type2 = FieldType()
    type2.setIndexed(True)
    type2.setStored(False)
    type2.setTokenized(True)
    type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)"""
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # construct a new index writer with the given analyzer
    index_writer = IndexWriter(indexdir, analyzer, True, IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        #print "Indexing: "
        print "Indexing:", tfile
        document = Document()
        content = open(tfile, 'r').read()
        # how the FieldTypes above would be used
        #doc.add(Field("path", tfile, type1))
        # add a field to the document: {field name: "text", store: YES, index: YES}
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("path", tfile, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def testUpdate(self):
    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
    doc.add(Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("city", "St. Petersburg",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()

    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))
def indexReader(self, indexWriter, reader, uItem, uAttr, uValue, version):
    STORED = Field.Store.YES
    UN_INDEXED = Field.Index.NO
    UN_TOKENIZED = Field.Index.UN_TOKENIZED

    doc = Document()
    doc.add(Field("item", uItem.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("attribute", uAttr.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("value", uValue.str64(), STORED, UN_INDEXED))
    doc.add(Field("version", str(version), STORED, UN_INDEXED))

    reader = StringReader(reader.read())
    doc.add(Field("contents", reader, Field.TermVector.YES))

    indexWriter.addDocument(doc)
def indexFile(self, writer, path):
    try:
        reader = InputStreamReader(FileInputStream(path), 'iso-8859-1')
    except JavaError:
        raise
    else:
        doc = Document()
        doc.add(Field("contents", reader))
        doc.add(Field("filename", os.path.abspath(path),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        reader.close()
        return doc
def setUp(self):
    self.analyzer = WhitespaceAnalyzer()
    self.directory = RAMDirectory()

    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)

    for i in xrange(1, 501):
        doc = Document()
        doc.add(Field("id", NumberUtils.pad(i),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)

    writer.close()
def setUp(self):
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc = Document()
    doc.add(Field("partnum", "Q36",
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("description", "Illidium Space Modulator",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = self.getSearcher()
def indexDocs(self):
    for root, _, filenames in os.walk(self.directory):
        for filename in filenames:
            try:
                path = os.path.join(root, filename)
                print path
                f = open(path, 'r')
                # every line in the file is a user_location_map document to be indexed
                for line in f:
                    data = cjson.decode(line)
                    doc = Document()
                    locations = [x['name'] for x in data['locations']]
                    num_tweets = {}
                    tweets = {}
                    tw_texts = []
                    for i in data['locations']:
                        tweets[i['name']] = [x['tx'] for x in i['tweets']]
                        """
                        Instead of storing into lucene we need to write the
                        tweets to a db. We do that here. That's our
                        supplementary index.
                        try:
                            UserMapIndexer.DB['user_location_tweets'].insert({'sn': data['user'].strip('@'),
                                                                              'l': i['name'],
                                                                              't': tweets[i['name']]})
                        except Exception, e:
                            print "Failed while adding to DB:", len(tweets[i['name']])
                        """
                        tw_texts.extend(tweets[i['name']])
                        num_tweets[i['name']] = len(i['tweets'])
                    """
                    doc.add(Field("loc", " ".join(locations),
                                  Field.Store.NO, Field.Index.ANALYZED))
                    """
                    doc.add(Field("loc", "_".join(locations),
                                  Field.Store.NO, Field.Index.ANALYZED))
                    doc.add(Field("text", " ".join(tw_texts),
                                  Field.Store.NO, Field.Index.ANALYZED))
                    """
                    doc.add(Field("tweets", cjson.encode(tweets),
                                  Field.Store.YES, Field.Index.NO))
                    """
                    doc.add(Field("num_tweets", cjson.encode(num_tweets),
                                  Field.Store.YES, Field.Index.NO))
                    doc.add(Field("user", data['user'],
                                  Field.Store.YES, Field.Index.NO))
                    self.writer.addDocument(doc)
                f.close()
            except Exception, e:
                print "Failed in indexDocs:", e
def indexFile(self, writer, path):
    try:
        file = open(path)
        string = HTMLReader(InputStreamReader(file, 'utf-8')).read()
        file.close()
    except:
        raise
    else:
        doc = Document()
        doc.add(Field("contents", StringReader(string)))
        doc.add(Field("filename", os.path.abspath(path),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        return doc
def write_index(self, workflow, property=False):
    """ adds all keywords in workflow to the index at the specified location.
    types of the keywords can be preserved by setting property=True
    """
    self.ddict = dict()
    # name is used as id in this case
    self.adddd("workflow_id", str(workflow.id))
    self.adddd("text", workflow.id)
    self.adddd("text", workflow.name)
    # this is a key for the workflow
    #adddd("workflow_source", workflow.source)
    #self.adddd("text", workflow.source)
    #adddd("workflow_type", workflow.type)
    self.adddd("text", workflow.type)
    # not very interesting
    #d.add(Field("workflow_version", workflow.version, save, Field.Index.UN_TOKENIZED))
    self.indexAnnotations(workflow.annotations, property)
    for module in workflow.modules:
        self.adddd("module_name" if property else "text", module.name)
        self.adddd("package" if property else "text", module.package)
        # not very interesting
        #d.add(Field("package_version", module.version, save, Field.Index.UN_TOKENIZED))
        self.adddd("module_type" if property else "text", module.type)
        self.indexAnnotations(module.annotations, property)
        for p in module.parameters:
            self.adddd("parameter_name" if property else "text", p.name)
            self.adddd("parameter_value" if property else "text", p.value)
            self.adddd("parameter_type" if property else "text", p.type)
            self.indexAnnotations(p.annotations, property)
    for c in workflow.connections:
        self.adddd("port_name" if property else "text", c.startPort)
        self.adddd("port_name" if property else "text", c.endPort)
        self.indexAnnotations(c.annotations, property)
    d = Document()
    for (k, v) in self.ddict.iteritems():
        d.add(Field(k, v, self.save, Field.Index.TOKENIZED))
    # Delete old versions
    WorkflowIndexer.writer.deleteDocuments([Term('workflow_id', str(workflow.id))])
    # add new
    WorkflowIndexer.writer.addDocument(d)
def Indexer(docdir, indir):
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXDIR = indir
    indexdir = SimpleFSDirectory(File(INDEXDIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done"
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def index(self):
    dirPath = os.path.join(tempfile.gettempdir(), "verbose-index")
    dir = FSDirectory.open(dirPath)
    writer = IndexWriter(dir, SimpleAnalyzer(), True)

    writer.setInfoStream(InfoStreamOut())

    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def lucene_index(input_folder, output_folder):
    '''
    Indexes fresh text data using lucene 3.6.
    Doesn't support incremental generation of the index as of now.
    Currently crashes on neo by running out of heap space.
    Arguments: input folder for text files,
               output folder for the index location.
    Returns: void. The index is stored if generated.
    '''
    # Setting up log file
    logging.basicConfig(filename=os.path.join(output_folder, "lucene_index.log"))
    logging.info("Input directory for logging: " + input_folder)
    logging.info("Output directory of index: " + output_folder)
    if not os.path.isdir(output_folder):
        logging.debug("Making output directory for index: " + output_folder)
        os.makedirs(output_folder)

    # Setting up lucene's heap size for the index and the version of the indexer
    lucene.initVM(initialheap='1024m', maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # Optimization to reduce heap space usage for generation of the index.
    # Merges the buffer with the current index after 15 docs.
    writer.setMergeFactor(15)
    writer.setRAMBufferSizeMB(32.0)

    # Search to find the files to index
    files_to_index = find_files_in_folder(input_folder)
    for input_file in files_to_index:
        doc = Document()
        content = open(input_file, 'r').read()
        # Do not store the text. Only index.
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED))
        # Store the path to assist in retrieving the file
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO))
        # Index
        writer.addDocument(doc)

    logging.info("Indexed lines from " + input_folder + " (%d documents in index)" % writer.numDocs())
    logging.info("About to optimize index of %d documents..." % writer.numDocs())
    # Compress index
    writer.optimize()
    logging.info("...done optimizing index of %d documents" % writer.numDocs())
    logging.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logging.info("Closed index")
def index(self):
    dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                           "verbose-index")
    dir = FSDirectory.getDirectory(dirPath, True)
    writer = IndexWriter(dir, SimpleAnalyzer(), True)

    writer.setInfoStream(System.out)

    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def indexValue(self, indexWriter, value, uItem, uAttr, uValue, version):
    STORED = Field.Store.YES
    UN_STORED = Field.Store.NO
    TOKENIZED = Field.Index.TOKENIZED
    UN_INDEXED = Field.Index.NO
    UN_TOKENIZED = Field.Index.UN_TOKENIZED

    doc = Document()
    doc.add(Field("item", uItem.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("attribute", uAttr.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("value", uValue.str64(), STORED, UN_INDEXED))
    doc.add(Field("version", str(version), STORED, UN_INDEXED))
    doc.add(Field("contents", value, UN_STORED, TOKENIZED,
                  Field.TermVector.YES))

    indexWriter.addDocument(doc)
def update_index_withLineArray(self, array):
    """
    Parsed sentences (given in an array) are added to the index, with the
    corresponding two entities (x, y) and the DBpedia URI
    """
    print "start adding sentences"
    writer = IndexWriter(index_directory, analyzer, False,
                         IndexWriter.MaxFieldLength(512))
    for item in array:
        line = item[0]
        x = item[1]
        y = item[2]
        uri = item[3]
        line = line.replace("\t", " ")
        line = line.replace("\n", " ")
        line = line.replace(" ", " ")
        try:
            doc = Document()
            doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("X", x, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("Y", y, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("URI", uri, Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        except Exception:
            print "Unexpected error:", sys.exc_info()[0]
            raw_input("Error in updating the Sentences")
    try:
        writer.optimize()
    except:
        print "Unexpected error:", sys.exc_info()[0]
        print "could not optimize index"
    writer.close()
    print "all sentences added"
def run(self):
    env.attachCurrentThread()
    #for self.folder, dirnames, filenames in os.walk(self.folder):
    #    for filename in filenames:
    #        print filename
    #        path = os.path.join(self.folder, filename)
    filename = 'tweets.gz'
    for crowdInstance in iterateCrowdInstances(filename):
        print crowdInstance
        contents = unicode(crowdInstance['text'])
        crowd_id = crowdInstance['crowd_id']
        doc = Document()
        doc.add(Field("name", filename,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        #doc.add(Field("path", path, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("crowd_id", crowd_id,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        if len(contents) > 0:
            doc.add(Field("contents", contents,
                          Field.Store.YES, Field.Index.ANALYZED))
        else:
            pass
        self.writer.addDocument(doc)
    # optimize for fast search and commit the changes
    self.writer.optimize()
    self.writer.commit()
def indexFile(self, writer, path):
    try:
        file = mod_gzip.GzipFile(filename=path, mode='rb', compresslevel=2)
        url, title, HTMLcontents = RepoReader(InputStreamReader(file, 'utf-8')).read()
        file.close()
    except:
        raise
    else:
        #try:
        #    title = getTitle(HTMLcontents)
        #except:
        #    title = ""
        #if (title == None):
        #    title = ""
        doc = Document()
        doc.add(Field("contents", HTMLcontents, Field.Store.NO, Field.Index.TOKENIZED))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.NO))
        doc.add(Field("url", url, Field.Store.YES, Field.Index.NO))
        writer.addDocument(doc)
        return doc
def _IndexField(field_list, content):
    i = 0
    doc = Document()
    while i < len(field_list):
        if field_list[i]['StringField'] is not False:
            _IndexStringField(doc, field_list[i]['FieldName'], content[i])
        elif field_list[i]['NumericField'] is not False:
            _IndexNumericField(doc, field_list[i]['FieldName'], content[i])
        i += 1
    return doc
def addDocuments(self, _id, title, content):
    doc = Document()
    doc.add(Field("id", _id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    if title is not None and len(title) > 0:
        doc.add(Field("titleKeyword", title,
                      Field.Store.NO, Field.Index.ANALYZED))
    if content is not None and len(content) > 0:
        doc.add(Field("contentKeyword", content,
                      Field.Store.NO, Field.Index.ANALYZED))
    self.index_writer.addDocument(doc)
def luceneIndexer(docdir, indir):
    """
    IndexDocuments from a directory
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content,
                           Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def addPoint(self, writer, name, type, x, y):
    doc = Document()
    doc.add(Field("name", name,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("type", type,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("x", str(x),
                  Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS))
    doc.add(Field("y", str(y),
                  Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS))
    writer.addDocument(doc)
def index(self, doc, title, department, url):
    indexdir = SimpleFSDirectory(File(self.indir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, self.init,
                               IndexWriter.MaxFieldLength(512))
    self.init = False

    # Initialize document and index it
    document = Document()
    document.add(Field("title", title,
                       Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("url", url,
                       Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("department", department,
                       Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("text", doc,
                       Field.Store.YES, Field.Index.ANALYZED))
    index_writer.addDocument(document)

    index_writer.optimize()
    index_writer.close()
def addDocuments(self, dir, maxFieldLength):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength(maxFieldLength))
    for keyword, unindexed, unstored, text in \
            izip(self.keywords, self.unindexed, self.unstored, self.text):
        doc = Document()
        doc.add(Field("id", keyword,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", unindexed,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", unstored,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", text,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def addCrowd(self, id, text):
    doc = Document()
    doc.add(Field(CrowdFields.id, id,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field(CrowdFields.text, text,
                  Field.Store.YES, Field.Index.ANALYZED))
    self.writer.updateDocument(Term(CrowdFields.id, id), doc)
def addDocuments(self, dir):
    writer = IndexWriter(dir, self.getAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(self.isCompound())
    for i in xrange(len(self.keywords)):
        doc = Document()
        doc.add(Field("id", self.keywords[i],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", self.unindexed[i],
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", self.unstored[i],
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", self.text[i],
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def main(cls, argv):
    if len(argv) != 3:
        print "Usage: T9er <WordNet index dir> <t9 index>"
        return

    for key in cls.keys:
        c = key[0]
        k = key[1:]
        for kc in k:
            cls.keyMap[kc] = c
            print kc, "=", c

    indexDir = argv[1]
    t9dir = argv[2]

    reader = IndexReader.open(indexDir)
    numDocs = reader.maxDoc()
    print "Processing", numDocs, "words"

    writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

    for id in xrange(reader.maxDoc()):
        origDoc = reader.document(id)
        word = origDoc.get("word")
        if word is None or len(word) == 0:
            continue

        newDoc = Document()
        newDoc.add(Field("word", word,
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("t9", cls.t9(word),
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("length", str(len(word)),
                         Field.Store.NO, Field.Index.UN_TOKENIZED))
        writer.addDocument(newDoc)
        if id % 100 == 0:
            print "Document", id

    writer.optimize()
    writer.close()
    reader.close()
def parse_file(file_path, writer):
    f = open(file_path, 'r')
    soup = BeautifulSoup(f.read())
    f.close()

    doc = Document()
    content_tags = ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    content = ""
    for tag in content_tags:
        matches = soup.find_all(tag)
        for match in matches:
            if match.string:
                content += match.string + " "
    afinn_score = afinn.sentiment(content)

    doc.add(Field("filepath", file_path, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("content", content, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("sentiment", str(afinn_score),
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
def setUp(self):
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc1 = Document()
    doc1.add(Field("field", "the quick brown fox jumped over the lazy dog",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc1)

    doc2 = Document()
    doc2.add(Field("field", "the fast fox hopped over the hound",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc2)
    writer.close()

    self.searcher = IndexSearcher(directory, True)