def setUp(self):

    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    # Elwood
    document = Document()
    document.add(Field("owner", "elwood",
                       Field.Store.YES, Field.Index.NOT_ANALYZED))
    document.add(Field("keywords", "elwoods sensitive info",
                       Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(document)

    # Jake
    document = Document()
    document.add(Field("owner", "jake",
                       Field.Store.YES, Field.Index.NOT_ANALYZED))
    document.add(Field("keywords", "jakes sensitive info",
                       Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(document)

    writer.close()
def setUp(self):

    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()

    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc = Document()
    doc.add(Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    doc = Document()
    doc.add(Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    writer.close()

    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)

    self.quick = SpanTermQuery(Term("f", "quick"))
    self.brown = SpanTermQuery(Term("f", "brown"))
    self.red = SpanTermQuery(Term("f", "red"))
    self.fox = SpanTermQuery(Term("f", "fox"))
    self.lazy = SpanTermQuery(Term("f", "lazy"))
    self.sleepy = SpanTermQuery(Term("f", "sleepy"))
    self.dog = SpanTermQuery(Term("f", "dog"))
    self.cat = SpanTermQuery(Term("f", "cat"))
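# Hedged sketch (not part of the original tests): one way the span terms defined
# in the setUp above might be combined with SpanNearQuery. The slop of 1 and the
# in-order flag are illustrative choices, and the Lucene 3.x search API is assumed.
def testQuickBrownFoxSpanNear(self):
    near = SpanNearQuery([self.quick, self.brown, self.fox], 1, True)
    topDocs = self.searcher.search(near, 10)
    self.assertEqual(1, topDocs.totalHits)   # only "the quick brown fox ..." matches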
def addDocuments(self, dir):

    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    #
    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs
    #

    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def configure_lucene():

    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'

    indexDir = "/tmp/luceneindex"   # must be defined; it is used just below
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from clique.txt..."

    for line in f:
        # strip tabs, carriage returns, newlines and '^' markers before indexing
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()

        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from clique.txt (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
def testUpdate(self):

    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
    doc.add(Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("city", "St. Petersburg",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()

    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))
def indexDocuments():

    # empty index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)

    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)

    for article in Wikipedia():
        doc = Document()
        doc.add(Field('id', str(article['id'][0]),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('title', article['url'],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', article['text'],
                      Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print 'Optimization'
    writer.optimize()
    writer.close()
def main(cls, argv):

    if len(argv) < 5:
        print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
        return

    docsInIndex = int(argv[1])

    # create an index called 'index-dir' in a temp directory
    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                            'index-dir')
    dir = FSDirectory.getDirectory(indexDir, True)
    analyzer = SimpleAnalyzer()
    writer = IndexWriter(dir, analyzer, True)

    # set variables that affect speed of indexing
    writer.setMergeFactor(int(argv[2]))
    writer.setMaxMergeDocs(int(argv[3]))
    writer.setMaxBufferedDocs(int(argv[4]))
    # writer.infoStream = System.out

    print "Merge factor: ", writer.getMergeFactor()
    print "Max merge docs:", writer.getMaxMergeDocs()
    print "Max buffered docs:", writer.getMaxBufferedDocs()

    start = time()
    for i in xrange(docsInIndex):
        doc = Document()
        doc.add(Field("fieldname", "Bibamus",
                      Field.Store.YES, Field.Index.TOKENIZED))
        writer.addDocument(doc)

    writer.close()
    print "Time: ", timedelta(seconds=time() - start)
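# Illustrative invocation of the tuning demo above; the argument values are
# arbitrary examples, not recommended settings:
#   python IndexTuningDemo.py 100000 10 9999999 100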
def createIndex():

    # initialize lucene and jvm
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    # get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)

    # get index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))

    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data = myfile.read()
        i += 1

        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def luceneIndexer(contents):

    lucene.initVM()

    INDEXIDR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))

    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content,
                           Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile

    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def indexDocs(self, root, writer):

    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                for line in file:
                    doc = Document()
                    arr = line.split('\t')

                    field = Field("name", arr[2].lower(),
                                  Field.Store.YES, Field.Index.TOKENIZED)
                    field.setBoost(1.5)
                    doc.add(field)

                    doc.add(Field("alternate_names", arr[3].lower(),
                                  Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("state", arr[10].lower(),
                                  Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("population", arr[14],
                                  Field.Store.YES, Field.Index.UN_TOKENIZED))

                    if int(arr[14]) > 1000000:
                        doc.setBoost(1.2)

                    writer.addDocument(doc)
                file.close()
            except Exception, e:
                print "Failed in indexDocs:", e
def indexDocs(self, root, writer):

    f = codecs.open('picIndex.txt', 'r', encoding='utf-8')
    picDict = {}
    for line in f.xreadlines():
        ls = line.split('seg^*')
        url = ls[0]
        title = ls[1]
        src = ls[2]
        alt = ls[3]
        picDict[src] = [url, title, alt]
    f.close()

    for src in picDict:
        doc = Document()
        doc.add(Field("src", src,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("url", picDict[src][0],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("title", picDict[src][1],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("alt", picDict[src][2],
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
def _addDoc(self, text, writer):
    """
    Add a document to the Lucene index; the text is stored and
    analyzed under the field name "field".
    """
    doc = Document()
    doc.add(Field("field", text, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
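# Hedged usage sketch for _addDoc above; the RAMDirectory, WhitespaceAnalyzer and
# sample texts are assumptions for illustration, mirroring the other snippets here.
def _buildSampleIndex(self):
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for text in ["first sample document", "second sample document"]:
        self._addDoc(text, writer)      # each string is indexed under "field"
    writer.close()
    return directory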
def addCrowd(self, id, text):

    doc = Document()
    doc.add(Field(CrowdFields.id, id,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field(CrowdFields.text, text,
                  Field.Store.YES, Field.Index.ANALYZED))

    # updateDocument deletes any existing document matching this id term,
    # then adds the new document
    self.writer.updateDocument(Term(CrowdFields.id, id), doc)
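# Hedged sketch of the upsert behaviour of addCrowd: because updateDocument is keyed
# on Term(CrowdFields.id, id), indexing the same id twice should leave one document.
# The commit/searcher calls assume the Lucene 3.x API used elsewhere in this file.
def crowdUpsertExample(self):
    self.addCrowd("42", "first version of the text")
    self.addCrowd("42", "revised version of the text")   # replaces the earlier doc
    self.writer.commit()
    searcher = IndexSearcher(self.writer.getDirectory(), True)
    hits = searcher.search(TermQuery(Term(CrowdFields.id, "42")), 10)
    print hits.totalHits    # expected: 1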
def _IndexField(field_list, content):

    doc = Document()
    for i, field in enumerate(field_list):
        if field['StringField'] is not False:
            _IndexStringField(doc, field['FieldName'], content[i])
        elif field['NumericField'] is not False:
            _IndexNumericField(doc, field['FieldName'], content[i])

    return doc
def setUp(self):

    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc1 = Document()
    doc1.add(Field("field", "the quick brown fox jumped over the lazy dog",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc1)

    doc2 = Document()
    doc2.add(Field("field", "the fast fox hopped over the hound",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc2)

    writer.close()

    self.searcher = IndexSearcher(directory, True)
def addDocuments(self, _id, title, content):

    doc = Document()
    doc.add(Field("id", _id, Field.Store.YES, Field.Index.NOT_ANALYZED))

    if title is not None and len(title) > 0:
        doc.add(Field("titleKeyword", title,
                      Field.Store.NO, Field.Index.ANALYZED))
    if content is not None and len(content) > 0:
        doc.add(Field("contentKeyword", content,
                      Field.Store.NO, Field.Index.ANALYZED))

    self.index_writer.addDocument(doc)
def indexSingleFieldDocs(self, fields):

    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    for field in fields:
        doc = Document()
        doc.add(field)
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
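# Hedged usage sketch for indexSingleFieldDocs above: each Field becomes its own
# single-field document. The field names and values below are purely illustrative.
def indexKeywordDocs(self):
    self.indexSingleFieldDocs([
        Field("keyword", "lucene", Field.Store.YES, Field.Index.NOT_ANALYZED),
        Field("keyword", "search", Field.Store.YES, Field.Index.NOT_ANALYZED),
    ])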
def setUp(self):

    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc = Document()
    doc.add(Field("contents",
                  "The quick brown fox jumps over the lazy dogs",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
def indexFile(dir, filename):

    path = os.path.join(dir, filename)
    print " File: ", filename

    if filename.endswith('.gz'):
        child = Popen('gunzip -c ' + path +
                      ' | groff -t -e -E -mandoc -Tascii | col -bx',
                      shell=True, stdout=PIPE,
                      cwd=os.path.dirname(dir)).stdout
        command, section = re.search('^(.*)\.(.*)\.gz$', filename).groups()
    else:
        child = Popen('groff -t -e -E -mandoc -Tascii ' + path + ' | col -bx',
                      shell=True, stdout=PIPE,
                      cwd=os.path.dirname(dir)).stdout
        command, section = re.search('^(.*)\.(.*)$', filename).groups()

    data = child.read()
    err = child.close()
    if err:
        raise RuntimeError, '%s failed with exit code %d' % (command, err)

    matches = re.search('^NAME$(.*?)^\S', data,
                        re.MULTILINE | re.DOTALL)
    name = matches and matches.group(1) or ''

    matches = re.search('^(?:SYNOPSIS|SYNOPSYS)$(.*?)^\S', data,
                        re.MULTILINE | re.DOTALL)
    synopsis = matches and matches.group(1) or ''

    matches = re.search('^(?:DESCRIPTION|OVERVIEW)$(.*?)', data,
                        re.MULTILINE | re.DOTALL)
    description = matches and matches.group(1) or ''

    doc = Document()
    doc.add(Field("command", command,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("section", section,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("name", name.strip(),
                  Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("synopsis", synopsis.strip(),
                  Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("keywords", ' '.join((command, name, synopsis, description)),
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("filename", os.path.abspath(path),
                  Field.Store.YES, Field.Index.NOT_ANALYZED))

    writer.addDocument(doc)
def addPoint(self, writer, name, type, x, y):

    doc = Document()
    doc.add(Field("name", name,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("type", type,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("x", str(x),
                  Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS))
    doc.add(Field("y", str(y),
                  Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS))

    writer.addDocument(doc)
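# Hedged usage sketch for addPoint above: builds a small in-memory index of named
# points. The directory, analyzer, and point data are assumptions for illustration.
def indexPoints(self):
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    self.addPoint(writer, "north cafe", "restaurant", 1, 2)
    self.addPoint(writer, "south cafe", "restaurant", 5, 9)
    writer.close()
    return directory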
def setUp(self):

    # set up sample document
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("field", "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = IndexSearcher(directory)
def setUp(self):

    self.analyzer = WhitespaceAnalyzer()
    self.directory = RAMDirectory()

    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)

    for i in xrange(1, 501):
        doc = Document()
        doc.add(Field("id", NumberUtils.pad(i),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)

    writer.close()
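# Hedged sketch: because the ids above are zero-padded by NumberUtils.pad, a
# lexicographic TermRangeQuery behaves like a numeric range. The bounds and the
# expected hit count below are illustrative, not taken from the original test.
def testIdRange(self):
    searcher = IndexSearcher(self.directory, True)
    query = TermRangeQuery("id", NumberUtils.pad(37), NumberUtils.pad(346),
                           True, True)
    topDocs = searcher.search(query, 1000)
    self.assertEqual(310, topDocs.totalHits)   # 37..346 inclusive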
def indexReader(self, indexWriter, reader, uItem, uAttr, uValue, version):

    STORED = Field.Store.YES
    UN_INDEXED = Field.Index.NO
    UN_TOKENIZED = Field.Index.UN_TOKENIZED

    doc = Document()
    doc.add(Field("item", uItem.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("attribute", uAttr.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("value", uValue.str64(), STORED, UN_INDEXED))
    doc.add(Field("version", str(version), STORED, UN_INDEXED))

    reader = StringReader(reader.read())
    doc.add(Field("contents", reader, Field.TermVector.YES))

    indexWriter.addDocument(doc)
def setUp(self):

    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc = Document()
    doc.add(Field("partnum", "Q36",
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("description", "Illidium Space Modulator",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = IndexSearcher(self.directory, True)
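# Hedged companion to the setUp above: "partnum" is indexed NOT_ANALYZED, so an
# exact TermQuery matches it reliably, whereas SimpleAnalyzer would lower-case and
# drop the digits from "Q36" if the query text were run through query parsing.
# A minimal sketch:
def testExactPartNumber(self):
    query = TermQuery(Term("partnum", "Q36"))
    topDocs = self.searcher.search(query, 10)
    self.assertEqual(1, topDocs.totalHits)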
def write_index(self, workflow, property=False):
    """
    Adds all keywords in the workflow to the index at the specified location.
    Types of the keywords can be preserved by setting property=True.
    """
    self.ddict = dict()

    # name is used as id in this case
    self.adddd("workflow_id", str(workflow.id))
    self.adddd("text", workflow.id)
    self.adddd("text", workflow.name)   # this is a key for the workflow

    #adddd("workflow_source", workflow.source)
    #self.adddd("text", workflow.source)

    #adddd("workflow_type", workflow.type)
    self.adddd("text", workflow.type)

    # not very interesting
    #d.add( Field("workflow_version", workflow.version, save, Field.Index.UN_TOKENIZED))

    self.indexAnnotations(workflow.annotations, property)

    for module in workflow.modules:
        self.adddd("module_name" if property else "text", module.name)
        self.adddd("package" if property else "text", module.package)
        # not very interesting
        #d.add( Field("package_version", module.version, save, Field.Index.UN_TOKENIZED))
        self.adddd("module_type" if property else "text", module.type)
        self.indexAnnotations(module.annotations, property)
        for p in module.parameters:
            self.adddd("parameter_name" if property else "text", p.name)
            self.adddd("parameter_value" if property else "text", p.value)
            self.adddd("parameter_type" if property else "text", p.type)
            self.indexAnnotations(p.annotations, property)

    for c in workflow.connections:
        self.adddd("port_name" if property else "text", c.startPort)
        self.adddd("port_name" if property else "text", c.endPort)
        self.indexAnnotations(c.annotations, property)

    d = Document()
    for (k, v) in self.ddict.iteritems():
        d.add(Field(k, v, self.save, Field.Index.TOKENIZED))

    # Delete old versions
    WorkflowIndexer.writer.deleteDocuments(
        [Term('workflow_id', str(workflow.id))])

    # add new
    WorkflowIndexer.writer.addDocument(d)
def index(self):

    dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                           "verbose-index")
    dir = FSDirectory.getDirectory(dirPath, True)
    writer = IndexWriter(dir, SimpleAnalyzer(), True)

    writer.setInfoStream(System.out)

    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def main(cls, argv):

    if len(argv) != 3:
        print "Usage: T9er <WordNet index dir> <t9 index>"
        return

    for key in cls.keys:
        c = key[0]
        k = key[1:]
        for kc in k:
            cls.keyMap[kc] = c
            print kc, "=", c

    indexDir = argv[1]
    t9dir = argv[2]

    reader = IndexReader.open(indexDir)
    numDocs = reader.maxDoc()
    print "Processing", numDocs, "words"

    writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

    for id in xrange(reader.maxDoc()):
        origDoc = reader.document(id)
        word = origDoc.get("word")
        if word is None or len(word) == 0:
            continue

        newDoc = Document()
        newDoc.add(Field("word", word,
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("t9", cls.t9(word),
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("length", str(len(word)),
                         Field.Store.NO, Field.Index.UN_TOKENIZED))
        writer.addDocument(newDoc)

        if id % 100 == 0:
            print "Document", id

    writer.optimize()
    writer.close()
    reader.close()
def indexDocs(self, root, writer):

    f = codecs.open('infoIndex.txt', 'r', encoding='utf-8')
    files = {}
    for line in f.xreadlines():
        ls = line.split()
        files[ls[0] + '.txt'] = [ls[1], ls[2]]
    f.close()

    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print "adding", filename

            path = os.path.join(root, filename)
            file = open(path)
            contents = unicode(file.read(), 'utf-8')
            file.close()

            doc = Document()
            doc.add(Field("name", filename,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("path", path,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))

            url = files[filename][0]
            doc.add(Field("url", url,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))

            domin = urlparse.urlsplit(url)[1].split(':')[0]
            doc.add(Field("site", domin,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))

            title = files[filename][1]
            doc.add(Field("title", title,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))

            print filename, path, url, domin, title

            if len(contents) > 0:
                doc.add(Field("contents", contents,
                              Field.Store.YES, Field.Index.ANALYZED))
            else:
                print "warning: no content in %s" % filename

            writer.addDocument(doc)
def indexFile(self, writer, path):

    try:
        file = open(path)
        string = HTMLReader(InputStreamReader(file, 'utf-8')).read()
        file.close()
    except:
        raise
    else:
        doc = Document()
        doc.add(Field("contents", StringReader(string)))
        doc.add(Field("filename", os.path.abspath(path),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        return doc
def indexValue(self, indexWriter, value, uItem, uAttr, uValue, version):

    STORED = Field.Store.YES
    UN_STORED = Field.Store.NO
    TOKENIZED = Field.Index.TOKENIZED
    UN_INDEXED = Field.Index.NO
    UN_TOKENIZED = Field.Index.UN_TOKENIZED

    doc = Document()
    doc.add(Field("item", uItem.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("attribute", uAttr.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("value", uValue.str64(), STORED, UN_INDEXED))
    doc.add(Field("version", str(version), STORED, UN_INDEXED))
    doc.add(Field("contents", value, UN_STORED, TOKENIZED,
                  Field.TermVector.YES))

    indexWriter.addDocument(doc)