def indexDocuments():
    # empty index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)

    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)
    for article in Wikipedia():
        doc = Document()
        doc.add(Field('id', str(article['id'][0]),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('title', article['url'],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', article['text'],
                      Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print 'Optimization'
    writer.optimize()
    writer.close()
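# A minimal companion sketch (not from the original): querying the index
# built by indexDocuments() above. The field names ('id', 'title', 'content')
# mirror the indexing code; the function name and hit count are assumptions.
def searchDocuments(query_text, max_hits=10):
    indexDir = Wikipedia.directory + 'index/'
    searcher = IndexSearcher(SimpleFSDirectory(File(indexDir)), True)
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, 'content',
                        analyzer).parse(query_text)
    for hit in searcher.search(query, max_hits).scoreDocs:
        doc = searcher.doc(hit.doc)
        print doc.get('id'), doc.get('title')
    searcher.close()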
def indexDocs(self, root, writer):
    # read the 'seg^*'-delimited picture metadata file into a dict
    # keyed by image src: src -> [url, title, alt]
    f = codecs.open('picIndex.txt', 'r', encoding='utf-8')
    picDict = {}
    # iterate the codecs wrapper directly so lines come back decoded;
    # f.xreadlines() would bypass the wrapper and yield raw bytes
    for line in f:
        ls = line.split('seg^*')
        url = ls[0]
        title = ls[1]
        src = ls[2]
        alt = ls[3]
        picDict[src] = [url, title, alt]
    f.close()
    for src in picDict:
        doc = Document()
        doc.add(Field("src", src, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("url", picDict[src][0],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("title", picDict[src][1],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("alt", picDict[src][2],
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
def indexDocs(self, root, writer):
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                for line in file:
                    doc = Document()
                    arr = line.split('\t')
                    # boost the primary name field above the others
                    field = Field("name", arr[2].lower(),
                                  Field.Store.YES, Field.Index.TOKENIZED)
                    field.setBoost(1.5)
                    doc.add(field)
                    doc.add(Field("alternate_names", arr[3].lower(),
                                  Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("state", arr[10].lower(),
                                  Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("population", arr[14],
                                  Field.Store.YES, Field.Index.UN_TOKENIZED))
                    # boost whole documents for large cities
                    if int(arr[14]) > 1000000:
                        doc.setBoost(1.2)
                    writer.addDocument(doc)
                file.close()
            except Exception, e:
                print "Failed in indexDocs:", e
def addDocuments(self, dir):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    #
    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs
    #

    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def setUp(self):
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    # Elwood
    document = Document()
    document.add(Field("owner", "elwood",
                       Field.Store.YES, Field.Index.NOT_ANALYZED))
    document.add(Field("keywords", "elwoods sensitive info",
                       Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(document)

    # Jake
    document = Document()
    document.add(Field("owner", "jake",
                       Field.Store.YES, Field.Index.NOT_ANALYZED))
    document.add(Field("keywords", "jakes sensitive info",
                       Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(document)

    writer.close()
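# Illustrative sketch (not part of the original setUp): a search restricted
# to one owner using a filter over the un-analyzed 'owner' field created
# above. The method name and the 'keywords' query term are assumptions.
def searchForOwner(self, owner):
    searcher = IndexSearcher(self.directory, True)
    query = TermQuery(Term("keywords", "info"))
    ownerFilter = QueryWrapperFilter(TermQuery(Term("owner", owner)))
    return searcher.search(query, ownerFilter, 10).scoreDocs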
def testUpdate(self):
    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

    # delete the old document, then add the replacement with a new writer
    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
    doc.add(Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("city", "St. Petersburg",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()

    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))
def setUp(self):
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()

    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    doc = Document()
    doc.add(Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)

    self.quick = SpanTermQuery(Term("f", "quick"))
    self.brown = SpanTermQuery(Term("f", "brown"))
    self.red = SpanTermQuery(Term("f", "red"))
    self.fox = SpanTermQuery(Term("f", "fox"))
    self.lazy = SpanTermQuery(Term("f", "lazy"))
    self.sleepy = SpanTermQuery(Term("f", "sleepy"))
    self.dog = SpanTermQuery(Term("f", "dog"))
    self.cat = SpanTermQuery(Term("f", "cat"))
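# Hedged example (assumed, not from the original suite): the span queries
# defined in setUp() can be combined with SpanNearQuery; the slop of 0 and
# the in-order flag are illustrative choices.
def testQuickBrownFox(self):
    near = SpanNearQuery([self.quick, self.brown, self.fox], 0, True)
    topDocs = self.searcher.search(near, 10)
    # only the first document has 'quick brown fox' in order
    self.assertEqual(1, topDocs.totalHits)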
def addCrowd(self, id, text):
    doc = Document()
    doc.add(Field(CrowdFields.id, id,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field(CrowdFields.text, text,
                  Field.Store.YES, Field.Index.ANALYZED))
    # updateDocument() deletes any existing document matching the id term
    # before adding the new one, so this behaves as an upsert
    self.writer.updateDocument(Term(CrowdFields.id, id), doc)
def addDocuments(self, _id, title, content):
    doc = Document()
    doc.add(Field("id", _id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    if title is not None and len(title) > 0:
        doc.add(Field("titleKeyword", title,
                      Field.Store.NO, Field.Index.ANALYZED))
    if content is not None and len(content) > 0:
        doc.add(Field("contentKeyword", content,
                      Field.Store.NO, Field.Index.ANALYZED))
    self.index_writer.addDocument(doc)
def addPoint(self, writer, name, type, x, y):
    doc = Document()
    doc.add(Field("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("type", type, Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("x", str(x), Field.Store.YES,
                  Field.Index.NOT_ANALYZED_NO_NORMS))
    doc.add(Field("y", str(y), Field.Store.YES,
                  Field.Index.NOT_ANALYZED_NO_NORMS))
    writer.addDocument(doc)
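# Hedged companion to addPoint() (not in the original): retrieving points
# by their un-analyzed 'type' field. The method name and result shape are
# assumptions; the field names mirror the code above.
def findPoints(self, searcher, type, max=10):
    query = TermQuery(Term("type", type))
    points = []
    for hit in searcher.search(query, max).scoreDocs:
        doc = searcher.doc(hit.doc)
        # x and y were stored as strings by addPoint()
        points.append((doc.get("name"), doc.get("x"), doc.get("y")))
    return points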
def _IndexStringField(doc, field_name, field_content):
    if field_content is None:
        return
    # the id field is stored verbatim; everything else is analyzed
    if field_name == "id":
        doc.add(Field(field_name, str(field_content),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
    else:
        doc.add(Field(field_name, str(field_content),
                      Field.Store.YES, Field.Index.ANALYZED))
def indexDoc(writer, d):
    doc = Document()
    name_ = Field("name_", d.name, Field.Store.YES, Field.Index.TOKENIZED)
    name_.setBoost(2.0)
    full_text = Field("full_text", d.full,
                      Field.Store.YES, Field.Index.TOKENIZED)
    id = Field("id", str(d.id), Field.Store.YES, Field.Index.UN_TOKENIZED)
    doc.add(name_)
    doc.add(full_text)
    doc.add(id)
    writer.addDocument(doc)
def indexReader(self, indexWriter, reader, uItem, uAttr, uValue, version):
    STORED = Field.Store.YES
    UN_INDEXED = Field.Index.NO
    UN_TOKENIZED = Field.Index.UN_TOKENIZED

    doc = Document()
    doc.add(Field("item", uItem.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("attribute", uAttr.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("value", uValue.str64(), STORED, UN_INDEXED))
    doc.add(Field("version", str(version), STORED, UN_INDEXED))

    reader = StringReader(reader.read())
    doc.add(Field("contents", reader, Field.TermVector.YES))
    indexWriter.addDocument(doc)
def write_metadata(searcher, reader, document_ids, fname):
    allFields = set([])
    docFields = []
    for txtorg_id in document_ids:
        query = TermQuery(Term('txtorg_id', txtorg_id))
        scoreDocs = searcher.search(query, reader.maxDoc()).scoreDocs
        assert len(scoreDocs) == 1
        scoreDoc = scoreDocs[0]
        doc = searcher.doc(scoreDoc.doc)
        df = {}
        for f in doc.getFields():
            field = Field.cast_(f)
            df[field.name()] = field.stringValue()
        docFields.append(df)
        allFields = allFields.union(set(df.keys()))

    fields = ['name', 'path'] + sorted([x for x in allFields
                                        if x not in ['name', 'path']])
    with codecs.open(fname, 'w', encoding='UTF-8') as outf:
        dw = DictUnicodeWriter(outf, fields)

        # writing header
        dhead = dict()
        for k in fields:
            dhead[k] = k
        dw.writerow(dhead)

        # writing data
        for d in docFields:
            dw.writerow(d)
def findWildcard(self, word, field='key', max=10):
    query = WildcardQuery(Term(field, word))
    searcher = self.searcher
    hits = searcher.search(query, None, max)
    recs = []
    fields = self.fields
    for hit in hits.scoreDocs:
        # a ScoreDoc only carries the document id and score; the stored
        # Document must be fetched from the searcher by that id
        doc = searcher.doc(hit.doc)
        recs.append(doc)
    out = []
    if fields:
        for doc in recs:
            r = {}
            for f in fields:
                r[f] = doc.get(f)
            out.append(r)
    else:
        for doc in recs:
            r = {}
            for f in doc.fields():
                f = Field.cast_(f)
                r[f.name()] = f.stringValue()
            out.append(r)
    return out
def createIndex():
    # initialize lucene and the JVM
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    # get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)

    # get index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data = myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def main(cls, argv):
    if len(argv) < 5:
        print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
        return

    docsInIndex = int(argv[1])

    # create an index called 'index-dir' in a temp directory
    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                            'index-dir')
    dir = FSDirectory.getDirectory(indexDir, True)
    analyzer = SimpleAnalyzer()
    writer = IndexWriter(dir, analyzer, True)

    # set variables that affect speed of indexing
    writer.setMergeFactor(int(argv[2]))
    writer.setMaxMergeDocs(int(argv[3]))
    writer.setMaxBufferedDocs(int(argv[4]))
    # writer.infoStream = System.out

    print "Merge factor:  ", writer.getMergeFactor()
    print "Max merge docs:", writer.getMaxMergeDocs()
    print "Max buffered docs:", writer.getMaxBufferedDocs()

    start = time()
    for i in xrange(docsInIndex):
        doc = Document()
        doc.add(Field("fieldname", "Bibamus",
                      Field.Store.YES, Field.Index.TOKENIZED))
        writer.addDocument(doc)
    writer.close()
    print "Time: ", timedelta(seconds=time() - start)
def setUp(self):
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc = Document()
    doc.add(Field("partnum", "Q36",
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("description", "Illidium Space Modulator",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = IndexSearcher(self.directory, True)
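# A minimal sketch (assumed, not part of the original) of how the index
# from setUp() might be queried: an exact TermQuery against the
# un-analyzed 'partnum' field matches the stored value verbatim.
def testExactPartnum(self):
    query = TermQuery(Term("partnum", "Q36"))
    scoreDocs = self.searcher.search(query, 10).scoreDocs
    self.assertEqual(1, len(scoreDocs))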
def luceneIndexer(contents):
    lucene.initVM()
    INDEXIDR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content,
                           Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from clique.txt..."

    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    f.close()

    print >> sys.stderr, "Indexed lines from clique.txt (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
def reindex_all(reader, writer, analyzer):
    for i in xrange(reader.maxDoc()):
        if reader.isDeleted(i):
            continue
        doc = reader.document(i)
        p = doc.get("path")
        pkid = doc.get('txtorg_id')
        if p is None:
            # No filepath specified, just use original document
            writer.updateDocument(Term("txtorg_id", pkid), doc, analyzer)
        else:
            # if a path field is found, try to read the file it points to
            # and add a contents field
            edited_doc = Document()
            for f in doc.getFields():
                edited_doc.add(Field.cast_(f))
            try:
                inf = open(p)
                contents = unicode(inf.read(), 'UTF-8')
                inf.close()
                if len(contents) > 0:
                    edited_doc.add(Field("contents", contents,
                                         Field.Store.NO,
                                         Field.Index.ANALYZED,
                                         Field.TermVector.YES))
                else:
                    print "warning: no content in %s" % p
            except:
                print "Could not read file; skipping"
            writer.updateDocument(Term("txtorg_id", pkid), edited_doc, analyzer)
def indexValue(self, indexWriter, value, uItem, uAttr, uValue, version):
    STORED = Field.Store.YES
    UN_STORED = Field.Store.NO
    TOKENIZED = Field.Index.TOKENIZED
    UN_INDEXED = Field.Index.NO
    UN_TOKENIZED = Field.Index.UN_TOKENIZED

    doc = Document()
    doc.add(Field("item", uItem.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("attribute", uAttr.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("value", uValue.str64(), STORED, UN_INDEXED))
    doc.add(Field("version", str(version), STORED, UN_INDEXED))
    doc.add(Field("contents", value, UN_STORED, TOKENIZED,
                  Field.TermVector.YES))
    indexWriter.addDocument(doc)
def testFuzzy(self):
    self.indexSingleFieldDocs([Field("contents", "fuzzy",
                                     Field.Store.YES, Field.Index.ANALYZED),
                               Field("contents", "wuzzy",
                                     Field.Store.YES, Field.Index.ANALYZED)])

    searcher = IndexSearcher(self.directory)
    query = FuzzyQuery(Term("contents", "wuzza"))
    scoreDocs = searcher.search(query, 50).scoreDocs

    self.assertEqual(2, len(scoreDocs), "both close enough")
    self.assert_(scoreDocs[0].score != scoreDocs[1].score,
                 "wuzzy closer than fuzzy")
    self.assertEqual("wuzzy",
                     searcher.doc(scoreDocs[0].doc).get("contents"),
                     "wuzza bear")
def main(cls, argv):
    if len(argv) != 3:
        print "Usage: T9er <WordNet index dir> <t9 index>"
        return

    for key in cls.keys:
        c = key[0]
        k = key[1:]
        for kc in k:
            cls.keyMap[kc] = c
            print kc, "=", c

    indexDir = argv[1]
    t9dir = argv[2]

    reader = IndexReader.open(indexDir)
    numDocs = reader.maxDoc()
    print "Processing", numDocs, "words"

    writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

    for id in xrange(reader.maxDoc()):
        origDoc = reader.document(id)
        word = origDoc.get("word")
        if word is None or len(word) == 0:
            continue

        newDoc = Document()
        newDoc.add(Field("word", word,
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("t9", cls.t9(word),
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("length", str(len(word)),
                         Field.Store.NO, Field.Index.UN_TOKENIZED))
        writer.addDocument(newDoc)
        if id % 100 == 0:
            print "Document", id

    writer.optimize()
    writer.close()
    reader.close()
def indexFile(self, writer, path):
    try:
        file = open(path)
        string = HTMLReader(InputStreamReader(file, 'utf-8')).read()
        file.close()
    except:
        raise
    else:
        doc = Document()
        doc.add(Field("contents", StringReader(string)))
        doc.add(Field("filename", os.path.abspath(path),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        return doc
def _addDoc(self, text, writer):
    """
    function to add documents in the lucene index.
    text fields are indexed by the name "field"
    """
    doc = Document()
    doc.add(Field("field", text, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
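# Hypothetical driver for _addDoc() above: builds an in-memory index from
# a list of strings. The RAMDirectory/StandardAnalyzer choices and the
# method name are assumptions, not part of the original snippet.
def _buildIndex(self, texts):
    directory = RAMDirectory()
    writer = IndexWriter(directory, StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.UNLIMITED)
    for text in texts:
        self._addDoc(text, writer)
    writer.close()
    return directory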
def get_fields_and_values(reader, max_vals=30):
    all_fields = defaultdict(set)
    for i in xrange(reader.maxDoc()):
        if reader.isDeleted(i):
            continue
        doc = reader.document(i)
        for f in doc.getFields():
            field = Field.cast_(f)
            if len(all_fields[field.name()]) < max_vals:
                all_fields[field.name()].add(field.stringValue())
    return dict(all_fields)
def addDocuments(self, dir, maxFieldLength):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength(maxFieldLength))
    for keyword, unindexed, unstored, text in \
            izip(self.keywords, self.unindexed, self.unstored, self.text):
        doc = Document()
        doc.add(Field("id", keyword,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", unindexed,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", unstored,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", text,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def index(self, doc, title, department, url):
    indexdir = SimpleFSDirectory(File(self.indir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, self.init,
                               IndexWriter.MaxFieldLength(512))
    # create the index only on the first call; append on later calls
    self.init = False

    # Initialize document and index it
    document = Document()
    document.add(Field("title", title,
                       Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("url", url, Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("department", department,
                       Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("text", doc, Field.Store.YES, Field.Index.ANALYZED))
    index_writer.addDocument(document)

    index_writer.optimize()
    index_writer.close()
def parse_file(file_path, writer):
    f = open(file_path, 'r')
    soup = BeautifulSoup(f.read())
    f.close()

    doc = Document()

    # concatenate the text of the title, paragraph and heading tags
    content_tags = ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    content = ""
    for tag in content_tags:
        matches = soup.find_all(tag)
        for match in matches:
            if match.string:
                content += match.string + " "

    afinn_score = afinn.sentiment(content)

    doc.add(Field("filepath", file_path,
                  Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("content", content,
                  Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("sentiment", str(afinn_score),
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
def indexFile(self, writer, path):
    doc = Document()
    try:
        # extract text from the Word document with antiword
        process = popen2.Popen4(["antiword", "-m", "UTF-8", path])
        string = InputStreamReader(process.fromchild, 'utf-8').read()
    except:
        raise
    else:
        doc.add(Field("contents", StringReader(string)))
        doc.add(Field("filename", os.path.abspath(path),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        exitCode = process.wait()
        if exitCode != 0:
            raise RuntimeError, "antiword exit code %d" % (exitCode)
        return doc
def testWildcard(self):
    self.indexSingleFieldDocs([Field("contents", "wild",
                                     Field.Store.YES, Field.Index.ANALYZED),
                               Field("contents", "child",
                                     Field.Store.YES, Field.Index.ANALYZED),
                               Field("contents", "mild",
                                     Field.Store.YES, Field.Index.ANALYZED),
                               Field("contents", "mildew",
                                     Field.Store.YES, Field.Index.ANALYZED)])

    searcher = IndexSearcher(self.directory)
    query = WildcardQuery(Term("contents", "?ild*"))
    scoreDocs = searcher.search(query, 50).scoreDocs

    self.assertEqual(3, len(scoreDocs), "child no match")
    self.assertEqual(scoreDocs[0].score, scoreDocs[1].score,
                     "score the same")
    self.assertEqual(scoreDocs[1].score, scoreDocs[2].score,
                     "score the same")
def setUp(self):
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc1 = Document()
    doc1.add(Field("field", "the quick brown fox jumped over the lazy dog",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc1)

    doc2 = Document()
    doc2.add(Field("field", "the fast fox hopped over the hound",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc2)
    writer.close()

    self.searcher = IndexSearcher(directory, True)
def someMethod(self):
    directory = RAMDirectory()
    analyzer = StandardAnalyzer()
    writer = IndexWriter(directory, analyzer, True)

    doc = Document()
    doc.add(Field.Text("title", "This is the title"))
    doc.add(Field.UnStored("contents", "...document contents..."))
    writer.addDocument(doc)
    # addDocument() can also take an analyzer to use for this document only
    writer.addDocument(doc, analyzer)

    expression = "some query"

    # the static parse() helper, or an explicit parser instance
    query = QueryParser.parse(expression, "contents", analyzer)
    parser = QueryParser("contents", analyzer)
    query = parser.parse(expression)
def setUp(self):
    self.directory = RAMDirectory()

    writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("contents",
                  "The quick brown fox jumps over the lazy dogs",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
def write_contents(allDicts, searcher, reader, fname, content_field="contents"):
    all_ids = [d['txtorg_id'] for d in allDicts]
    all_fields = set()
    doc_fields = []
    # failFlag must persist across documents so a failure for any one
    # path is reported; initializing it inside the loop would reset it
    failFlag = False
    for txtorg_id in all_ids:
        query = TermQuery(Term('txtorg_id', txtorg_id))
        scoreDocs = searcher.search(query, reader.maxDoc()).scoreDocs
        assert len(scoreDocs) == 1
        scoreDoc = scoreDocs[0]
        doc = searcher.doc(scoreDoc.doc)
        df = {}
        name_path_present = False
        for f in doc.getFields():
            field = Field.cast_(f)
            if content_field == "contents" and field.name() == 'path':
                name_path_present = True
                path = doc.get("path").encode('utf-8')
                try:
                    i = codecs.open(path, 'r', encoding='UTF-8')
                    c = i.read()
                    df[content_field] = c
                    i.close()
                except Exception as e:
                    failFlag = True
                    print "Failed for path %s with exception %s" % (path, e)
            elif field.name() in ['txtorg_id', 'name', 'path', content_field]:
                df[field.name()] = field.stringValue()
        all_fields = all_fields.union(set(df.keys()))
        doc_fields.append(df)

    fields = ['txtorg_id'] + sorted([x for x in all_fields
                                     if x != 'txtorg_id'])
    with codecs.open(fname, 'w', encoding='UTF-8') as outf:
        dw = csv.DictWriter(outf, fields)
        dw.writeheader()
        # writing data
        for d in doc_fields:
            dw.writerow(d)
    return failFlag
def add_metadata_to_doc(lucenedoc, fieldnames, values):
    edited_doc = Document()
    filepath = lucenedoc.get("path")
    assert filepath is not None

    # Include all original fields that are not in the list of updates
    original_fields = []
    for f in lucenedoc.getFields():
        field = Field.cast_(f)
        if field.name() not in fieldnames:
            original_fields.append(field)
    for field in original_fields:
        edited_doc.add(field)

    # Now, add back the unstored "contents" field
    try:
        file = open(filepath)
        contents = unicode(file.read(), 'UTF-8')
        file.close()
        if len(contents) > 0:
            edited_doc.add(Field("contents", contents,
                                 Field.Store.NO, Field.Index.ANALYZED,
                                 Field.TermVector.YES))
        else:
            print "warning: no content in %s" % filepath
    except:
        print "Could not read file; skipping"
        return None

    # Now include new fields
    for idx in range(len(fieldnames)):
        edited_doc.add(Field(fieldnames[idx].lower(), values[idx].lower(),
                             Field.Store.YES, Field.Index.NOT_ANALYZED))
    return edited_doc
def index_file(self, path):
    print path
    YES = Field.Store.YES
    NO = Field.Store.NO
    NOT_ANALYZED = Field.Index.NOT_ANALYZED
    ANALYZED = Field.Index.ANALYZED

    metadata = self.metadata
    handle = codecs.open(path, encoding='utf8')
    filename = os.path.basename(path)
    if metadata:
        user_number = WebPage.get(filename).user
        user_record = list(User.select(User.q.number == user_number))[0]
        gender = (user_record.sex
                  if user_record.sex is not None else "Unknown")
        birthyear = (str(user_record.birthyear)
                     if user_record.birthyear is not None else '0')

    s = handle.read()
    pos = s.find(u'\n\n') + 2  # Skip first block: declaration
    nextpos = 0
    sentence_index = 0

    # reuse a single Document and its Fields, updating the field values
    # for each sentence before re-adding the document
    doc = Document()
    sentence_index_field = Field("sentence_index", str(sentence_index),
                                 YES, NOT_ANALYZED)
    user_field = Field("user", '', YES, NOT_ANALYZED)
    gender_field = Field("gender", '', YES, NOT_ANALYZED)
    birthyear_field = Field("birthyear", '', YES, NOT_ANALYZED)
    filename_field = Field("filename", '', YES, NOT_ANALYZED)
    store_contents = NO if self.compress else YES
    contents_field = Field("contents", '', store_contents, ANALYZED)
    # change this to Field('compressed', '', NO)? Second argument needs to
    # be bytes, so maybe '\xfe'
    if self.compress:
        compressed_field = Field("compressed", '\xfe', YES, NOT_ANALYZED)

    doc.add(sentence_index_field)
    if metadata:
        doc.add(user_field)
        doc.add(gender_field)
        doc.add(birthyear_field)
        doc.add(filename_field)
    doc.add(contents_field)
    if self.compress:
        doc.add(compressed_field)

    while nextpos != len(s):
        nextpos = s.find(u'\n \n', pos)
        if nextpos == -1:
            nextpos = len(s)
        text = s[pos:nextpos]
        text = text.replace(u' ', u'@')
        pos = nextpos + 2

        sentence_index_field.setValue(str(sentence_index))
        if metadata:
            user_field.setValue(user_number)
            gender_field.setValue(gender)
            birthyear_field.setValue(birthyear)
            filename_field.setValue(filename)
        contents_field.setValue(text)

        # todo:
        # remove header and checksum added by zlib
        # make sure what gets stored is binary rather than unicode
        if self.compress:
            compressed = self.compressor.compress(text.encode('utf8'))
            compressed_field.setValue(compressed)

        self.writer.addDocument(doc)
        sentence_index = sentence_index + 1