def reindex_all(reader, writer, analyzer): for i in xrange(reader.maxDoc()): if reader.isDeleted(i): continue doc = reader.document(i) p = doc.get("path") pkid = doc.get('txtorg_id') if p is None: # No filepath specified, just use original document writer.updateDocument(Term("txtorg_id",pkid),doc,analyzer) else: # if a path field is found, try to read the file it points to and add a contents field edited_doc = Document() for f in doc.getFields(): edited_doc.add(Field.cast_(f)) try: inf = open(p) contents = unicode(inf.read(), 'UTF-8') inf.close() if len(contents) > 0: edited_doc.add(Field("contents", contents, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)) else: print "warning: no content in %s" % filename except: print "Could not read file; skipping" writer.updateDocument(Term("txtorg_id",pkid),edited_doc,analyzer)
def write_metadata(searcher, reader, document_ids, fname):
    """Dump the stored fields of the given documents to a UTF-8 CSV file.

    Each ``txtorg_id`` in ``document_ids`` must match exactly one indexed
    document.  Columns are 'name' and 'path' first, then the remaining
    field names sorted alphabetically; the first row is a header.
    """
    seen_fields = set()
    rows = []
    for txtorg_id in document_ids:
        hits = searcher.search(
            TermQuery(Term('txtorg_id', txtorg_id)), reader.maxDoc()).scoreDocs
        assert len(hits) == 1
        matched = searcher.doc(hits[0].doc)
        # Flatten the Lucene document into a plain name -> value dict.
        row = {}
        for raw in matched.getFields():
            fld = Field.cast_(raw)
            row[fld.name()] = fld.stringValue()
        rows.append(row)
        seen_fields |= set(row.keys())
    leading = ['name', 'path']
    fields = leading + sorted(x for x in seen_fields if x not in leading)
    with codecs.open(fname, 'w', encoding='UTF-8') as outf:
        writer = DictUnicodeWriter(outf, fields)
        # Header row: each column labelled with its own name.
        writer.writerow(dict(zip(fields, fields)))
        for row in rows:
            writer.writerow(row)
def findWildcard(self, word, field='key', max=10):
    """Run a wildcard query for ``word`` on ``field`` and return up to
    ``max`` hits as plain dicts.

    If ``self.fields`` is set, each dict holds only those field names;
    otherwise every stored field of the hit is included.
    """
    searcher = self.searcher
    hits = searcher.search(WildcardQuery(Term(field, word)), None, max)
    # NOTE: ScoreDoc only carries the doc id; fetch each full document.
    matched_docs = [searcher.doc(sd.doc) for sd in hits.scoreDocs]
    wanted = self.fields
    results = []
    if wanted:
        # Restrict output to the configured field names.
        for doc in matched_docs:
            results.append(dict((name, doc.get(name)) for name in wanted))
    else:
        # No field list configured: emit every stored field of the hit.
        for doc in matched_docs:
            record = {}
            for raw in doc.fields():
                fld = Field.cast_(raw)
                record[fld.name()] = fld.stringValue()
            results.append(record)
    return results
def get_fields_and_values(reader, max_vals = 30):
    """Scan every live document and collect, per field name, a sample of
    distinct stored values (at most ``max_vals`` values per field).

    Returns a plain ``dict`` mapping field name -> set of string values.
    """
    collected = defaultdict(set)
    for doc_id in xrange(reader.maxDoc()):
        if reader.isDeleted(doc_id):
            continue
        for raw in reader.document(doc_id).getFields():
            fld = Field.cast_(raw)
            values = collected[fld.name()]
            # Cap the sample so huge indexes stay cheap to summarize.
            if len(values) < max_vals:
                values.add(fld.stringValue())
    return dict(collected)
def write_contents(allDicts, searcher, reader, fname, content_field = "contents"): all_ids = [d['txtorg_id'] for d in allDicts] all_fields = set() doc_fields = [] for txtorg_id in all_ids: query = TermQuery(Term('txtorg_id',txtorg_id)) scoreDocs = searcher.search(query, reader.maxDoc()).scoreDocs assert len(scoreDocs) == 1 scoreDoc = scoreDocs[0] doc = searcher.doc(scoreDoc.doc) df = {} name_path_present = False failFlag = False for f in doc.getFields(): field = Field.cast_(f) if content_field == "contents" and field.name() == 'path': name_path_present = True path = doc.get("path").encode('utf-8') try: i = codecs.open(path, 'r', encoding='UTF-8') c = i.read() df[content_field] = c i.close() except Exception as e: failFlag = True print "Failed for path %s with exception %s" % (path, e) elif field.name() in ['txtorg_id', 'name', 'path', content_field]: df[field.name()] = field.stringValue() all_fields = all_fields.union(set(df.keys())) doc_fields.append(df) fields = ['txtorg_id'] + sorted([x for x in all_fields if x != 'txtorg_id']) with codecs.open(fname, 'w', encoding='UTF-8') as outf: dw = csv.DictWriter(outf, fields) dw.writeheader() # writing data for d in doc_fields: dw.writerow(d) return failFlag
def add_metadata_to_doc(lucenedoc,fieldnames,values): edited_doc = Document() filepath = lucenedoc.get("path") assert filepath is not None # Include all original fields that are not in the list of updates original_fields = [] for f in lucenedoc.getFields(): field = Field.cast_(f) if field.name() not in fieldnames: original_fields.append(field) for field in original_fields: edited_doc.add(field) # Now, add back the unstored "contents" field try: file = open(filepath) contents = unicode(file.read(), 'UTF-8') file.close() if len(contents) > 0: edited_doc.add(Field("contents", contents, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)) else: print "warning: no content in %s" % filename except: print "Could not read file; skipping" return None # Now include new fields for idx in range(len(fieldnames)): edited_doc.add(Field(fieldnames[idx].lower(),values[idx].lower(),Field.Store.YES,Field.Index.NOT_ANALYZED)) return edited_doc