def testCompressionTools(self):
    raw = JArray('byte')(self.binaryValCompressed)
    binaryFldCompressed = StoredField("binaryCompressed",
                                      CompressionTools.compress(raw))
    stringFldCompressed = StoredField(
        "stringCompressed",
        CompressionTools.compressString(self.binaryValCompressed))

    doc = Document()
    doc.add(binaryFldCompressed)
    doc.add(stringFldCompressed)

    # add the doc to a ram index
    writer = self.getWriter(analyzer=StandardAnalyzer())
    writer.addDocument(doc)
    writer.close()

    # open a reader and fetch the document
    reader = self.getReader()
    docFromReader = reader.document(0)
    self.assertTrue(docFromReader is not None)

    # fetch the binary compressed field and compare its content with
    # the original one
    decompressed = CompressionTools.decompress(
        docFromReader.getBinaryValue("binaryCompressed"))
    binaryFldCompressedTest = decompressed.string_
    self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)

    self.assertEqual(
        CompressionTools.decompressString(
            docFromReader.getBinaryValue("stringCompressed")),
        self.binaryValCompressed)

    reader.close()

def index_one(self, article):
    """Create index for one url object in the database."""
    try:
        date_published_str = article['date_published'].strftime(
            self.date_format)
    except Exception as e:
        logger.warning('Error when formatting date_published %r: %s',
                       article['canonical_url'], e)
        return

    doc = Document()
    doc.add(StoredField('group_id', article['group_id']))
    doc.add(StoredField('article_id', article['article_id']))
    doc.add(StringField('date_published', date_published_str,
                        Field.Store.YES))
    doc.add(SortedDocValuesField('date_published',
                                 BytesRef(date_published_str)))
    doc.add(StoredField('date_published', date_published_str))
    doc.add(StringField('domain', article['domain'], Field.Store.YES))
    doc.add(StringField('site_type', article['site_type'], Field.Store.YES))
    doc.add(TextField('canonical_url', article['canonical_url'],
                      Field.Store.YES))
    doc.add(TextField('title', article['title'], Field.Store.YES))
    doc.add(TextField('meta', article['meta'], Field.Store.NO))
    doc.add(TextField('content', article['content'], Field.Store.NO))
    doc.add(StoredField('uq_id_str', article['uq_id_str']))
    self.writer.addDocument(doc)

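# A minimal usage sketch (not part of the original indexer) of what the
# SortedDocValuesField above enables: sorting search results by
# 'date_published' without touching stored fields. The names `searcher` and
# `domain` and the assumption that self.date_format produces a
# lexicographically sortable string (e.g. '%Y-%m-%d') are illustrative only.
from org.apache.lucene.index import Term
from org.apache.lucene.search import IndexSearcher, Sort, SortField, TermQuery

def latest_articles_for_domain(searcher, domain, n=10):
    # reverse=True puts the newest date_published values first
    sort = Sort(SortField('date_published', SortField.Type.STRING, True))
    hits = searcher.search(TermQuery(Term('domain', domain)), n, sort)
    return [searcher.doc(sd.doc) for sd in hits.scoreDocs]
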
def createDocument(item_id, label, viewSimilar, viewProspective):
    doc = Document()
    doc.add(StringField('itemID', item_id, Field.Store.YES))
    doc.add(StringField('label', label, Field.Store.YES))
    for item in viewSimilar:
        doc.add(StoredField("viewSimilar", item))
    for item in viewProspective:
        doc.add(StoredField("viewProspective", item))
    return doc

def create_doc(item_id, label, viewSimilar, viewProspective, model="default"):
    doc = Document()
    now_time = int(time.time())
    _id = hashlib.md5(f"{label}_{item_id}".encode('utf-8')).hexdigest()
    doc.add(StringField("id", _id, Field.Store.NO))
    doc.add(StringField("itemID", item_id, Field.Store.YES))
    doc.add(StringField("label", label, Field.Store.YES))
    doc.add(StoredField("viewSimilar", viewSimilar))
    doc.add(StoredField("viewProspective", viewProspective))
    doc.add(StringField("model", model, Field.Store.YES))
    doc.add(StringField("ttl", str(now_time), Field.Store.NO))
    return _id, doc

def addDoc(w, data):
    doc = Document()
    for field in data:
        value, field_type = data[field][0], data[field][1]
        if field_type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif field_type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif field_type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif field_type == 'CUSTOM_FIELD_TEXT_DF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_DF))
        elif field_type == 'CUSTOM_FIELD_TEXT_BF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_BF))
        elif field_type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')
    w.addDocument(doc)

def addDoc(w, data):
    doc = Document()
    for field in data:
        value, field_type = data[field][0], data[field][1]
        if field_type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif field_type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif field_type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif field_type == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif field_type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')
    try:
        w.addDocument(doc)
    except Exception as e:
        # dump the offending document's fields to aid debugging
        print('-----------------------------------')
        print('addDocument failed: %s' % e)
        for field in data:
            value, field_type = data[field][0], data[field][1]
            print('field=%s\nvalue=%s' % (field, str(value)))

def add_doc(self, item_data):
    item_id = item_data['item_id']
    ttl = item_data['ttl']
    version = item_data.get('version', 'default')
    view_similar = json.dumps(item_data.get('view_similar', {}))
    view_prospective = json.dumps(item_data.get('view_prospective', {}))

    doc = Document()
    _id = hashlib.md5(f"{item_id}_{version}".encode('utf-8')).hexdigest()
    doc.add(StringField("id", _id, Field.Store.NO))
    doc.add(LongPoint("ttl", ttl))
    doc.add(StringField("version", version, Field.Store.YES))
    doc.add(StringField("item_id", item_id, Field.Store.YES))
    doc.add(StoredField("view_similar", view_similar))
    doc.add(StoredField("view_prospective", view_prospective))
    self.writer.updateDocument(Term("id", _id), doc)

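# A hedged sketch (not from the original class) of the expiry pass that the
# LongPoint "ttl" field above makes possible: delete every document whose
# ttl falls below a cutoff timestamp. `cutoff_ts` is an assumed Unix time.
from org.apache.lucene.document import LongPoint

def purge_expired(self, cutoff_ts):
    self.writer.deleteDocuments(LongPoint.newRangeQuery("ttl", 0, int(cutoff_ts)))
    self.writer.commit()
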
def addDoc(w, title, name, value, category, skos_category, all_text,
           raw_name, raw_value, abstract):
    doc = Document()
    doc.add(StringField('title', title, Field.Store.YES))
    doc.add(TextField('name', name, Field.Store.YES))
    doc.add(TextField('value', value, Field.Store.YES))
    doc.add(StoredField('category', category))
    doc.add(StoredField('skos_category', skos_category))
    doc.add(TextField('all_text', all_text, Field.Store.YES))
    doc.add(TextField('raw_name', raw_name, Field.Store.YES))
    doc.add(TextField('raw_value', raw_value, Field.Store.YES))
    doc.add(TextField('abstract', abstract, Field.Store.YES))
    # (an alternative is to buffer docs and flush them in batches of 1000
    # with w.addDocuments(batch); the single-document path is used here)
    w.addDocument(doc)

def _updateOaiRecord(self, identifier, setSpecs, metadataPrefixes,
                     delete=False, oldDoc=None, deleteInSets=None,
                     deleteInPrefixes=None, _overrideStamp=None):
    oldDoc = oldDoc or self._getDocument(identifier)
    doc, oldDeletedSets, oldDeletedPrefixes = self._getNewDocument(
        identifier, oldDoc=oldDoc)

    newStamp = _overrideStamp if self._importMode else self._newStamp()
    doc.add(LongPoint(STAMP_FIELD, int(newStamp)))
    doc.add(StoredField(STAMP_FIELD,
                        BytesRef(JArray('byte')(int_to_bytes(newStamp)))))
    doc.add(NumericDocValuesField(NUMERIC_STAMP_FIELD, int(newStamp)))

    allMetadataPrefixes, allDeletedPrefixes = self._setMetadataPrefixes(
        doc=doc,
        metadataPrefixes=asSet(metadataPrefixes),
        delete=delete,
        deleteInPrefixes=asSet(deleteInPrefixes),
        oldDeletedPrefixes=oldDeletedPrefixes)

    allSets, allDeletedSets = self._setSets(doc=doc,
                                            setSpecs=setSpecs or [],
                                            delete=delete,
                                            deleteInSets=deleteInSets,
                                            oldDeletedSets=oldDeletedSets)

    if (delete
            or (allDeletedSets and allSets == allDeletedSets)
            or allMetadataPrefixes == allDeletedPrefixes):
        doc.add(StringField(TOMBSTONE_FIELD, TOMBSTONE_VALUE, Field.Store.YES))

    self._writer.updateDocument(Term(IDENTIFIER_FIELD, identifier), doc)
    self._latestModifications.add(str(identifier))
    self.do.signalOaiUpdate(metadataPrefixes=allMetadataPrefixes,
                            sets=allSets,
                            stamp=newStamp)

def testBinaryFieldInIndex(self):
    ft = FieldType()
    ft.setStored(True)

    raw = JArray('byte')(self.binaryValStored)
    binaryFldStored = StoredField("binaryStored", raw)
    stringFldStored = Field("stringStored", self.binaryValStored, ft)

    doc = Document()
    doc.add(binaryFldStored)
    doc.add(stringFldStored)

    # test for field count
    self.assertEqual(2, doc.fields.size())

    # add the doc to a ram index
    writer = self.getWriter(
        analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
    writer.addDocument(doc)
    writer.close()

    # open a reader and fetch the document
    reader = self.getReader()
    docFromReader = reader.document(0)
    self.assertTrue(docFromReader is not None)

    # fetch the binary stored field and compare its content with the
    # original one
    storedBytes = docFromReader.getBinaryValue("binaryStored")
    binaryFldStoredTest = storedBytes.bytes.string_
    self.assertEqual(binaryFldStoredTest, self.binaryValStored)

    # fetch the string field and compare its content with the original
    # one
    stringFldStoredTest = docFromReader.get("stringStored")
    self.assertEqual(stringFldStoredTest, self.binaryValStored)

    reader.close()

def make_document(full_path, unix_timestamp, contents):
    """Create a Lucene document with the specified content."""
    doc = Document()

    # two separate date fields, per the recommendation at
    # https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/document/DateTools.html
    doc.add(LongPoint('date_for_pointrangequery', int(unix_timestamp)))
    doc.add(StoredField('last_modified_time', int(unix_timestamp)))

    # https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/document/TextField.html
    # indexed and tokenized
    doc.add(TextField('fullpath', full_path, Field.Store.YES))  # the file key, but tokenized
    doc.add(TextField('body', contents, Field.Store.YES))

    # It is also possible to add fields that are indexed but not tokenized.
    # See https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/document/StringField.html
    # However there is a limitation: https://stackoverflow.com/a/32654329/130164
    # MultiFieldQueryParser gives bizarre results for such fields because the
    # query parser runs the analyzer, while StringField does not.
    # We deliberately store the key untokenized so we can search by it
    # directly with a TermQuery.
    doc.add(StringField('key', full_path, Field.Store.YES))  # the file key
    return doc

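# A minimal lookup sketch (not from the original module) showing the exact
# match that the untokenized 'key' field above is meant for: a TermQuery on
# the full path, bypassing the analyzer entirely. `directory` is assumed to
# be the Directory the documents were written to.
from org.apache.lucene.index import DirectoryReader, Term
from org.apache.lucene.search import IndexSearcher, TermQuery

def find_by_key(directory, full_path):
    reader = DirectoryReader.open(directory)
    try:
        searcher = IndexSearcher(reader)
        hits = searcher.search(TermQuery(Term('key', full_path)), 1)
        if len(hits.scoreDocs) == 0:
            return None
        return searcher.doc(hits.scoreDocs[0].doc)
    finally:
        reader.close()
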
def build_index(file_dir):
    indexDir = SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/"))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    # CREATE_OR_APPEND (rather than CREATE) so an existing index is detected
    # below instead of being silently overwritten
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(indexDir, config)

    print("%d docs in index" % writer.numDocs())
    if writer.numDocs():
        print("Index already built.")
        return

    with open(file_dir + "/train/train.ast.src") as fc:
        # strip non-word characters and bare query operators so the code can
        # later be fed back to the query parser verbatim
        codes = [
            re.sub(r"[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in fc.readlines()
        ]
        for k, code in enumerate(codes):
            doc = Document()
            doc.add(StoredField("id", str(k)))
            doc.add(TextField("code", code, Field.Store.YES))
            writer.addDocument(doc)

    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()

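# A hedged usage sketch (not part of the original) of querying the "code"
# field built above with the same WhitespaceAnalyzer and the same
# sanitisation; `file_dir` and `query_text` are illustrative parameters.
import re
from java.nio.file import Paths
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory

def search_code(file_dir, query_text, topk=5):
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    # apply the same cleanup build_index() applied to the indexed code
    cleaned = re.sub(r"[\W\s]+|AND|NOT|OR", ' ', query_text.strip())
    query = QueryParser("code", WhitespaceAnalyzer()).parse(cleaned)
    hits = searcher.search(query, topk)
    results = [(searcher.doc(sd.doc).get("id"), sd.score)
               for sd in hits.scoreDocs]
    reader.close()
    return results
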
def binary(self, b):
    c = JArray('byte')(b)
    field = StoredField("bin", c)
    v = field.binaryValue().bytes
    assert c == v and b == [a for a in v]

def addStoredFields(doc, metaDict):
    for key, value in metaDict.items():
        value = '' if value is None else value
        doc.add(StoredField(key, value))

def create_index_from_folder(folder, index_file):
    """Lets Lucene create an index of all database files within a specified folder

    :param folder: absolute or relative path to database files
    :param index_file: absolute or relative output location for index

    Notes:
    - Does not go through the database folder recursively, i.e. all files
      have to be at the root of the folder
    - Only CSV files are supported
    - Column headers are hardcoded and should follow:
      ID, text, Reddit ID, subreddit, meta, time, author, ups, downs,
      authorlinkkarma, authorkarma, authorisgold
    """
    # Set up Lucene
    print()
    print("Starting Lucene ...")
    lucene.initVM()

    index_store = SimpleFSDirectory.open(File(index_file).toPath())
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)

    print()

    # Go through files, add rows of each as Documents to writer
    for file in os.listdir(folder):
        if file.endswith(".csv"):
            print("Indexing {} ...".format(file), end=" ", flush=True)
            with open(os.path.join(folder, file), newline='') as db:
                reader = csv.reader(db)

                # The Reddit database seems to carry a lot of duplicate posts,
                # so we try to skip those
                post_ids = set()
                duplicate_counter = 0

                # To store term vectors (used for query expansion) we have to
                # use a custom field type
                customfield = FieldType()
                customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
                customfield.setStored(True)
                customfield.setTokenized(True)
                customfield.setStoreTermVectors(True)

                # CSV files have a useless first row...
                skipfirst = True
                # ... and a useless first column. Skip both.
                for (_, text, rid, subreddit, meta, time, author, ups, downs,
                        authorlinkkarma, authorkarma, authorisgold) in reader:
                    if skipfirst:
                        skipfirst = False
                        continue
                    doc = Document()

                    if rid in post_ids:
                        duplicate_counter += 1
                        continue  # skip duplicate post
                    else:
                        post_ids.add(rid)

                    # Tokenize, index and store
                    doc.add(Field("text", text, customfield))

                    # Index and store
                    doc.add(StringField("id", rid, Field.Store.YES))
                    doc.add(StringField("subreddit", subreddit, Field.Store.YES))
                    doc.add(StringField("meta", meta, Field.Store.YES))
                    doc.add(StringField("time", time, Field.Store.YES))
                    doc.add(StringField("author", author, Field.Store.YES))

                    # Store only
                    doc.add(StoredField("ups", ups))
                    doc.add(StoredField("downs", downs))
                    doc.add(StoredField("authorlinkkarma", authorlinkkarma))
                    doc.add(StoredField("authorkarma", authorkarma))
                    doc.add(StoredField("authorisgold", authorisgold))

                    writer.addDocument(doc)

            print("DONE!\t(Duplicate posts skipped: {})".format(
                duplicate_counter))

    writer.commit()
    writer.close()

    print()
    print("Finished indexing!")

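# A minimal sketch (not part of the original) of reading back the term
# vectors stored on the "text" field above, e.g. as raw material for query
# expansion. `reader` is assumed to be an open IndexReader over this index
# and `doc_id` one of its internal document ids.
def term_frequencies(reader, doc_id):
    freqs = {}
    terms = reader.getTermVector(doc_id, "text")
    if terms is None:
        return freqs  # document had no term vector for this field
    terms_enum = terms.iterator()
    term = terms_enum.next()
    while term is not None:
        freqs[term.utf8ToString()] = int(terms_enum.totalTermFreq())
        term = terms_enum.next()
    return freqs
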
import lucene
import lxml.html as LH  # assumed source of the LH alias used below
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, StoredField, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
# get_all_rawtext_ids() and get_rawtext_by_id() are project-local helpers

if __name__ == "__main__":
    lucene.initVM()
    path = Paths.get('index')
    indexDir = SimpleFSDirectory(path)
    analyzer = StandardAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print("%d docs in index" % writer.numDocs())
    print("Indexing raw text documents ...")

    todo = get_all_rawtext_ids()
    for n, i in enumerate(todo):
        try:
            html = get_rawtext_by_id(i).html
            root = LH.fromstring(html)
            text = root.text_content().strip()
        except Exception:
            # skip documents whose HTML cannot be parsed
            continue
        doc = Document()
        doc.add(TextField("text", text, Field.Store.NO))
        doc.add(StoredField("id", i))
        writer.addDocument(doc)
        if n % 1000 == 0:
            print("Indexed %d files (%d docs in index)" % (n, writer.numDocs()))

    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()

pmid_query = TermQuery(Term('pmid', json_doc['pmid']))
pmcid_query = TermQuery(Term('pmcid', json_doc['pmcid']))
id_query = IntPoint.newRangeQuery("id", json_doc['id'], json_doc['id'])

bq = BooleanQuery.Builder()
bq.add(pmid_query, BooleanClause.Occur.MUST)
bq.add(pmcid_query, BooleanClause.Occur.MUST)
bq.add(id_query, BooleanClause.Occur.MUST)
q = bq.build()
# remove any previously indexed version of this abstract
writer.deleteDocuments(q)

# Add whole abstract.
doc = Document()
doc.add(IntPoint('id', json_doc['id']))       # indexed for range queries
doc.add(StoredField('id', json_doc['id']))    # stored for retrieval
doc.add(StringField('pmid', json_doc['pmid'], Field.Store.YES))
doc.add(StringField('pmcid', json_doc['pmcid'], Field.Store.YES))
# Indexed only.
doc.add(StringField('article_type', json_doc['article_type'],
                    Field.Store.NO))
doc.add(StringField('type', json_doc['type'], Field.Store.NO))
doc.add(StringField('sec_type', json_doc['sec_type'], Field.Store.NO))
doc.add(Field('text', json_doc['text'], t1))
writer.addDocument(doc)