def setUp(self):
    super(PyLuceneThreadTestCase, self).setUp()

    self.classLoader = Thread.currentThread().getContextClassLoader()

    writer = self.getWriter(analyzer=StandardAnalyzer())

    doc1 = Document()
    doc2 = Document()
    doc3 = Document()
    doc4 = Document()
    doc1.add(Field("field", "one", TextField.TYPE_STORED))
    doc2.add(Field("field", "two", TextField.TYPE_STORED))
    doc3.add(Field("field", "three", TextField.TYPE_STORED))
    doc4.add(Field("field", "one", TextField.TYPE_STORED))

    writer.addDocument(doc1)
    writer.addDocument(doc2)
    writer.addDocument(doc3)
    writer.addDocument(doc4)
    writer.commit()
    writer.close()

    self.testData = [('one', 2), ('two', 1), ('three', 1), ('five', 0)] * 500
    self.lock = threading.Lock()
    self.totalQueries = 0
def index(personDB, familyDB, relationDB):
    #config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)
    #?# indexWriter.setRAMBufferSizeMB(50); CHECK 256

    mt = matchtext()

    # Person matchtext
    for p in personDB.find({}, no_cursor_timeout=True):
        matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
        doc = Document()
        doc.add(Field('uid', str(p['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex', str(p['sex']), StringField.TYPE_STORED))
        doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    # Family matchtext
    for f in familyDB.find():
        matchtxt = mt.matchtextFamily(f, familyDB, personDB, relationDB)
        doc = Document()
        doc.add(Field('uid', str(f['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex', 'FAM', StringField.TYPE_STORED))
        doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
    return
def setUp(self):
    super(Test_Bug1763, self).setUp()

    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.d1 = RAMDirectory()
    self.d2 = RAMDirectory()

    w1, w2 = [self.getWriter(directory=d, analyzer=self.analyzer)
              for d in [self.d1, self.d2]]

    doc1 = Document()
    doc2 = Document()
    doc1.add(Field("all", "blah blah double blah Gesundheit",
                   TextField.TYPE_NOT_STORED))
    doc1.add(Field('id', '1', StoredField.TYPE))
    doc2.add(Field("all", "a quick brown test ran over the lazy data",
                   TextField.TYPE_NOT_STORED))
    doc2.add(Field('id', '2', StoredField.TYPE))

    w1.addDocument(doc1)
    w2.addDocument(doc2)

    for w in [w1, w2]:
        w.close()
def get_document(fname, split_by=None):
    docs = []
    _name = os.path.split(fname)[-1]
    with open(fname) as f:
        contents = f.read()
    if split_by:
        paragraphs = contents.split(split_by)
        for ix, par in enumerate(paragraphs):
            if not par:
                continue
            doc = Document()
            name = "{}_{}".format(_name, ix)
            doc.add(Field('filename', name,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field('content', par,
                          Field.Store.YES, Field.Index.ANALYZED))
            docs.append(doc)
    else:
        doc = Document()
        doc.add(Field('filename', _name,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', contents,
                      Field.Store.YES, Field.Index.ANALYZED))
        docs.append(doc)
    return docs
def build(self, index):
    writer = self.getWriter(directory=index.index,
                            analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT))

    seed(101)
    for d in xrange(self.minId, self.maxId + 1):
        doc = Document()
        doc.add(Field("id", self.pad(d), StringField.TYPE_STORED))
        if index.allowNegativeRandomInts:
            r = randint(~self.MAX_INT, self.MAX_INT)
        else:
            r = randint(0, self.MAX_INT)
        if index.maxR < r:
            index.maxR = r
        if r < index.minR:
            index.minR = r
        doc.add(Field("rand", self.pad(r), StringField.TYPE_STORED))
        doc.add(Field("body", "body", StringField.TYPE_STORED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
def addDoc(w, data):
    doc = Document()
    #print('----------------------------')
    for field in data:
        # ftype instead of type, to avoid shadowing the builtin
        value, ftype = data[field][0], data[field][1]
        #print('field:%s type:%s' % (field, ftype))
        #print(value + '\n')
        if ftype == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif ftype == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif ftype == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif ftype == 'CUSTOM_FIELD_TEXT_DF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_DF))
        elif ftype == 'CUSTOM_FIELD_TEXT_BF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_BF))
        elif ftype == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')
    w.addDocument(doc)
def generate_indices_from_benchmark(writer, counter):
    javafiles = java_files_from_dir(source_path)
    i = 0
    j = 0
    for javafile in javafiles:
        # print(javafile)
        i += 1
        if i % 1000 == 0:  # report progress every 1000 files
            print("Counter: %s" % i)
            print("typed_method_call: %s" % counter.typed_method_call_count)
        document = Document()
        document.add(Field("file", javafile, Field.Store.YES, Field.Index.NO))
        try:
            with codecs.open(javafile, "r", encoding='utf-8',
                             errors='ignore') as f:
                file_content = f.read().encode("utf-8", errors='ignore')
            ast = parse(file_content, resolve=False)
            if add_code_keyword_into_document(document, file_content, ast,
                                              counter):
                writer.addDocument(document)
                j += 1
                if j % 1000 == 0:
                    print("Wrote: %s files" % j)
        except Exception as e:
            print("Error: %s" % e)
            continue
    print("Number of files: %s" % i)
    print("Number of duplicates: %s" % len(hashes))
    print("%s files have been indexed" % j)
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    index_file = open("index.txt", 'r')
    for line in index_file.readlines():
        try:
            fields = line.strip().split('\t')
            src, filename, tag = fields[0], fields[1], fields[2]
            path = os.path.join(root, filename)
            doc = Document()
            doc.add(Field("name", filename, t1))
            doc.add(Field("path", root, t1))
            doc.add(Field("src", src, t1))
            if len(tag) > 0:
                doc.add(Field("tag", tag, t2))
            else:
                print "warning: no tag in %s" % filename
            writer.addDocument(doc)
        except Exception as e:
            print "Failed in indexDocs:", e
def indexDictionary(d, writer):
    for k, v in d.iteritems():
        doc = Document()
        doc.add(Field('filename', k, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', v, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    return writer.numDocs()
def index_docs(root, writer):
    # metadata: name and path
    metadata = FieldType()
    metadata.setStored(True)  # as is value
    metadata.setTokenized(False)
    metadata.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    # content: abstract and body
    content_type = FieldType()
    content_type.setStored(True)  # to highlight on search results
    content_type.setTokenized(True)  # tokenize words
    content_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for directory, _, file_names in walk(root):
        for file_name in file_names:
            name, extension = splitext(file_name)
            if extension not in DOC_FORMATS:
                continue  # skip unsupported formats
            file_path = join(directory, file_name)
            print ' ', file_path

            # Build indexed document
            doc = Document()
            doc.add(Field('name', file_name, metadata))
            doc.add(Field('path', directory, metadata))

            # Read file contents
            content = process(file_path, 'utf-8', method='pdfminer')
            abstract = extract_abstract(content)
            doc.add(Field('content', content, content_type))
            doc.add(Field('abstract', abstract, content_type))

            writer.addDocument(doc)
def addDoc(self, writer, text):
    d = Document()
    f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)
    d.add(f)
    writer.addDocument(d)
def index_single_file(self, doc_file):
    logger.info("adding {}".format(doc_file))
    lucene_doc_num = 0
    try:
        with open(doc_file) as df:
            for line in df:
                wiki_doc = json.loads(line)
                doc_title = wiki_doc['title']
                doc_text = wiki_doc['plaintext']
                doc_id = wiki_doc['_id']
                paragraphs = doc_text.split('\n\n')
                if len(paragraphs) < 3:
                    continue
                doc_text = rm_special_chars(doc_text)
                doc = Document()
                doc.add(StringField("id", str(doc_id), Field.Store.YES))
                doc.add(TextField("title", doc_title, Field.Store.YES))
                doc.add(TextField("text", doc_text, Field.Store.YES))
                self.writer.addDocument(doc)
                lucene_doc_num += 1
                if lucene_doc_num % 10000 == 0:
                    logger.info('added {} lucene docs'.format(lucene_doc_num))
    except Exception as e:
        import traceback
        traceback.print_tb(e.__traceback__)
        logger.error("Failed in: {}".format(doc_file))
    return lucene_doc_num
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    inFile = open(str(args["inputFile"]))
    indexName = inFile.readline()
    while indexName != '':
        print "adding", indexName
        doc = Document()
        doc.add(Field("name", indexName, t1))
        #doc.add(Field("path", root, t1))
        text = inFile.readline()
        if len(text) > 0:
            print "contents: %s\n" % text
            doc.add(Field("contents", text, t2))
        else:
            print "warning: no content in %s" % indexName
        indexName = inFile.readline()
        writer.addDocument(doc)
    inFile.close()
def build_index(document_path, dir_path):
    lucene.initVM()
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(index_dir, config)

    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(False)

    with open(document_path) as input_file:
        for line in input_file:
            segs = line.strip().split(" ")
            music_path, music_tags = segs[0], segs[1].split(",")

            document = Document()
            document.add(Field("content", " ".join(music_tags), t1))
            document.add(Field("url", music_path, t2))
            index_writer.addDocument(document)

    index_writer.close()
def indexDocs(self, root, writer):
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            print("adding", filename)
            path = os.path.join(root, filename)
            file = open(path, encoding='utf8')
            url = file.readline()
            title = file.readline()
            contents = file.read()
            file.close()

            img_url = self.getTxtAttribute(contents, 'img_url')
            img_info = self.getTxtAttribute(contents, 'img_info')
            for i in range(len(img_url)):
                if len(img_info[i]) > 0:
                    doc = Document()
                    doc.add(StringField('title', title, Field.Store.YES))
                    doc.add(StringField('url', url, Field.Store.YES))
                    doc.add(StringField('img_url', img_url[i], Field.Store.YES))

                    seg_contents = jieba.lcut_for_search(img_info[i])
                    contents = ' '.join(seg_contents)
                    doc.add(TextField('contents', contents, Field.Store.YES))

                    writer.addDocument(doc)
                else:
                    continue
def indexDocs(self, sourceDir, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for sourceDir, dirnames, filenames in os.walk(sourceDir):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print(filename)
            try:
                path = os.path.join(sourceDir, filename)
                file = open(path, 'r', encoding="utf-8")
                contents = file.read()
                #contents = str(filecontent, 'utf-8')
                #contents = filecontent.encode('utf-8')
                #print('path', path, len(contents))
                doc = Document()
                doc.add(Field("name", filename, t1))  # filename (title)
                #doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field(queryField, contents, t2))  # content
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
                file.close()
            except NameError:
                print("Failed in indexDocs:")
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(Field("text", sentence, Field.Store.YES,
                              Field.Index.ANALYZED))
                writer.addDocument(doc)
            print "Done %s" % (path + filen)

    print "Indexed (%d docs in index)" % writer.numDocs()
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def indexer(self, root, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    def replacer(text):
        chars = '\\`*_{}[]()>#+-.!$‘'
        for c in chars:
            if c in text:
                text = text.replace(c, ' ')
        return text

    for root, dirnames, filenames in os.walk(root):
        i = 0
        for filename in filenames:
            i += 1
            with open(os.path.join(root, filename)) as f:
                for line in f.readlines():
                    line = line.split(' ', 2)
                    docname = line[0] + ' ' + line[1]
                    name = replacer(line[0])
                    contents = line[2]
                    doc = Document()
                    doc.add(Field('docname', docname, t1))
                    doc.add(Field('name', name, t1))
                    doc.add(Field('contents', contents, t1))
                    writer.addDocument(doc)
            print('File %d done indexing' % i)
def indexDocs(self, root, writer):
    path = root + "/data/*/*.xml"
    # print(path)
    xml_files = glob.glob(path)
    # xml_files = ["HAM2-031201.xml"]
    numDocs = 0
    for xml in xml_files:
        try:
            parser = etree.XMLParser(recover=False, strip_cdata=False)
            tree = etree.parse(xml, parser=parser)
        except etree.XMLSyntaxError as e:
            parser = etree.XMLParser(recover=True, strip_cdata=False)
            tree = etree.parse(xml, parser=parser)
        root = tree.getroot()
        for text in root.iter("TEXT"):
            contents = "".join(text.xpath("text()")).strip()
            doc_no = text.getparent().find("DOCNO").text
            # print("adding", doc_no)
            try:
                doc = Document()
                doc.add(StringField("id", doc_no, Field.Store.YES))
                if len(contents) > 0:
                    doc.add(TextField("contents", contents, Field.Store.YES))
                else:
                    pass  # print("warning: no content in %s" % doc_no)
                writer.addDocument(doc)
                numDocs += 1
            except Exception as e:
                print("Failed in indexDocs:", e)
    return numDocs
def test_indexDocumentWithUnicodeText(self):
    store = self.openStore()
    writer = None
    try:
        analyzer = self.getAnalyzer()
        writer = self.getWriter(store, analyzer, True)

        doc = Document()
        doc.add(Field("title", "value of testing", TextField.TYPE_STORED))
        doc.add(Field("docid", str(1), StringField.TYPE_NOT_STORED))
        doc.add(Field("owner", "unittester", StringField.TYPE_STORED))
        doc.add(Field("search_name", "wisdom", StoredField.TYPE))
        doc.add(Field("meta_words", "rabbits are beautiful",
                      TextField.TYPE_NOT_STORED))

        # using a unicode body causes problems, which seems very odd
        # since the python type is the same regardless after doing
        # the encode
        body_text = u"hello world" * 20
        body_reader = StringReader(body_text)
        doc.add(Field("content", body_reader, TextField.TYPE_NOT_STORED))

        writer.addDocument(doc)
    finally:
        self.closeStore(store, writer)
def getDoc(self, file):
    try:
        f = open(os.getcwd() + FILE_DIR + '/' + file, "r")
        try:
            c = []
            s = BeautifulSoup(f, 'html.parser')
            text = s.findAll(text=True)
            c = filter(tag_vis, text)
            try:
                c = ' '.join(c)
            except Exception as e:
                c = b' '.join(c)
        except Exception as e:
            print(str(e))
            return

        content = TextField("contents", c, Field.Store.YES)
        fileName = str(Paths.get(file)).split('/')[-1]
        fileName = fileName[:fileName.find(".")]
        filename = TextField("filename", fileName, Field.Store.YES)
        path = TextField("filepath",
                         str(os.getcwd() + FILE_DIR + '/' + file),
                         Field.Store.NO)

        doc = Document()
        doc.add(content)
        doc.add(filename)
        doc.add(path)
        return doc
    except Exception as e:
        print(type(e).__name__)  # report the actual exception type
        print(str(e))
        return
def indexsents(self, sentences, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for i, sent in enumerate(sentences):
        #print "adding", i, sent
        try:
            root = os.getcwd()
            #contents = unicode(sent, 'iso-8859-1')
            doc = Document()
            doc.add(Field("name", str(i), t1))
            doc.add(Field("path", root, t1))
            if len(sent) > 0:
                doc.add(Field("contents", sent.lower(), t2))
            else:
                print "warning: no content in %s" % str(i)
            writer.addDocument(doc)
        except Exception as e:
            print "Failed in indexsents:", e
def Indexing(self, writer):
    print("Indexing Segmented File [", SEGMENTATION_FILE, "]")
    with open(SEGMENTATION_FILE, 'r') as f:
        line_count = 0
        for line in f:
            # field type for the context: indexed, stored, and tokenized
            fieldtype_context = FieldType()
            fieldtype_context.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
            fieldtype_context.setStored(True)
            fieldtype_context.setTokenized(True)

            # field type for the phrase: stored only
            fieldtype_phrase = FieldType()
            fieldtype_phrase.setStored(True)

            # process the segmented line, storing words and
            # part-of-speech tags separately
            processed_context, processed_phrase = self.process_line(line)

            doc = Document()
            # the context field records the text of the article
            doc.add(Field('context', processed_context, fieldtype_context))
            # the phrase field records the part of speech of each word
            doc.add(Field('phrase', processed_phrase, fieldtype_phrase))
            # write the document to the index
            writer.addDocument(doc)

            # progress tracking
            print("\r", str(line_count), " lines", end="", flush=True)
            line_count = line_count + 1
            if line_count > self.index_limit and not self.training:
                break
    writer.close()
    print()
def build_index(self, dict_data):
    print("loading data...")

    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for k, v in dict_data.items():
        doc = Document()
        doc.add(Field("id", k, t1))
        doc.add(Field("content", v, t2))
        self.writer.addDocument(doc)

    ticker = Ticker()
    print("commit index")
    threading.Thread(target=ticker.run).start()
    self.writer.commit()
    self.writer.close()
    ticker.tick = False
    print("done")
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(False)
    t1.setStored(True)
    t1.setTokenized(False)

    t3 = FieldType()
    t3.setIndexed(True)
    t3.setStored(False)
    t3.setTokenized(True)  # tokenized by the preconfigured analyzer, here on whitespace
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    total = 0
    file = open(root, "r")
    for line in file.readlines():
        try:
            imgurl, itemurl, content = line.split('\t')
            total += 1
            print total
            print "adding", content
            contents = ' '.join(jieba.cut(content))
            doc = Document()
            doc.add(Field("imgurl", imgurl, t1))
            doc.add(Field("itemurl", itemurl, t1))
            doc.add(Field("title", content, t1))
            doc.add(Field("contents", contents, t3))
            writer.addDocument(doc)
        except Exception as e:
            print "Failed in indexDocs:", e
def to_document(self):
    def bool_to_int(bv):
        if bv:
            return 1
        return 0

    doc = Document()
    doc.add(StringField('func_name', self.func_name, Field.Store.YES))
    doc.add(StringField('id_isint',
                        str(bool_to_int(isinstance(self.p_id, (int, long)))),
                        Field.Store.YES))
    if isinstance(self.p_id, unicode):
        doc.add(StringField('id', self.p_id.encode('utf8'), Field.Store.YES))
    else:
        doc.add(StringField('id', str(self.p_id), Field.Store.YES))
    doc.add(StringField('start_isint',
                        str(bool_to_int(isinstance(self.p_start, (int, long)))),
                        Field.Store.YES))
    doc.add(StringField('start', str(self.p_start), Field.Store.YES))
    doc.add(StringField('pagesize', str(self.p_pagesize), Field.Store.YES))
    doc.add(StringField('pextra', str(self.p_extra), Field.Store.YES))
    doc.add(StringField('fails', str(self.fails), Field.Store.YES))
    doc.add(StringField('finish_time', str(self.finish_time), Field.Store.YES))
    doc.add(StringField('docid', str(self.docid), Field.Store.YES))
    return doc
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'gbk')
                file.close()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception as e:
                print "Failed in indexDocs:", e
def testCompressionTools(self):
    bytes = JArray('byte')(self.binaryValCompressed)
    binaryFldCompressed = StoredField("binaryCompressed",
                                      CompressionTools.compress(bytes))
    stringFldCompressed = StoredField(
        "stringCompressed",
        CompressionTools.compressString(self.binaryValCompressed))

    doc = Document()
    doc.add(binaryFldCompressed)
    doc.add(stringFldCompressed)

    # add the doc to a ram index
    writer = self.getWriter(analyzer=StandardAnalyzer())
    writer.addDocument(doc)
    writer.close()

    # open a reader and fetch the document
    reader = self.getReader()
    docFromReader = reader.document(0)
    self.assertTrue(docFromReader is not None)

    # fetch the binary compressed field and compare its content with
    # the original one
    bytes = CompressionTools.decompress(
        docFromReader.getBinaryValue("binaryCompressed"))
    binaryFldCompressedTest = bytes.string_
    self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)

    self.assertEqual(
        CompressionTools.decompressString(
            docFromReader.getBinaryValue("stringCompressed")),
        self.binaryValCompressed)

    reader.close()
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # traverse through the doc directory
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            # if not filename.endswith('.cdc'):
            #     continue
            try:
                # only add the filename and path for indexing
                path = os.path.join(root, filename)
                print "adding file : ", path
                file = open(path)
                contents = unicode(file.read(), 'utf-8')
                file.close()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print "warning: no content in ", filename
                writer.addDocument(doc)
            except Exception as e:
                print "failed in indexDocs:", e
def index_one(self, article):
    """Create index for one url object in the database."""
    try:
        date_published_str = article['date_published'].strftime(
            self.date_format)
    except Exception as e:
        logger.warning('Error when formatting date_published %r: %s',
                       article['canonical_url'], e)
        return

    doc = Document()
    doc.add(StoredField('group_id', article['group_id']))
    doc.add(StoredField('article_id', article['article_id']))
    doc.add(StringField('date_published', date_published_str,
                        Field.Store.YES))
    doc.add(SortedDocValuesField('date_published',
                                 BytesRef(date_published_str)))
    doc.add(StoredField('date_published', date_published_str))
    doc.add(StringField('domain', article['domain'], Field.Store.YES))
    doc.add(StringField('site_type', article['site_type'], Field.Store.YES))
    doc.add(TextField('canonical_url', article['canonical_url'],
                      Field.Store.YES))
    doc.add(TextField('title', article['title'], Field.Store.YES))
    doc.add(TextField('meta', article['meta'], Field.Store.NO))
    doc.add(TextField('content', article['content'], Field.Store.NO))
    doc.add(StoredField('uq_id_str', article['uq_id_str']))
    self.writer.addDocument(doc)