def add_field_to_document(document, field_name, value, store, analyzed, boost=None):
    """
    Adds a field to the passed-in Lucene document. If the boost kwarg is
    given, the field is boosted by the specified value.
    """
    field = lucene.Field(field_name, value, store, analyzed)
    if boost:
        field.setBoost(boost)
    document.add(field)
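# A minimal usage sketch of add_field_to_document, assuming the JVM has been
# started with lucene.initVM() and that an IndexWriter named `writer` is open
# elsewhere; the field name, value, and boost below are illustrative only.
doc = lucene.Document()
# Store the title, analyze it for full-text search, and boost it slightly.
add_field_to_document(doc, "title", u"PyLucene field example",
                      lucene.Field.Store.YES, lucene.Field.Index.ANALYZED,
                      boost=1.5)
# writer.addDocument(doc)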
def get_word_list(text, is_list=False, field_name='fieldname'):
    if is_list:
        text = "\n".join(text)

    lucene.initVM(lucene.CLASSPATH)
    analyzer = lucene.KoreanAnalyzer()
    #directory = lucene.FSDirectory.open("/tmp/testindex")
    directory = lucene.RAMDirectory()

    # writer
    writer = lucene.IndexWriter(directory, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    doc = lucene.Document()
    doc.add(lucene.Field(field_name, text,
                         lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    # walk every term indexed for this field and record its document frequency
    ireader = lucene.IndexReader.open(directory, False)
    termenum = ireader.terms(lucene.Term(field_name, ''))
    term = termenum.term()

    word_list = []
    while term and term.field() == field_name:
        termDocs = ireader.termDocs(term)
        termDocs.next()
        word_list.append({'text': term.text(), 'freq': termDocs.freq()})
        term = termenum.next() and termenum.term()

    ireader.close()
    directory.close()
    return word_list
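# A hedged usage sketch for get_word_list; the Korean sample sentence and the
# 'book' field name are illustrative only.
words = get_word_list(u"루씬은 자바로 작성된 검색 라이브러리입니다", field_name='book')
for entry in words:
    print entry['text'], entry['freq']

# A list of strings is also accepted:
#   words = get_word_list([u"first line", u"second line"], is_list=True)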
def indexDocs(self, root, writer):
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                f = open(path)
                contents = f.read().decode('utf8', 'ignore')
                f.close()

                doc = lucene.Document()
                doc.add(lucene.Field("name", filename,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                doc.add(lucene.Field("path", path,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))

                if len(contents) > 0:
                    title = self.getTxtAttribute(contents, 'Title')
                    author = self.getTxtAttribute(contents, 'Author')
                    language = self.getTxtAttribute(contents, 'Language')
                    doc.add(lucene.Field("title", title,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.ANALYZED))
                    doc.add(lucene.Field("author", author,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.ANALYZED))
                    doc.add(lucene.Field("language", language,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.ANALYZED))
                    doc.add(lucene.Field("contents", contents,
                                         lucene.Field.Store.NO,
                                         lucene.Field.Index.ANALYZED))
                else:
                    print "warning: no content in %s" % filename

                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
def index_review_request(self, writer, request):
    if lucene_is_2x:
        lucene_tokenized = lucene.Field.Index.TOKENIZED
        lucene_un_tokenized = lucene.Field.Index.UN_TOKENIZED
    elif lucene_is_3x:
        lucene_tokenized = lucene.Field.Index.ANALYZED
        lucene_un_tokenized = lucene.Field.Index.NOT_ANALYZED
    else:
        assert False

    # There are several fields we want to make available to users.
    # We index them individually, but also create a big hunk of text
    # to use for the default field, so people can just type in a
    # string and get results.
    doc = lucene.Document()
    doc.add(lucene.Field('id', str(request.id),
                         lucene.Field.Store.YES,
                         lucene.Field.Index.NO))
    doc.add(lucene.Field('summary', request.summary,
                         lucene.Field.Store.NO,
                         lucene_tokenized))

    if request.changenum:
        doc.add(lucene.Field('changenum', unicode(request.changenum),
                             lucene.Field.Store.NO,
                             lucene_tokenized))

    # Remove commas, since lucene won't tokenize it right with them
    bugs = ' '.join(request.bugs_closed.split(','))
    doc.add(lucene.Field('bug', bugs,
                         lucene.Field.Store.NO,
                         lucene_tokenized))

    name = ' '.join([request.submitter.username,
                     request.submitter.get_full_name()])
    doc.add(lucene.Field('author', name,
                         lucene.Field.Store.NO,
                         lucene_tokenized))
    doc.add(lucene.Field('username', request.submitter.username,
                         lucene.Field.Store.NO,
                         lucene_un_tokenized))

    # FIXME: index reviews
    # FIXME: index dates

    files = []
    if request.diffset_history:
        for diffset in request.diffset_history.diffsets.all():
            for filediff in diffset.files.all():
                if filediff.source_file:
                    files.append(filediff.source_file)
                if filediff.dest_file:
                    files.append(filediff.dest_file)

    aggregate_files = '\n'.join(set(files))
    # FIXME: this tokenization doesn't let people search for files
    # in a really natural way. It'll split on '/' which handles the
    # majority case, but it'd be nice to be able to drill down
    # (main.cc, vmuiLinux/main.cc, and player/linux/main.cc)
    doc.add(lucene.Field('file', aggregate_files,
                         lucene.Field.Store.NO,
                         lucene_tokenized))

    text = '\n'.join([request.summary,
                      request.description,
                      unicode(request.changenum),
                      request.testing_done,
                      bugs,
                      name,
                      aggregate_files])
    doc.add(lucene.Field('text', text,
                         lucene.Field.Store.NO,
                         lucene_tokenized))

    writer.addDocument(doc)
def UpdateIndex(self):
    # Refresh the index with the latest contents
    writer = lucene.IndexWriter(self.indexDir, self.analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))
    try:
        # Pull the rows fetched from the database
        for row in self.rows:
            doc = lucene.Document()
            doc.add(lucene.Field("bookUrl", row[0],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
            doc.add(lucene.Field("thumbUrl", row[1],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
            doc.add(lucene.Field("price", row[2],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
            doc.add(lucene.Field("title", row[3],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
            doc.add(lucene.Field("subTitle", row[4],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
            doc.add(lucene.Field("author", row[5],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
            doc.add(lucene.Field("publisher", row[6],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
            doc.add(lucene.Field("publishDate", row[7],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
            doc.add(lucene.Field("offcode", row[8],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))

            # Strip the dashes from the date string before indexing it
            date = ''.join(str(row[9]).split('-'))
            print 'regDate : ' + date + ' ' + str(type(date))
            doc.add(lucene.Field("regDate", date,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))

            date = ''.join(str(row[10]).split('-'))
            print 'updateDate : ' + date
            doc.add(lucene.Field("updateDate", date,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))

            writer.addDocument(doc)
    except Exception, e:
        print "Failed in adding index : %s" % e
        exit(1)
def __makeUnStoredField(self, fieldname, fielddata):
    return lucene.Field(fieldname, fielddata,
                        lucene.Field.Store.NO,
                        lucene.Field.Index.TOKENIZED)
def __makeUnIndexedField(self, fieldname, fielddata):
    return lucene.Field(fieldname, fielddata,
                        lucene.Field.Store.YES,
                        lucene.Field.Index.NO)
def __makeKeywordField(self, fieldname, fielddata):
    return lucene.Field(fieldname, fielddata,
                        lucene.Field.Store.YES,
                        lucene.Field.Index.UN_TOKENIZED)
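# A hedged sketch of how the three field helpers above might be combined when
# building a document inside the same indexer class; the method name, field
# names, and the `page` dict are illustrative assumptions, not part of the
# original code.
def __makeDocument(self, page):
    doc = lucene.Document()
    # Body text: searchable but not stored back in the index.
    doc.add(self.__makeUnStoredField("contents", page['contents']))
    # URL: stored for display only, never searched directly.
    doc.add(self.__makeUnIndexedField("url", page['url']))
    # Document id: stored and indexed as a single, untokenized keyword.
    doc.add(self.__makeKeywordField("docid", page['docid']))
    return doc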
lucene.initVM()
print("lucene version is:", lucene.VERSION)

# Get the analyzer
analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

# Get index storage
store = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))

# Get index writer
writer = lucene.IndexWriter(store, analyzer, True,
                            lucene.IndexWriter.MaxFieldLength.LIMITED)

try:
    # Create a document that will be added to the index
    doc = lucene.Document()
    # Add a field to this document; lucene.Field also needs a value argument,
    # which the original call omitted, so "India" here is only a placeholder.
    field = lucene.Field("titlendia", "India",
                         lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    # Add this field to the document
    doc.add(field)
    # Add the document to the index
    writer.addDocument(doc)
except Exception as e:
    print("Failed in indexDocs:", e)
finally:
    writer.close()
def indexDocs(self, root, writer):
    t = open('index.txt', "r")
    while True:
        line = t.readline()
        if line:
            try:
                line = line.strip().split()
                url = line[0]
                doc_name = line[1]
                print "adding", url
                path = os.path.join(root, doc_name)
                f = open(path)
                tmp = f.read()  # str
                f.close()

                try:
                    domain = get_tld(url)
                except:
                    domain = "Unknown"
                print domain

                charset = chardet.detect(tmp)['encoding']
                if charset is None:
                    charset = 'utf-8'
                contents = tmp.decode(charset, 'ignore')

                soup = BeautifulSoup(contents)
                try:
                    title = soup.title.text
                    tmp = str(title).replace('\n', '')
                    title = tmp.decode('utf-8')
                except:
                    title = "None"
                print title

                contents = soup.get_text()
                contents = " ".join(jieba.cut(contents))

                doc = lucene.Document()
                doc.add(lucene.Field("name", doc_name,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                doc.add(lucene.Field("path", path,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                doc.add(lucene.Field("url", url,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                doc.add(lucene.Field("title", title,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                doc.add(lucene.Field("site", domain,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.ANALYZED))
                if len(contents) > 0:
                    doc.add(lucene.Field("contents", contents,
                                         lucene.Field.Store.NO,
                                         lucene.Field.Index.ANALYZED))
                else:
                    print "warning: no content in %s" % doc_name
                # IndexWriter.addDocument writes the document into the index directory.
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
        else:
            break
    t.close()
def indexDocs(self, root, writer):
    t = open('index.txt', "r")
    while True:
        line = t.readline()
        if line:
            try:
                line = line.strip().split()
                url = line[0]
                doc_name = line[1]
                print "adding", url
                path = os.path.join(root, doc_name)
                f = open(path)
                tmp = f.read()  # str
                f.close()
                contents = tmp.decode('gbk', 'ignore')

                soup = BeautifulSoup(contents)
                try:
                    title = soup.title.text
                    tmp = str(title).replace('\n', '')
                    title = tmp.decode('utf-8')
                except:
                    title = "None"
                print title

                # Collect each image url and its corresponding description.
                collection = []

                # Handle the big image at the top left of the page and the
                # small thumbnails below it.
                p_box = soup.find(id='p-box')
                sub_p_box = p_box.div.nextSibling.nextSibling.nextSibling.nextSibling
                big_pic = sub_p_box.div.div.div.img
                collection.append({
                    'imgurl': urlparse.urljoin(url, big_pic.get('src', '')),
                    'discription': big_pic.get('alt', ''),
                })

                small_pic_group = big_pic.parent.nextSibling.nextSibling.div.ul
                for i in small_pic_group.findAll('li'):
                    small_pic = i.img
                    # Build a new dict per image; reusing one dict would make
                    # every entry in the list point at the same (last) image.
                    collection.append({
                        'imgurl': urlparse.urljoin(url, small_pic.get('src', '')),
                        'discription': " ".join(jieba.cut(small_pic.get('alt', ''))),
                    })

                doc = lucene.Document()
                for i in collection:
                    imgurl = i['imgurl']
                    discription = i['discription']
                    doc.add(lucene.Field("imgurl", imgurl,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.NOT_ANALYZED))
                    doc.add(lucene.Field("discription", discription,
                                         lucene.Field.Store.NO,
                                         lucene.Field.Index.ANALYZED))
                    doc.add(lucene.Field("url", url,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.NOT_ANALYZED))
                    doc.add(lucene.Field("urltitle", title,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.NOT_ANALYZED))
                # IndexWriter.addDocument writes the document into the index directory.
                writer.addDocument(doc)
                print "----------------------------------------------------"
            except Exception, e:
                print "Failed in indexDocs:", e
        else:
            break
    t.close()
lucene.initVM()
indexDir = "D:/Downloads/index"
dir_ = lucene.SimpleFSDirectory(lucene.File(indexDir))
analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
writer = lucene.IndexWriter(dir_, analyzer, True,
                            lucene.IndexWriter.MaxFieldLength(512))

print("Currently there are %d documents in the index..." % writer.numDocs())

content = (
    "Strategische Konzeption, Umsetzung und Betreuung von langfristig "
    "hochwirksamen und messbar erfolgreichen Maßnahmen in Social Media.")
doc = lucene.Document()
doc.add(lucene.Field("content", content,
                     lucene.Field.Store.YES,
                     lucene.Field.Index.ANALYZED))
doc.add(lucene.Field("filePath", "Projekte/bericht.txt",
                     lucene.Field.Store.YES,
                     lucene.Field.Index.ANALYZED))
writer.addDocument(doc)

content = (
    "Design von Marken, Screens und Interfaces sowie Entwicklung von "
    "individuellen Facebook Apps, iPhone Apps und Webauftritten.")
doc = lucene.Document()
doc.add(lucene.Field("content", content,
                     lucene.Field.Store.YES,
                     lucene.Field.Index.ANALYZED))
doc.add(lucene.Field("filePath", "Projekte/implementierung.txt",
                     lucene.Field.Store.YES,
                     lucene.Field.Index.ANALYZED))
writer.addDocument(doc)
writer.close()