def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setStoreTermVectors(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPositions(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    file_path = root + 'r52-train-all-terms.txt'
    fd = open(file_path)
    contents = fd.readlines()
    fd.close()
    contents_list = [x.strip() for x in contents]
    for i in xrange(len(contents_list)):
        try:
            [topic, content] = contents_list[i].split('\t')
            doc = Document()
            doc.add(Field("id", str(i), t1))
            doc.add(Field("topic", topic, t1))
            doc.add(Field("contents", content, t2))
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e
def wikipedia_indexer(storage, wikipedia_file): lucene.initVM() indexDir = SimpleFSDirectory(File(storage)) stops = CharArraySet(Version.LUCENE_4_10_1, 0, True) for s in stopwords: stops.add(s) analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer) writer = IndexWriter(indexDir, writerConfig) print "%d docs in index" % writer.numDocs() print "Reading Documents" f = open(wikipedia_file) for i, line in enumerate(f): text = line.strip().decode('utf-8').split('\t') title = text[0] if 'disambigu' in text[0] or len(text) < 2: continue text = text[1] doc = Document() doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO)) doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED)) writer.addDocument(doc) if writer.numDocs() % 1000 == 0: print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i) print "Closing index of %d docs..." % writer.numDocs() writer.close()
def indexsents(self, sentences, writer): t1 = FieldType() t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for i, sent in enumerate(sentences): # print 'adding',i, sent try: root = os.getcwd() # contents = unicode(sent, 'iso-8859-1') doc = Document() doc.add(Field('name', str(i), t1)) doc.add(Field('path', root, t1)) if len(sent) > 0: doc.add(Field('contents', sent.lower(), t2)) else: print('warning: no content in %s' % str(i)) writer.addDocument(doc) except Exception as e: print('Failed in indexsents:', e) writer.commit() writer.close()
def add_doc(self, doc_id, title, txt, add_terms): doc = Document() txt = utils.clean(txt) if add_terms: if prm.top_tfidf > 0: words_idx = [] words, _ = utils.top_tfidf(txt.lower(), self.idf, prm.top_tfidf, prm.min_term_freq) if len(words) == 0: words.append('unk') for w in words: if w in self.vocab: words_idx.append(self.vocab[w]) else: words_idx.append(-1) # unknown words. else: txt_ = txt.lower() words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc) words_idx = words_idx[0] words = words[0] doc.add(Field("id", str(doc_id), self.t1)) doc.add(Field("title", title, self.t1)) doc.add(Field("text", txt, self.t2)) if add_terms: doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3)) doc.add(Field("word", '<&>'.join(words), self.t3)) self.writer.addDocument(doc)
def publish_services(self, service_list): transformer = WSDLTransformer() current_document = 1 indexDir = SimpleFSDirectory(File("index/")) writerConfig = IndexWriterConfig( Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT)) writerConfig.setSimilarity(BM25Similarity()) index_writer = IndexWriter(indexDir, writerConfig) for wsdl in service_list: if self._document_expansion: #bag_of_words = ' '.join(self._preprocessor(self._semantic_transformer.transform(transformer.transform(wsdl)))) bag_of_words = ' '.join( self._semantic_transformer.transform( transformer.transform(wsdl))) else: #bag_of_words = ' '.join(self._preprocessor(transformer.transform(wsdl))) bag_of_words = ' '.join(transformer.transform(wsdl)) doc = Document() doc.add( Field("content", bag_of_words, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO)) index_writer.addDocument(doc) current_document += 1 index_writer.close()
def index_image(self, root, writer): t1 = FieldType() t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) with open(os.path.join(root, "index.txt"), mode="r", encoding="utf8") as index: count = 1 for line in index: print("\r", count, end="", sep="") try: image_url, content = line.strip().split()[:2] except ValueError as e: print(e) continue doc = Document() doc.add(Field("raw_content", content, t1)) content = " ".join( word for word in jieba.cut_for_search(content) if word.strip() and word not in self.stop_words) doc.add(Field("url", image_url, t1)) doc.add(Field("content", content, t2)) writer.addDocument(doc) count += 1 print("\n{count} image(s) added.".format(count=count))
def indexMovie(movie): doc = Document() doc.add(Field('id', str(movie), StringField.TYPE_STORED)) at_lest_one_field = False maybe_tags = movies_tags.query('item == @movie') if not maybe_tags.empty: tags = maybe_tags[['tags']].values.flatten()[0] doc.add(Field('tags', tags, TextField.TYPE_NOT_STORED)) at_lest_one_field = True maybe_description = movies_descriptions.query('item == @movie') if not maybe_description.empty: description = maybe_description[['description']].values.flatten()[0] doc.add(Field('description', description, TextField.TYPE_NOT_STORED)) at_lest_one_field = True maybe_genres = movies_genres.query('item == @movie') if not maybe_genres.empty: genres = maybe_genres[['genres']].values.flatten()[0] doc.add(Field('genres', genres, TextField.TYPE_NOT_STORED)) at_lest_one_field = True if at_lest_one_field: writer.addDocument(doc)
def addDoc(w, data): doc = Document() for field in data: value, type = data[field][0], data[field][1] ''' if type!='INTEGER_STORED': #print ('field=%s len=%d'%(field,len(value))) print ('field=%s value=%s'%(field,value)) else: print ('field=%s value=%d'%(field,value)) ''' if type == 'StringField': doc.add(StringField(field, value, Field.Store.YES)) elif type == 'TextField': doc.add(TextField(field, value, Field.Store.YES)) elif type == 'CUSTOM_FIELD_TEXT': doc.add(Field(field, value, CUSTOM_FIELD_TEXT)) elif type == 'CUSTOM_FIELD_TEXT_NOT_STORED': doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED)) elif type == 'INTEGER_STORED': doc.add(StoredField(field, value)) else: print('UNKNOWN FIELD') try: w.addDocument(doc) except: #print ('error cat=%s'%(data['category'][0])) print('-----------------------------------') for field in data: value, type = data[field][0], data[field][1] print('field=%s\nvalue=%s' % (field, str(value)))
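# Usage sketch for the addDoc helper above (not part of the original source).
# The index path, field names, and values below are illustrative assumptions;
# only the 'StringField'/'TextField'/'INTEGER_STORED' type tags that the helper
# understands are used. Assumes a recent PyLucene where IndexWriterConfig takes
# the analyzer directly and directories are opened via java.nio.file.Paths.
import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM()
directory = SimpleFSDirectory(Paths.get("/tmp/example-index"))  # hypothetical path
w = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))

# each entry maps field name -> (value, type tag understood by addDoc)
data = {
    "id": ("42", 'StringField'),
    "title": ("a quick brown fox", 'TextField'),
    "views": (7, 'INTEGER_STORED'),
}
addDoc(w, data)
w.close()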
def indexer(self, root, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    def replacer(text):
        # strip markdown-like punctuation before indexing the name
        chars = '\\`*_{}[]()>#+-.!$‘'
        for c in chars:
            if c in text:
                text = text.replace(c, ' ')
        return text

    for root, dirnames, filenames in os.walk(root):
        i = 0
        for filename in filenames:
            i += 1
            with open(os.path.join(root, filename)) as f:
                for line in f.readlines():
                    line = line.split(' ', 2)
                    docname = line[0] + ' ' + line[1]
                    name = replacer(line[0])
                    contents = line[2]
                    doc = Document()
                    doc.add(Field('docname', docname, t1))
                    doc.add(Field('name', name, t1))
                    doc.add(Field('contents', contents, t1))
                    writer.addDocument(doc)
            print('File %d done indexing' % i)
def indexDocs(self, root, writer): t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(True) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for root, dirnames, filenames in os.walk(root): # traverse through the doc directory for filename in filenames: # if not filename.endswith('.cdc'): # continue try: # only add the filename and path for indexing path = os.path.join(root, filename) print "adding file : ", path file = open(path) contents = unicode(file.read(), 'utf-8') file.close() doc = Document() doc.add(Field("name", filename, t1)) doc.add(Field("path", root, t1)) if len(contents) > 0: doc.add(Field("contents", contents, t2)) else: print "warning: no content in ", filename writer.addDocument(doc) except Exception, e: print "failed in indexDocs:", e
def indexDocs(self, root, writer): t1 = FieldType() t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setStored(True) t2.setTokenized(True) t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) inFile = open(str(args["inputFile"])) indexName = inFile.readline() while (indexName != ''): print "adding", indexName doc = Document() doc.add(Field("name", indexName, t1)) #doc.add(Field("path", root, t1)) text = inFile.readline() if (len(text) > 0): print("contents: %s\n" % text) doc.add(Field("contents", text, t2)) else: print "warning: no content in %s" % indexName indexName = inFile.readline() writer.addDocument(doc) inFile.close()
def indexDocs(self, root, writer): f = codecs.open('picIndex.txt', 'r', encoding='utf-8') picDict = {} for line in f.xreadlines(): ls = line.split('seg^*') url = ls[0] title = ls[1] src = ls[2] alt = ls[3] picDict[src] = [url, title, alt] f.close() for src in picDict: doc = Document() doc.add( Field("src", src, Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add( Field("url", picDict[src][0], Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add( Field("title", picDict[src][1], Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add( Field("alt", picDict[src][2], Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc)
def create_index(): lucene.initVM() if os.path.exists(prm.index_folder): shutil.rmtree(prm.index_folder) indexDir = SimpleFSDirectory(File(prm.index_folder)) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer()) writer = IndexWriter(indexDir, writerConfig) wk = wiki.Wiki(prm.pages_path) print "%d docs in index" % writer.numDocs() print "Reading files from wikipedia..." n = 0 for l in wk.get_text_iter(): doc = Document() doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) n += 1 if n % 100000 == 0: print 'indexing article', n print "Indexed %d docs from wikipedia (%d docs in index)" % ( n, writer.numDocs()) print "Closing index of %d docs..." % writer.numDocs() writer.close()
def build(self, index): writer = self.getWriter(directory=index.index, analyzer=SimpleAnalyzer( Version.LUCENE_CURRENT)) seed(101) for d in xrange(self.minId, self.maxId + 1): doc = Document() doc.add(Field("id", self.pad(d), StringField.TYPE_STORED)) if index.allowNegativeRandomInts: r = randint(~self.MAX_INT, self.MAX_INT) else: r = randint(0, self.MAX_INT) if index.maxR < r: index.maxR = r if r < index.minR: index.minR = r doc.add(Field("rand", self.pad(r), StringField.TYPE_STORED)) doc.add(Field("body", "body", StringField.TYPE_STORED)) writer.addDocument(doc) writer.commit() writer.close()
def Indexing(self, writer):
    print("Indexing Segmented File [", SEGMENTATION_FILE, "]")
    with open(SEGMENTATION_FILE, 'r') as f:
        line_count = 0
        for line in f:
            # FieldType for the context field: indexed, stored and tokenized
            fieldtype_context = FieldType()
            fieldtype_context.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
            fieldtype_context.setStored(True)
            fieldtype_context.setTokenized(True)

            # FieldType for the phrase field: stored only
            fieldtype_phrase = FieldType()
            fieldtype_phrase.setStored(True)

            # split the segmented line so words and their part-of-speech tags are stored separately
            processed_context, processed_phrase = self.process_line(line)

            doc = Document()
            # the context field records the text of the article
            doc.add(Field('context', processed_context, fieldtype_context))
            # the phrase field records the part-of-speech tag of each word
            doc.add(Field('phrase', processed_phrase, fieldtype_phrase))
            # write the document to the index
            writer.addDocument(doc)

            # progress tracking
            print("\r", str(line_count), " lines", end="", flush=True)
            line_count = line_count + 1
            if line_count > self.index_limit and not self.training:
                break
    writer.close()
    print()
def setUp(self): super(PyLuceneThreadTestCase, self).setUp() self.classLoader = Thread.currentThread().getContextClassLoader() writer = self.getWriter(analyzer=StandardAnalyzer()) doc1 = Document() doc2 = Document() doc3 = Document() doc4 = Document() doc1.add(Field("field", "one", TextField.TYPE_STORED)) doc2.add(Field("field", "two", TextField.TYPE_STORED)) doc3.add(Field("field", "three", TextField.TYPE_STORED)) doc4.add(Field("field", "one", TextField.TYPE_STORED)) writer.addDocument(doc1) writer.addDocument(doc2) writer.addDocument(doc3) writer.addDocument(doc4) writer.commit() writer.close() self.testData = [('one', 2), ('two', 1), ('three', 1), ('five', 0)] * 500 self.lock = threading.Lock() self.totalQueries = 0
def build_index(self, dict_data): print("loading data...") t1 = FieldType() t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setStored(True) t2.setTokenized(True) t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for k, v in dict_data.items(): doc = Document() doc.add(Field("id", k, t1)) doc.add(Field("content", v, t2)) self.writer.addDocument(doc) ticker = Ticker() print("commit index") threading.Thread(target=ticker.run).start() self.writer.commit() self.writer.close() ticker.tick = False print("done")
def indexDocs(self, root, writer): t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for root, dirnames, filenames in os.walk(root): for filename in filenames: if not filename.endswith('.txt'): continue print "adding", filename try: path = os.path.join(root, filename) file = open(path) contents = unicode(file.read(), 'gbk') file.close() doc = Document() doc.add(Field("name", filename, t1)) doc.add(Field("path", root, t1)) if len(contents) > 0: doc.add(Field("contents", contents, t2)) else: print "warning: no content in %s" % filename writer.addDocument(doc) except Exception, e: print "Failed in indexDocs:", e
def indexsents(self, sentences, writer): t1 = FieldType() t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for i, sent in enumerate(sentences): #print "adding",i, sent try: root = os.getcwd() #contents = unicode(sent, 'iso-8859-1') doc = Document() doc.add(Field("name", str(i), t1)) doc.add(Field("path", root, t1)) if len(sent) > 0: doc.add(Field("contents", sent.lower(), t2)) else: print "warning: no content in %s" % str(i) writer.addDocument(doc) except Exception, e: print "Failed in indexsents:", e
def get_document(fname, split_by=None): docs = [] _name = os.path.split(fname)[-1] with open(fname) as f: contents = f.read() if split_by: paragraphs = contents.split(split_by) for ix, par in enumerate(paragraphs): if not par: continue doc = Document() name = "{}_{}".format(_name, ix) doc.add( Field('filename', name, Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add( Field('content', par, Field.Store.YES, Field.Index.ANALYZED)) docs.append(doc) else: doc = Document() doc.add( Field('filename', _name, Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add( Field('content', contents, Field.Store.YES, Field.Index.ANALYZED)) docs.append(doc) return docs
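# Usage sketch for get_document (not part of the original source): the returned
# Document objects still have to be handed to an IndexWriter. `writer` is assumed
# to be an already-opened IndexWriter as in the neighbouring snippets, and the
# file path is illustrative.
for doc in get_document("/tmp/notes.txt", split_by="\n\n"):  # one doc per paragraph
    writer.addDocument(doc)
writer.commit()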
def indexDocs(self, root, writer): t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) index_file = open("index.txt", 'r') for line in index_file.readlines(): try: src = line.strip().split('\t')[0] filename = line.strip().split('\t')[1] tag = line.strip().split('\t')[2] path = os.path.join(root, filename) doc = Document() doc.add(Field("name", filename, t1)) doc.add(Field("path", root, t1)) doc.add(Field("src", src, t1)) if len(tag) > 0: doc.add(Field("tag", tag, t2)) else: print "warning: no tag in %s" % filename writer.addDocument(doc) except Exception, e: print "Failed in indexDocs:", e
def setUp(self): super(Test_Bug1763, self).setUp() self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) self.d1 = RAMDirectory() self.d2 = RAMDirectory() w1, w2 = [ self.getWriter(directory=d, analyzer=self.analyzer) for d in [self.d1, self.d2] ] doc1 = Document() doc2 = Document() doc1.add( Field("all", "blah blah double blah Gesundheit", TextField.TYPE_NOT_STORED)) doc1.add(Field('id', '1', StoredField.TYPE)) doc2.add( Field("all", "a quick brown test ran over the lazy data", TextField.TYPE_NOT_STORED)) doc2.add(Field('id', '2', StoredField.TYPE)) w1.addDocument(doc1) w2.addDocument(doc2) for w in [w1, w2]: w.close()
def indexDocs(self, sourceDir, writer): t1 = FieldType() t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for sourceDir, dirnames, filenames in os.walk(sourceDir): for filename in filenames: if not filename.endswith('.txt'): continue print(filename) try: path = os.path.join(sourceDir, filename) file = open(path, 'r', encoding="utf-8") contents = file.read() #contents = str(filecontent, 'utf-8') #contents = filecontent.encode('utf-8') #print('path', path, len(contents)) doc = Document() doc.add(Field("name", filename, t1)) # filename (title) #doc.add(Field("path", root, t1)) if len(contents) > 0: doc.add(Field(queryField, contents, t2)) # content else: print("warning: no content in %s" % filename) writer.addDocument(doc) file.close() except NameError: print("Failed in indexDocs:")
def addDoc(w, data): doc = Document() #print ('----------------------------') for field in data: value, type = data[field][0], data[field][1] ''' print ('field:%s type:%s'%(field,type)) print (value+'\n') ''' if type == 'StringField': doc.add(StringField(field, value, Field.Store.YES)) elif type == 'TextField': doc.add(TextField(field, value, Field.Store.YES)) elif type == 'CUSTOM_FIELD_TEXT': doc.add(Field(field, value, CUSTOM_FIELD_TEXT)) elif type == 'CUSTOM_FIELD_TEXT_DF': doc.add(Field(field, value, CUSTOM_FIELD_TEXT_DF)) elif type == 'CUSTOM_FIELD_TEXT_BF': doc.add(Field(field, value, CUSTOM_FIELD_TEXT_BF)) elif type == 'INTEGER_STORED': doc.add(StoredField(field, value)) else: print('UNKNOWN FIELD') w.addDocument(doc)
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(False)
    t1.setStored(True)
    t1.setTokenized(False)

    t3 = FieldType()
    t3.setIndexed(True)
    t3.setStored(False)
    t3.setTokenized(True)  # tokenized by the preconfigured analyzer (whitespace-based here)
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    total = 0
    file = open(root, "r")
    for line in file.readlines():
        try:
            imgurl, itemurl, content = line.split('\t')
            total += 1
            print total
            print "adding", content
            contents = ' '.join(jieba.cut(content))
            doc = Document()
            doc.add(Field("imgurl", imgurl, t1))
            doc.add(Field("itemurl", itemurl, t1))
            doc.add(Field("title", content, t1))
            doc.add(Field("contents", contents, t3))
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e
def indexDictionary(d, writer):
    for k, v in d.iteritems():
        doc = Document()
        doc.add(Field('filename', k, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', v, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    return writer.numDocs()
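# Usage sketch for indexDictionary (not from the original source): `writer` is
# assumed to be an already-opened IndexWriter; the Field.Store/Field.Index flags
# above imply an older (3.x/4.x-era) PyLucene API. Keys and values are made up.
pages = {
    "intro.txt": "lucene is a search library",
    "usage.txt": "documents are added through an IndexWriter",
}
num_indexed = indexDictionary(pages, writer)
print("%d docs in index" % num_indexed)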
def addDocument(self, writer, new_doc, metadata, fields_to_process, bow_info):
    """
        Add a document to the index. Does this using direct Lucene access.

        :param new_doc: dict of fields with values
        :type new_doc:dict
        :param metadata: ditto
        :type metadata:dict
        :param fields_to_process: only add these fields from the doc dict
        :type fields_to_process:list
    """
    doc = Document()
    total_numTerms = bow_info["total_numterms"]
    # each BOW now comes with its field
    for field in fields_to_process:
        field_object = Field(field, new_doc[field], Field.Store.NO,
                             Field.Index.ANALYZED, Field.TermVector.YES)
        # boost = math.sqrt(numTerms[field]) / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
        boost = 1 / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
        field_object.setBoost(float(boost))
        doc.add(field_object)

    json_metadata = json.dumps(metadata)
    # `guid` was undefined in this snippet; it is assumed to travel in the metadata dict
    guid = metadata["guid"]
    doc.add(Field("guid", guid, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("bow_info", json.dumps(bow_info), Field.Store.YES, Field.Index.NO))
    doc.add(Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
    doc.add(Field("year_from", metadata["year"], Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
def index_docs(root, writer): # metadata: name and path metadata = FieldType() metadata.setStored(True) # as is value metadata.setTokenized(False) metadata.setIndexOptions(IndexOptions.DOCS_AND_FREQS) # content: abstract and body content_type = FieldType() content_type.setStored(True) # to highlight on search results content_type.setTokenized(True) # tokenize words content_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for directory, _, file_names in walk(root): for file_name in file_names: name, extension = splitext(file_name) if extension not in DOC_FORMATS: continue # skip unsupported formats file_path = join(directory, file_name) print ' ', file_path # Build indexed document doc = Document() doc.add(Field('name', file_name, metadata)) doc.add(Field('path', directory, metadata)) # Read file contents content = process(file_path, 'utf-8', method='pdfminer') abstract = extract_abstract(content) doc.add(Field('content', content, content_type)) doc.add(Field('abstract', abstract, content_type)) writer.addDocument(doc)
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    wikiFile = ZipFile(root, 'r')
    files = wikiFile.namelist()
    i = 0
    for file in files[1:]:
        i += 1
        wiki = wikiFile.open(file, 'r')
        for line in codecs.iterdecode(wiki, 'utf8'):
            normalized = unicodedata.normalize('NFD', line).split(' ', 2)
            if not normalized[1].isdigit():
                continue
            docname = normalized[0] + ' ' + normalized[1]
            name = re.sub(r'[^a-zA-Z0-9]', ' ', normalized[0])
            contents = normalized[2]
            doc = Document()
            doc.add(Field('docname', docname, t1))
            doc.add(Field('name', name, t1))
            doc.add(Field('contents', contents, t1))
            writer.addDocument(doc)
        print('File %d done indexing' % i, file)
def index(indexdir): lucene.initVM() indexDir = SimpleFSDirectory(File(indexdir)) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer()) writer = IndexWriter(indexDir, writerConfig) f = open('data/docid.documento-xml.txt') st = PorterStemmer() for i, line in enumerate(f.readlines()): id, xmltext = line.split('\t') xmltext = xmltext.rstrip('\n') xmldoc = minidom.parseString(xmltext) title = xmldoc.getElementsByTagName("TITLE") title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue authors = xmldoc.getElementsByTagName("AUTHORS") authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue abstract = xmldoc.getElementsByTagName("ABSTRACT") abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue doc = Document() doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED)) writer.addDocument(doc) print "indexed %s docs" % (i+1) writer.close()
def build_index(document_path, dir_path): lucene.initVM() index_dir = SimpleFSDirectory(Paths.get(dir_path)) analyzer = StandardAnalyzer() config = IndexWriterConfig(analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) index_writer = IndexWriter(index_dir, config) t1 = FieldType() t1.setStored(True) t1.setTokenized(True) t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setStored(True) t2.setTokenized(False) with open(document_path) as input_file: for line in input_file: segs = line.strip().split(" ") music_path, music_tags = segs[0], segs[1].split(",") document = Document() document.add(Field("content", " ".join(music_tags), t1)) document.add(Field("url", music_path, t2)) index_writer.addDocument(document) index_writer.close()
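# Usage note for build_index above (inferred from the parsing code, not stated in
# the source): each input line is expected to hold a file path and a comma-separated
# tag list, separated by a single space, e.g.
#   /music/track001.mp3 rock,guitar,90s
# The paths below are hypothetical.
build_index("/tmp/tracks.txt", "/tmp/music-index")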
def __init__(self, path): lazyImport() self._writer, self._reader, self._searcher = self._getLucene(path) self._latestModifications = {} self._doc = Document() self._keyField = StringField("key", "", Field.Store.NO) self._valueField = Field("value", "", UNINDEXED_TYPE) self._doc.add(self._keyField) self._doc.add(self._valueField)
def add_article(self, article): # constructing a document doc = Document() title = Field('title', article.title, Field.Store.YES, Field.Index.ANALYZED) title.setBoost(10.0) doc.add(title) description = Field('description', article.description, Field.Store.YES, Field.Index.ANALYZED) description.setBoost(5.0) doc.add(description) doc.add(Field('keywords', article.keywords, Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field('content', article.content, Field.Store.YES, Field.Index.ANALYZED)) if article.date: doc.add(Field('date', article.date, Field.Store.YES, Field.Index.NOT_ANALYZED)) if article.last_modified: doc.add(Field('last_modified', article.last_modified, Field.Store.YES, Field.Index.NOT_ANALYZED)) if article.images: doc.add(Field('image_url', article.images[0][0], Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field('image_text', article.images[0][1], Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field('url', article.url, Field.Store.YES, Field.Index.NOT_ANALYZED)) # creates document or updates if already exists self.writer.updateDocument(Term("url", article.url), doc)
def testDocBoost(self): writer = self.getWriter(analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT)) f1 = Field("field", "word", TextField.TYPE_STORED) f2 = Field("field", "word", TextField.TYPE_STORED) f2.setBoost(2.0) d1 = Document() d2 = Document() d1.add(f1) # boost = 1 d2.add(f2) # boost = 2 writer.addDocument(d1) writer.addDocument(d2) writer.close() scores = [0.0] * 2 class collector(PythonCollector): def __init__(_self, scores): super(collector, _self).__init__() _self.scores = scores _self.base = 0 def collect(_self, doc, score): _self.scores[doc + _self.base] = score def setNextReader(_self, context): _self.base = context.docBase def acceptsDocsOutOfOrder(_self): return True self.getSearcher().search(TermQuery(Term("field", "word")), collector(scores)) lastScore = 0.0 for score in scores: self.assert_(score > lastScore) lastScore = score
def create_index(index) : indexDir = SimpleFSDirectory(File(index)) stops = CharArraySet(Version.LUCENE_4_10_1, 0, True) for s in stopwords : stops.add(s) analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer) writer = IndexWriter(indexDir, writerConfig) print "%d docs in index" % writer.numDocs() print "Reading Documents" f = open('f:/nlp/data/questions/combine.txt') for line in f : line = get_data_from_text(line.decode('utf-8')) doc = Document() field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED) field.setBoost(2.0) doc.add(field) writer.addDocument(doc) print "Indexed (%d docs in index)" % (writer.numDocs()) print "Closing index of %d docs..." % writer.numDocs() writer.close()
def survey(request): ipAddr = get_client_ip(request) instances = (Classes.objects.values_list('image_class_desc')) instances = [i[0] for i in instances] #cnt = len(instances) #lets get out choice location = web.__path__[0] + "/static/web/files/index/index.figures" #lucene.initVM() vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() analyzer = StandardAnalyzer(Version.LUCENE_4_10_1) reader = IndexReader.open(SimpleFSDirectory(File(location))) searcher = IndexSearcher(reader) try: #image_class = image.objects.get(pk=request.POST['survey']) s = request.POST['survey']#get from post except (KeyError, Classes.DoesNotExist): return render(request, 'web/index.html',{ 'error_message': "You didn't select a choice.", }) else: image_class = instances[int(s)] docNum = request.POST['imageID']#get document id doc = reader.document(int(docNum)) fname = doc.get("filename") print(fname) #SimpleFSDirectory(File(location)).clearLock(IndexWriter.WRITE_LOCK_NAME); fileClassField = doc.get("Classification") if str(fileClassField) == "None":#check if the field exists####NEED TO CHECK THIS fileClassField = str(ipAddr + ":" + image_class)#I think we must add an ip address to this else: fileClassField = str(ipAddr + ":" + fileClassField) + ", " + image_class #doc.removeField("Classification") #doc.add(StringField("Classification", fileClassField, Field.Store.YES)) #t = doc.get("Classification") #reader.close() indexDir = SimpleFSDirectory(File(location)) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer()) writer = IndexWriter(indexDir, writerConfig) fields = doc.getFields()#get all fields doc2 = Document() classificationFieldFlag = False for f in fields: field = Field.cast_(f) (k, v) = field.name(), field.stringValue() if k == "Classification": classificationFieldFlag = True field = StringField("Classification", fileClassField, Field.Store.YES) doc2.add(field) else: doc2.add(field) if classificationFieldFlag == False:#this does not exist in the document must add doc2.add(StringField("Classification", fileClassField, Field.Store.YES)) # doc2.add(StringField("Classification", fileClassField, Field.Store.YES)) # doc2.add(StringField("fid", doc.get("fid"), Field.Store.YES)) # doc2.add(StringField("articleid", doc.get("articleid"), Field.Store.YES)) # doc2.add(StringField("caption", doc.get("caption"), Field.Store.YES)) # doc2.add(StringField("figureid", doc.get("figureid"), Field.Store.YES)) # doc2.add(StringField("filename", doc.get("filename"), Field.Store.YES)) # doc2.add(StringField("filepath", doc.get("filepath"), Field.Store.YES)) # doc2.add(StringField("label", doc.get("label"), Field.Store.YES)) #writer.updateDocument(Term("fid","f000000000023"), doc2)#If field exists update writer.updateDocument(Term("fid", doc.get("fid")), doc2)#If field exists update writer.commit(); #writer.optimize() writer.close() #writer.unlock(SimpleFSDirectory(File(location))) return HttpResponseRedirect(reverse('web:index', args=()))
def _getIndex(self, even, odd): mergePolicy = LogDocMergePolicy() mergePolicy.setMergeFactor(1000) directory = RAMDirectory() self.dirs.append(directory) writer = self.getWriter(directory=directory, analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT), maxBufferedDocs=2, mergePolicy=mergePolicy) if self.dvStringSorted: # Index sorted stringDVType = FieldInfo.DocValuesType.SORTED elif self.notSorted: # Index non-sorted stringDVType = FieldInfo.DocValuesType.BINARY else: # sorted anyway stringDVType = FieldInfo.DocValuesType.SORTED ft1 = FieldType() ft1.setStored(True) ft2 = FieldType() ft2.setIndexed(True) for i in xrange(len(self.data)): if (i % 2 == 0 and even) or (i % 2 == 1 and odd): doc = Document() doc.add(Field("tracer", self.data[i][0], ft1)) doc.add(TextField("contents", self.data[i][1], Field.Store.NO)) if self.data[i][2] is not None: doc.add(StringField("int", self.data[i][2], Field.Store.NO)) if self.supportsDocValues: doc.add(NumericDocValuesField("int_dv", Long.parseLong(self.data[i][2]))) if self.data[i][3] is not None: doc.add(StringField("float", self.data[i][3], Field.Store.NO)) if self.supportsDocValues: doc.add(FloatDocValuesField("float_dv", Float.parseFloat(self.data[i][3]))) if self.data[i][4] is not None: doc.add(StringField("string", self.data[i][4], Field.Store.NO)) if self.supportsDocValues: if stringDVType == FieldInfo.DocValuesType.SORTED: doc.add(SortedDocValuesField("string_dv", BytesRef(self.data[i][4]))) elif stringDVType == FieldInfo.DocValuesType.BINARY: doc.add(BinaryDocValuesField("string_dv", BytesRef(self.data[i][4]))) else: raise ValueError("unknown type " + stringDVType) if self.data[i][5] is not None: doc.add(StringField("custom", self.data[i][5], Field.Store.NO)) if self.data[i][6] is not None: doc.add(StringField("i18n", self.data[i][6], Field.Store.NO)) if self.data[i][7] is not None: doc.add(StringField("long", self.data[i][7], Field.Store.NO)) if self.data[i][8] is not None: doc.add(StringField("double", self.data[i][8], Field.Store.NO)) if self.supportsDocValues: doc.add(NumericDocValuesField("double_dv", Double.doubleToRawLongBits(Double.parseDouble(self.data[i][8])))) if self.data[i][9] is not None: doc.add(StringField("short", self.data[i][9], Field.Store.NO)) if self.data[i][10] is not None: doc.add(StringField("byte", self.data[i][10], Field.Store.NO)) if self.data[i][11] is not None: doc.add(StringField("parser", self.data[i][11], Field.Store.NO)) for f in doc.getFields(): if f.fieldType().indexed() and not f.fieldType().omitNorms(): Field.cast_(f).setBoost(2.0) writer.addDocument(doc) reader = writer.getReader() writer.close() return self.getSearcher(reader=reader)
def _getFullStrings(self): mergePolicy = LogDocMergePolicy() mergePolicy.setMergeFactor(97) directory = RAMDirectory() self.dirs.append(directory) writer = self.getWriter(directory=directory, analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT), maxBufferedDocs=4, mergePolicy=mergePolicy) onlyStored = FieldType() onlyStored.setStored(True) fixedLen = self.getRandomNumber(2, 8) fixedLen2 = self.getRandomNumber(1, 4) for i in xrange(NUM_STRINGS): doc = Document() num = self.getRandomCharString(self.getRandomNumber(2, 8), 48, 52) doc.add(Field("tracer", num, onlyStored)) doc.add(StringField("string", num, Field.Store.NO)) if self.supportsDocValues: if self.dvStringSorted: doc.add(SortedDocValuesField("string_dv", BytesRef(num))) else: doc.add(BinaryDocValuesField("string_dv", BytesRef(num))) num2 = self.getRandomCharString(self.getRandomNumber(1, 4), 48, 50) doc.add(StringField("string2", num2, Field.Store.NO)) if self.supportsDocValues: if self.dvStringSorted: doc.add(SortedDocValuesField("string2_dv", BytesRef(num2))) else: doc.add(BinaryDocValuesField("string2_dv", BytesRef(num2))) doc.add(Field("tracer2", num2, onlyStored)) for f2 in doc.getFields(): if f2.fieldType().indexed() and not f2.fieldType().omitNorms(): Field.cast_(f2).setBoost(2.0) numFixed = self.getRandomCharString(fixedLen, 48, 52) doc.add(Field("fixed_tracer", numFixed, onlyStored)) doc.add(StringField("string_fixed", numFixed, Field.Store.NO)) if self.supportsDocValues: if self.dvStringSorted: doc.add(SortedDocValuesField("string_fixed_dv", BytesRef(numFixed))) else: doc.add(BinaryDocValuesField("string_fixed_dv", BytesRef(numFixed))) num2Fixed = self.getRandomCharString(fixedLen2, 48, 52) doc.add(StringField("string2_fixed", num2Fixed, Field.Store.NO)) if self.supportsDocValues: if self.dvStringSorted: doc.add(SortedDocValuesField("string2_fixed_dv", BytesRef(num2Fixed))) else: doc.add(BinaryDocValuesField("string2_fixed_dv", BytesRef(num2Fixed))) doc.add(Field("tracer2_fixed", num2Fixed, onlyStored)) for f2 in doc.getFields(): if f2.fieldType().indexed() and not f2.fieldType().omitNorms(): Field.cast_(f2).setBoost(2.0) writer.addDocument(doc) writer.close() return self.getSearcher(directory=directory)
def reindex(self): ''' Re-indexes the entire database into Index file''' start = time.time() # get all posts posts = self._tuples_to_dict(self._fetch_all_questions(), self._posts_fields) if not posts: raise Exception("FATAL Error: Could not fetch posts from Database") # open indexer # lucene.initVM(vmargs=['-Djava.awt.headless=true']) # print 'lucene', lucene.VERSION store = SimpleFSDirectory(File(self.index_dir)) analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) writer = IndexWriter(store, config) indexedField = FieldType() indexedField.setIndexed(True) indexedField.setStored(True) indexedField.setTokenized(True) indexedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) storedField = FieldType() storedField.setIndexed(False) storedField.setStored(True) storedField.setTokenized(False) storedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) fieldTypes = { 'type' : storedField, 'id' : storedField, 'title' : indexedField, 'question' : indexedField, 'answer' : indexedField, # 'comment' : indexedField, 'tag' : indexedField, 'extra' : indexedField, } # get their comments num_docs = 0 for post in posts: if self.status_mode: print "\r {0:.2f} %complete".format(((num_docs/142627.0)*100)), if self.debug : print "\n","*"*20,"\nIndexing post: ", post['id'], "from ", post['extra'] if self.debug and self.verbose_values: print post answers = self._tuples_to_dict(self._fetch_all_answers(post['id'], post['extra']), self._answer_fields) # add comment field for answer in answers: num_docs += 1 if self.debug: print "\n","+"*10, "\nMaking new Document" doc = Document() if self.debug: print "Adding doc type" doc.add(Field("type", self.doctype, fieldTypes['type'])) # make fields if self.debug: print "Adding post fields" for i in xrange(len(self._posts_fields)): f = Field(self._posts_fields[i], self._cleanup_tag(post[self._posts_fields[i]]), fieldTypes[self._posts_fields[i]]) f.setBoost(self._fields_boost[self._posts_fields[i]]) doc.add(f) if self.status_mode: print "\t Indexing answer: ", answer['answer_id'] if self.debug and self.verbose_values: print answer # answered_doc = copy.deepcopy(doc) # make comment field f = Field("answer", self._cleanup_tag(answer['answer']), fieldTypes['answer']) f.setBoost(self._fields_boost['answer']) doc.add(f) # calculate paths # commented_doc = copy.deepcopy(answered_doc) # comments = self._comments_to_comment_string(self._tuples_to_dict(self._fetch_all_comments(answer['id']), self._comment_fields)) # if self.debug: print "\t\tAdding comments: ", comments # commented_doc.add(Field("comment", self._cleanup_tag(comments), fieldTypes['comment'])) # write index if self.debug: print "\tAdding document {doc_id} to index".format(doc_id=post['id']) writer.addDocument(doc) # del answered_doc # del commented_doc if self.debug: print "Commiting document to index" writer.commit() # close index if self.status_mode: print "Closing index write" writer.close() end = time.time() - start if self.status_mode: print "\n","-"*20, \ "\nTotal time spent in indexing: ", end, "seconds" \ "\nIndexed {num_docs} documents".format(num_docs=num_docs)
class LuceneKeyValueStore(object): def __init__(self, path): lazyImport() self._writer, self._reader, self._searcher = self._getLucene(path) self._latestModifications = {} self._doc = Document() self._keyField = StringField("key", "", Field.Store.NO) self._valueField = Field("value", "", UNINDEXED_TYPE) self._doc.add(self._keyField) self._doc.add(self._valueField) def get(self, key, default=None): try: return self[key] except KeyError: return default def __setitem__(self, key, value): key = str(key) value = str(value) self._maybeReopen() self._keyField.setStringValue(key) self._valueField.setStringValue(value) self._writer.updateDocument(Term("key", key), self._doc) self._latestModifications[key] = value def __getitem__(self, key): key = str(key) value = self._latestModifications.get(key) if value is DELETED_RECORD: raise KeyError(key) if not value is None: return value self._maybeReopen() topDocs = self._searcher.search(TermQuery(Term("key", key)), 1) if topDocs.totalHits == 0: raise KeyError(key) return self._searcher.doc(topDocs.scoreDocs[0].doc).get("value") def __delitem__(self, key): key = str(key) self._writer.deleteDocuments(Term("key", key)) self._latestModifications[key] = DELETED_RECORD def __len__(self): raise NotImplementedError def __iter__(self): raise NotImplementedError def items(self): raise NotImplementedError def keys(self): raise NotImplementedError def values(self): raise NotImplementedError def _getLucene(self, path): directory = FSDirectory.open(Paths.get(path)) config = IndexWriterConfig(None) config.setRAMBufferSizeMB(256.0) # faster config.setUseCompoundFile(False) # faster, for Lucene 4.4 and later writer = IndexWriter(directory, config) reader = writer.getReader() searcher = IndexSearcher(reader) return writer, reader, searcher def _maybeReopen(self): if len(self._latestModifications) > 10000: newReader = DirectoryReader.openIfChanged(self._reader, self._writer, True) if not newReader is None: self._reader.close() self._reader = newReader self._searcher = IndexSearcher(self._reader) self._latestModifications.clear() def commit(self): self._writer.commit() def close(self): self._writer.close()
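# Small usage sketch for the LuceneKeyValueStore class above (the path is
# illustrative; PyLucene is assumed to be initialised inside lazyImport()).
# Keys and values are stringified on the way in and come back as strings.
store = LuceneKeyValueStore("/tmp/kv-index")  # hypothetical path
store["answer"] = 42
print(store["answer"])                  # -> "42"
print(store.get("missing", "default"))  # -> "default"
del store["answer"]
store.commit()
store.close()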
def indexTable(self, writer): #connection con = None #define the index of all the fields #---------step 2:connect to mysql---------- con = mdb.connect('localhost','root','testgce','douban_movie_v3') #t_num = FieldType.NumericType it is wrong!! t_num = FieldType() t_num.setStored(False) t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) t3 = FieldType() t3.setIndexed(True) t3.setStored(True) t3.setTokenized(True) t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) maxDict = utils.maxDict #加权数值范围 base = DOC_BOOST_RANGE[0] upper = DOC_BOOST_RANGE[1] with con: # Careful with codecs con.set_character_set('utf8') cur = con.cursor() # Aagin the codecs cur.execute('SET NAMES utf8;') cur.execute('SET CHARACTER SET utf8;') cur.execute('SET character_set_connection=utf8;') #------step 3: choose the right table------ cur.execute("SELECT * FROM movie_items") numrows = int(cur.rowcount) print 'numrows:',numrows for i in range(numrows): print row = cur.fetchone() #------step 4:Index your field------ summary = row[SUMMARY] subject_id = row[SUBJECT_ID] print 'id'+subject_id year = utils.formatYear(row[YEAR]) try: date = DateTools.stringToDate(year.replace('-',' ')) wtfFile = open('wtf.txt','a') dateStr = DateTools.dateToString(date,DateTools.Resolution.DAY) except: wtfFile.write(year+'\n') doc = Document() #boosting boostProb = utils.calcBoostProb(row,maxDict,dateStr) boost = base + boostProb*(upper-base) doc.add(FloatField("boost",boost,Field.Store.YES)) doc.add(StringField("year",dateStr,Field.Store.YES)) print 'dateStr:'+dateStr #A text field is a sequence of terms that has been tokenized while a string field is a single term (although it can also be multivalued.) do_count = row[DO_COUNT] if row[DO_COUNT] != None else 0 wish_count = row[COLLECT_COUNT] if row[WISH_COUNT] != None else 0 #fields which should not be analyzed doc.add(FloatField("rating_average",float(row[RATING_AVERAGE]),Field.Store.YES)) doc.add(FloatField("rating_stars", float(row[RATING_STARS]), Field.Store.YES)) doc.add(IntField("reviews_count", int(row[REVIEWS_COUNT]), Field.Store.YES)) #doc.add(FloatField("year", float(row[YEAR]), Field.Store.YES).setBoost(boost)) doc.add(IntField("collect_count", int(row[COLLECT_COUNT]), Field.Store.YES)) doc.add(IntField("do_count", int(do_count), Field.Store.YES)) doc.add(IntField("wish_count", int(wish_count), Field.Store.YES)) doc.add(IntField("subject_id", int(row[SUBJECT_ID]), Field.Store.YES)) doc.add(IntField("comments_count", int(row[COMMENTS_COUNT]), Field.Store.YES)) doc.add(IntField("ratings_count", int(row[RATINGS_COUNT]), Field.Store.YES)) doc.add(StringField("image_small", row[IMAGE_SMALL], Field.Store.YES)) #fields which should be analyzed with WhitespaceAnalyzer #attention!!! 
# don't chain the calls in one long expression like:
#     doc.add(Field("genres", row[GENRES].replace(delim, ' '), t3).setBoost(boost))
# or you'll get a null pointer error
f = Field("countries", row[COUNTRIES].replace(delim, ' '), t3)
f.setBoost(boost)
doc.add(f)

# process casts
raw_casts = row[CASTS].replace(delim, ' ')
f = Field("raw_casts", raw_casts, t1)
f.setBoost(boost)
doc.add(f)
# replace the '·' separator inside Western person names
raw_casts = raw_casts.replace('·', ' ')
if len(raw_casts.split(' ')) < CASTS_LEN:  # the average name length is 4
    casts = raw_casts + ' ¥¥¥¥' * (CASTS_LEN - len(raw_casts.split(' ')))
    f = Field("casts", casts, t3)
    f.setBoost(boost)
    doc.add(f)

# process directors
raw_directors = row[DIRECTORS].replace(delim, ' ')
f = Field("raw_directors", raw_directors, t1)
f.setBoost(boost)
doc.add(f)
# replace the '·' separator inside Western person names
raw_directors = raw_directors.replace('·', ' ')
if len(raw_directors.split(' ')) < DIRECTORS_LEN:  # the average name length is 4
    directors = raw_directors + ' ¥¥¥¥' * (DIRECTORS_LEN - len(raw_directors.split(' ')))
    f = Field("directors", directors, t3)
    f.setBoost(boost)
    doc.add(f)

f = Field("genres", row[GENRES].replace(delim, ' '), t3)
f.setBoost(boost)
doc.add(f)
f = Field("subtype", row[SUBTYPE].replace(delim, ' '), t3)
f.setBoost(boost)
doc.add(f)

# this is wrong because an indexable field has no setBoost method:
# fieldList = doc.getFields()  # not a python 'list', but a 'List' which is unindexable
# for eachField in fieldList:
#     eachField.setBoost(boost)

# store the raw user_tags / others_like strings; they are needed for re-ranking
doc.add(StringField("raw_user_tags", row[USER_TAGS], Field.Store.YES))
doc.add(StringField("raw_others_like", row[OTHERS_LIKE], Field.Store.YES))
user_tags_str = ''
others_like_str = ''
tags_len = 0
if row[USER_TAGS] != '':
    user_tags_list = row[USER_TAGS].split(delim)
    for tag_pair in user_tags_list:
        if tag_pair != '':  # the string ends with '¥', so the last element after split is empty
            # print 'tag_pair' + tag_pair + 'hhe'
            tag_name = tag_pair.split(delim_uo)[0] + ' '  # dont forget this space !!
            tag_num = tag_pair.split(delim_uo)[1]
            tag_num_processed = int(int(tag_num) / TAG_SPAN) + 1  # minimum is 1
user_tags_str = user_tags_str +' '+ tag_name * tag_num_processed tags_len = tags_len + tag_num_processed #最后得到总共词的个数 if tags_len<TAGS_AVER_LEN: #填充tags,目测3是平均长度,所以使用 ¥¥¥ user_tags_str = user_tags_str +' ¥¥¥'*(TAGS_AVER_LEN - tags_len) # if row[OTHERS_LIKE]!='': for like_pair in row[OTHERS_LIKE].split(delim): if like_pair!='': others_like_str = others_like_str +' '+like_pair.split(delim_uo)[1] #start process adjs if row[ADJS] != None: raw_adjs = row[ADJS][:-1] adjs_str = '' adjs_len = 0 if row[ADJS] != '' and row[ADJS] != '\n': #'重要=4.0,特殊=4.0' adjs_str = row[ADJS] adjs_list = adjs_str.split(',') for adj_pair in adjs_list: #print 'adj_pair:'+adj_pair+'hhe' adj_name = adj_pair.split('=')[0] adj_num = adj_pair.split('=')[1] #去换行符,转换int if adj_num[-1] == '\n': adj_num = adj_num[0:-1] adj_num = int(float(adj_num)) add_adj='' # #同义词 # adj_name_bro = searchDictValue(adjMap,adj_name) # if adj_name_bro == -1: #表示没有结果,即未找到近义词,不添加 # add_adj = '' # else: # add_adj = (adj_name_bro+' ')*adj_num # raw_adjs = raw_adjs + ',' + adj_name_bro+'='+str(adj_num) adjs_str = adjs_str + ' ' + (adj_name+' ') * adj_num +add_adj adjs_len = adjs_len + adj_num #最后得到总共tags的个数 #print raw_adjs doc.add(StringField("raw_adjs",raw_adjs,Field.Store.YES)) if adjs_len<ADJS_AVER_LEN: #填充 adjs_str,目测2是平均长度,所以使用 "¥¥" adjs_str = adjs_str +' ¥¥'*(ADJS_AVER_LEN - adjs_len) f = Field("adjs", adjs_str, t3) f.setBoost(boost) doc.add(f) f = Field("user_tags", user_tags_str, t3) f.setBoost(boost) doc.add(f) f = Field("others_like", others_like_str, t3) f.setBoost(boost) doc.add(f) #fields which should be analyzed with good analyzer f = Field("title", row[TITLE], t3) f.setBoost(boost) doc.add(f) f = Field("original_title", row[ORIGINAL_TITLE], t3) f.setBoost(boost) doc.add(f) f = Field("summary_segmentation", row[SUMMARY_SEGMENTATION], t3) f.setBoost(boost) doc.add(f) f = Field("aka", row[AKA], t2) f.setBoost(boost) doc.add(f) if len(summary) > 0: print subject_id +'--->'+':\n '+ row[TITLE] try: summary_unicoded = unicode(summary, 'utf-8') #test the encoding except Exception,e: print "Decode Failed: ", e f = Field('summary', summary, t2) f.setBoost(boost) doc.add(f) else: print "warning:\n" + subject_id +'---> No content!' print 'boosting:' + str(boost) #for debug if boost>upper: print boostProb print maxDict exit(0) writer.addDocument(doc)
def testSimple(self): writer = self.getWriter(analyzer=SimpleAnalyzer()) doc = Document() field = Field("foo", "", TextField.TYPE_NOT_STORED) doc.add(field) dvField = FloatDocValuesField("foo_boost", 0.0) doc.add(dvField) field2 = Field("bar", "", TextField.TYPE_NOT_STORED) doc.add(field2) field.setStringValue("quick brown fox") field2.setStringValue("quick brown fox") dvField.setFloatValue(2.0) # boost x2 writer.addDocument(doc) field.setStringValue("jumps over lazy brown dog") field2.setStringValue("jumps over lazy brown dog") dvField.setFloatValue(4.0) # boost x4 writer.addDocument(doc) reader = writer.getReader() writer.close() # no boosting searcher1 = self.getSearcher(reader=reader) base = searcher1.getSimilarity(True) # boosting searcher2 = self.getSearcher(reader=reader) class _similarity(PythonPerFieldSimilarityWrapper): def __init__(_self, base): super(_similarity, _self).__init__() _self.base = base _self.fooSim = BoostingSimilarity(base, "foo_boost") def get(_self, field): return _self.fooSim if "foo" == field else _self.base searcher2.setSimilarity(_similarity(base)) # in this case, we searched on field "foo". first document should have # 2x the score. tq = TermQuery(Term("foo", "quick")) noboost = searcher1.search(tq, 10) boost = searcher2.search(tq, 10) self.assertEqual(1, noboost.totalHits) self.assertEqual(1, boost.totalHits) self.assertEqual( boost.scoreDocs[0].score, noboost.scoreDocs[0].score * 2.0, SCORE_EPSILON) # this query matches only the second document, which should have 4x # the score. tq = TermQuery(Term("foo", "jumps")) noboost = searcher1.search(tq, 10) boost = searcher2.search(tq, 10) self.assertEqual(1, noboost.totalHits) self.assertEqual(1, boost.totalHits) self.assertEqual( boost.scoreDocs[0].score, noboost.scoreDocs[0].score * 4.0, SCORE_EPSILON) # search on on field bar just for kicks, nothing should happen, since # we setup our sim provider to only use foo_boost for field foo. tq = TermQuery(Term("bar", "quick")) noboost = searcher1.search(tq, 10) boost = searcher2.search(tq, 10) self.assertEqual(1, noboost.totalHits) self.assertEqual(1, boost.totalHits) self.assertEqual( boost.scoreDocs[0].score, noboost.scoreDocs[0].score, SCORE_EPSILON) reader.close()