def __init__(self, folder=None, fields=None, similarity="tfidf"):
    """Initialize the Lucene VM, the index directory and per-field types.

    :param folder: on-disk index directory; an in-memory RAMDirectory is
                   used when None.
    :param fields: iterable of field descriptors, each exposing ``.name``
                   and ``.props`` (prop name -> value, applied through the
                   matching FieldType ``set*`` method).
    :param similarity: similarity model name; stored lower-cased.

    Fix: the original used a mutable default argument (``fields=[]``),
    which is shared across all calls; ``None`` is used as sentinel instead.
    """
    self.jcc = lucene.initVM()
    # Disk-backed index when a folder is given, otherwise in-memory.
    if folder:
        self.directory = SimpleFSDirectory(File(folder))
    else:
        self.directory = RAMDirectory()
    self.fields = {}
    for field in (fields or []):
        ft = FieldType()
        # Apply every declared property via its setter, e.g.
        # props={'stored': True} -> ft.setStored(True).
        for pname, pvalue in field.props.items():
            setter = getattr(ft, "set" + pname.capitalize())
            setter(pvalue)
        ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        # ft.setOmitNorms(True)
        self.fields[field.name] = ft
    self.similarity = similarity.lower()
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.writer = None
    self.searcher = None
def indexsents(self, sentences, writer):
    """Index each sentence as its own document, then commit and close.

    Fields per document: 'name' (position), 'path' (cwd), and the
    lower-cased sentence text under 'contents'.
    """
    # Stored, untokenized type for identifiers and paths.
    meta_type = FieldType()
    meta_type.setStored(True)
    meta_type.setTokenized(False)
    meta_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    # Tokenized, unstored type for the searchable sentence text.
    text_type = FieldType()
    text_type.setStored(False)
    text_type.setTokenized(True)
    text_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for position, sentence in enumerate(sentences):
        try:
            cwd = os.getcwd()
            doc = Document()
            doc.add(Field('name', str(position), meta_type))
            doc.add(Field('path', cwd, meta_type))
            if len(sentence) > 0:
                doc.add(Field('contents', sentence.lower(), text_type))
            else:
                print('warning: no content in %s' % str(position))
            writer.addDocument(doc)
        except Exception as exc:
            print('Failed in indexsents:', exc)
    writer.commit()
    writer.close()
def indexDocs(self, root, writer):
    """Walk *root* and index every file: name/path stored, contents
    (iso-8859-1) tokenized for search.

    Modernized from Python 2 (print statements, ``unicode``,
    ``except X, e``) and fixed the file-handle leak on read errors by
    reading through a ``with`` block.
    """
    t1 = FieldType()  # for short items, e.g. file name.
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    t2 = FieldType()  # for content
    t2.setIndexed(True)
    t2.setStored(False)  # don't store the original text
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            print("adding", filename)
            try:
                path = os.path.join(root, filename)
                with open(path, encoding='iso-8859-1') as fh:
                    contents = fh.read()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs:", e)
def indexDocs(self, root, writer):
    """Index every file under *root* (UTF-8): filename and path stored,
    contents tokenized for search.

    Modernized from Python 2; files are read via ``with`` so handles are
    closed even when decoding fails.
    """
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    # traverse through the doc directory
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            # if not filename.endswith('.cdc'):
            #     continue
            try:
                # only add the filename and path for indexing
                path = os.path.join(root, filename)
                print("adding file : ", path)
                with open(path, encoding='utf-8') as fh:
                    contents = fh.read()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print("warning: no content in ", filename)
                writer.addDocument(doc)
            except Exception as e:
                print("failed in indexDocs:", e)
def index_image(self, root, writer):
    """Index image (url, caption) pairs listed in ``index.txt`` under *root*.

    The raw caption is stored; a jieba search-mode re-tokenization (minus
    stop words) becomes the searchable 'content' field.

    Fix: the counter previously started at 1 and was reported as the number
    of images added, over-counting by one (and also counting skipped,
    malformed lines' display positions incorrectly).
    """
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    with open(os.path.join(root, "index.txt"), mode="r", encoding="utf8") as index:
        count = 0
        for line in index:
            # Progress display: 1-based position of the entry being processed.
            print("\r", count + 1, end="", sep="")
            try:
                image_url, content = line.strip().split()[:2]
            except ValueError as e:
                print(e)
                continue
            doc = Document()
            doc.add(Field("raw_content", content, t1))
            # Search-mode segmentation, dropping stop words and whitespace.
            content = " ".join(
                word for word in jieba.cut_for_search(content)
                if word.strip() and word not in self.stop_words)
            doc.add(Field("url", image_url, t1))
            doc.add(Field("content", content, t2))
            writer.addDocument(doc)
            count += 1
    print("\n{count} image(s) added.".format(count=count))
def Indexing(self, writer):
    """Index the pre-segmented corpus file line by line.

    Each line is split by ``self.process_line`` into the article text
    ('context', indexed+stored+tokenized) and the per-word POS tags
    ('phrase', stored only). Stops after ``self.index_limit`` lines unless
    ``self.training`` is set; closes the writer when done.

    Improvements: the two FieldType objects are loop-invariant and are now
    built once instead of once per input line; comments translated to
    English.
    """
    print("Indexing Segmented File [", SEGMENTATION_FILE, "]")
    # FieldType for the context: indexed, stored and tokenized.
    fieldtype_context = FieldType()
    fieldtype_context.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    fieldtype_context.setStored(True)
    fieldtype_context.setTokenized(True)
    # FieldType for the phrase (POS tags): stored only.
    fieldtype_phrase = FieldType()
    fieldtype_phrase.setStored(True)
    with open(SEGMENTATION_FILE, 'r') as f:
        line_count = 0
        for line in f:
            # Separate words and their POS tags for storage.
            processed_context, processed_phrase = self.process_line(line)
            doc = Document()
            # 'context' records the article content.
            doc.add(Field('context', processed_context, fieldtype_context))
            # 'phrase' records the POS tag of each word in the article.
            doc.add(Field('phrase', processed_phrase, fieldtype_phrase))
            writer.addDocument(doc)
            # Progress indicator.
            print("\r", str(line_count), " lines", end="", flush=True)
            line_count = line_count + 1
            if line_count > self.index_limit and not self.training:
                break
    writer.close()
    print()
def indexDocs(self, root, writer):
    """Index every .txt file under *root* (GBK encoded): name/path stored,
    contents tokenized for search.

    Modernized from Python 2; files are read via ``with`` so handles are
    closed even when decoding fails.
    """
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print("adding", filename)
            try:
                path = os.path.join(root, filename)
                with open(path, encoding='gbk') as fh:
                    contents = fh.read()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs:", e)
def indexDocs(self, root, writer):
    """Index entries from ``index.txt``, one tab-separated record per line:
    ``<src>\\t<filename>\\t<tag>``. The tag is the only tokenized field.

    Modernized from Python 2; the index file is closed via ``with``, the
    line is split once instead of three times, and the unused ``path``
    local was removed.
    """
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    with open("index.txt", 'r') as index_file:
        for line in index_file:
            try:
                parts = line.strip().split('\t')
                src, filename, tag = parts[0], parts[1], parts[2]
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                doc.add(Field("src", src, t1))
                if len(tag) > 0:
                    doc.add(Field("tag", tag, t2))
                else:
                    print("warning: no tag in %s" % filename)
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs:", e)
def indexDocs(self, root, writer):
    """Index the r52 training file: one ``<topic>\\t<content>`` document per
    line, with full term vectors on the contents field.

    Modernized from Python 2 (``xrange``, ``except X, e``); the corpus file
    is read via ``with`` and iteration uses ``enumerate``.
    """
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setStoreTermVectors(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPositions(True)
    t2.setIndexOptions(
        IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
    file_path = root + 'r52-train-all-terms.txt'
    with open(file_path) as fd:
        contents_list = [x.strip() for x in fd.readlines()]
    for i, entry in enumerate(contents_list):
        try:
            # Lines with a different number of tabs raise ValueError and
            # are reported below, same as the original behavior.
            topic, content = entry.split('\t')
            doc = Document()
            doc.add(Field("id", str(i), t1))
            doc.add(Field("topic", topic, t1))
            doc.add(Field("contents", content, t2))
            writer.addDocument(doc)
        except Exception as e:
            print("Failed in indexDocs:", e)
def indexsents(self, sentences, writer):
    """Index each sentence as one document: 'name' is its position,
    'path' the working directory, 'contents' the lower-cased text.

    Modernized from Python 2 (print statements, ``except X, e``).
    """
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for i, sent in enumerate(sentences):
        try:
            root = os.getcwd()
            doc = Document()
            doc.add(Field("name", str(i), t1))
            doc.add(Field("path", root, t1))
            if len(sent) > 0:
                doc.add(Field("contents", sent.lower(), t2))
            else:
                print("warning: no content in %s" % str(i))
            writer.addDocument(doc)
        except Exception as e:
            print("Failed in indexsents:", e)
def indexDocs(self, root, iw):
    """Feed every .txt book under *root* to ``parseBook``, then report the
    missing-data statistics it accumulated and close the writer."""
    # Type for short stored values (positions indexed, no term vectors).
    stored_type = FieldType()
    stored_type.setStored(True)
    stored_type.setTokenized(True)
    stored_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    # Type for text that also keeps term vectors.
    vector_type = FieldType()
    vector_type.setStored(True)
    vector_type.setTokenized(True)
    vector_type.setStoreTermVectors(True)
    vector_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for entry in os.listdir(root):
        if not entry.endswith(".txt"):
            print("file is not a txt file. we skip it.")
            continue
        print("adding", entry)
        self.parseBook(os.path.join(root, entry), stored_type, vector_type, iw)
    # Statistics gathered while parsing:
    #   AuthorError   = number of authors not found
    #   TitleError    = number of titles not found
    #   DocumentError = documents where text could not be extracted so the
    #                   entire document was indexed
    print("AuthorError: {}".format(self.authorcount))
    print("TitleError: {}".format(self.titlecount))
    print("DocumentError: {}".format(self.errorcount))
    iw.close()
def indexer(self, root, writer):
    """Index '<id> <num> <text>' lines from every file under *root*.

    'docname' keeps the first two tokens verbatim; 'name' is the first
    token with markup characters blanked out; 'contents' is the rest.
    """
    field_type = FieldType()
    field_type.setStored(True)
    field_type.setTokenized(True)
    field_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    def scrub(text):
        # Blank out markup/special characters one by one.
        for ch in '\\`*_{}[]()>#+-.!$‘':
            if ch in text:
                text = text.replace(ch, ' ')
        return text

    for root, dirnames, filenames in os.walk(root):
        done = 0
        for filename in filenames:
            done += 1
            with open(os.path.join(root, filename)) as handle:
                for raw in handle.readlines():
                    parts = raw.split(' ', 2)
                    doc = Document()
                    doc.add(Field('docname', parts[0] + ' ' + parts[1], field_type))
                    doc.add(Field('name', scrub(parts[0]), field_type))
                    doc.add(Field('contents', parts[2], field_type))
                    writer.addDocument(doc)
            print('File %d done indexing' % done)
def build_index(document_path, dir_path):
    """Build a fresh disk index from a tag file.

    Each input line is ``<music_path> <tag1,tag2,...>``; the tags are
    indexed (space-joined) under 'content' and the path stored under 'url'.
    """
    lucene.initVM()
    store = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    # Searchable tag text: stored and tokenized.
    tag_type = FieldType()
    tag_type.setStored(True)
    tag_type.setTokenized(True)
    tag_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    # Music path: stored verbatim.
    path_type = FieldType()
    path_type.setStored(True)
    path_type.setTokenized(False)
    with open(document_path) as input_file:
        for line in input_file:
            pieces = line.strip().split(" ")
            music_path = pieces[0]
            music_tags = pieces[1].split(",")
            entry = Document()
            entry.add(Field("content", " ".join(music_tags), tag_type))
            entry.add(Field("url", music_path, path_type))
            writer.addDocument(entry)
    writer.close()
def index_docs(root, writer):
    """Index supported documents under *root*: name/path stored as-is,
    content and extracted abstract tokenized and stored for highlighting.

    Modernized from Python 2 (``print`` statement).
    """
    # metadata: name and path
    metadata = FieldType()
    metadata.setStored(True)  # as is value
    metadata.setTokenized(False)
    metadata.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    # content: abstract and body
    content_type = FieldType()
    content_type.setStored(True)  # to highlight on search results
    content_type.setTokenized(True)  # tokenize words
    content_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for directory, _, file_names in walk(root):
        for file_name in file_names:
            name, extension = splitext(file_name)
            if extension not in DOC_FORMATS:
                continue  # skip unsupported formats
            file_path = join(directory, file_name)
            print(' ', file_path)
            # Build indexed document
            doc = Document()
            doc.add(Field('name', file_name, metadata))
            doc.add(Field('path', directory, metadata))
            # Read file contents
            content = process(file_path, 'utf-8', method='pdfminer')
            abstract = extract_abstract(content)
            doc.add(Field('content', content, content_type))
            doc.add(Field('abstract', abstract, content_type))
            writer.addDocument(doc)
def indexDocs(self, root, writer):
    """Index (name, content) line pairs from the file named by
    ``args["inputFile"]`` — odd lines are names, even lines contents.

    Modernized from Python 2 and wrapped the input file in ``with`` so it
    is closed even if indexing fails mid-way.
    """
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    with open(str(args["inputFile"])) as inFile:
        indexName = inFile.readline()
        while indexName != '':
            print("adding", indexName)
            doc = Document()
            doc.add(Field("name", indexName, t1))
            # doc.add(Field("path", root, t1))
            text = inFile.readline()
            if len(text) > 0:
                print("contents: %s\n" % text)
                doc.add(Field("contents", text, t2))
            else:
                print("warning: no content in %s" % indexName)
            # Read the next name before adding, preserving the original
            # control flow (the document is added once per loop pass).
            indexName = inFile.readline()
            writer.addDocument(doc)
def build_index(self, dict_data):
    """Add one document per (id, content) entry of *dict_data*, then
    commit and close the writer while a Ticker thread shows progress."""
    print("loading data...")
    # Identifier: stored verbatim, never tokenized.
    id_type = FieldType()
    id_type.setStored(True)
    id_type.setTokenized(False)
    id_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    # Body text: stored and tokenized with positions.
    body_type = FieldType()
    body_type.setStored(True)
    body_type.setTokenized(True)
    body_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for doc_id, body in dict_data.items():
        document = Document()
        document.add(Field("id", doc_id, id_type))
        document.add(Field("content", body, body_type))
        self.writer.addDocument(document)
    ticker = Ticker()
    print("commit index")
    # Progress ticker runs while the (potentially slow) commit happens.
    threading.Thread(target=ticker.run).start()
    self.writer.commit()
    self.writer.close()
    ticker.tick = False
    print("done")
def indexDocs(self, img_url, toi, tid):
    """Download *img_url*, classify it by color via ``img_search_color``
    and save it under ``Picture_new/<color>/<toi>___<tid>.jpg``.

    Modernized from Python 2: ``urllib.urlopen`` (removed in Python 3) is
    replaced with ``urllib.request.urlopen``, ``except X, e`` updated, the
    HTTP response and local file are closed via context managers, and the
    commented-out alternative layout block was removed.
    """
    try:
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        print("Adding", img_url)
        name = "Pictures/1.jpg"
        # Download to a scratch file, then re-read it with OpenCV.
        with urllib.request.urlopen(img_url) as conn, open(name, 'wb') as f:
            f.write(conn.read())
        img = cv2.imread(name)
        sdf = img_search_color(img)
        storeDir = 'Picture_new/' + sdf.strs
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        cv2.imwrite(storeDir + '/' + str(toi) + '___' + str(tid) + '.jpg',
                    img)
    except Exception as e:
        print("Failed in indexDocs:", e)
def index_docs(self, train_set, writer):
    """Index quiz questions: answer/qid/category/position stored untokenized,
    question text and the answer's wiki page tokenized with full term
    vectors (offsets, payloads, positions)."""
    # Metadata fields: stored verbatim, single token each.
    meta = FieldType()
    meta.setIndexed(True)
    meta.setStored(True)
    meta.setTokenized(False)
    meta.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # Full-text fields: stored, tokenized, all term-vector data kept.
    text = FieldType()
    text.setIndexed(True)
    text.setStored(True)
    text.setTokenized(True)
    text.setStoreTermVectorOffsets(True)
    text.setStoreTermVectorPayloads(True)
    text.setStoreTermVectorPositions(True)
    text.setStoreTermVectors(True)
    text.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for record in train_set:
        doc = Document()
        doc.add(Field("answer", record['Answer'], meta))
        doc.add(Field("qid", record['Question ID'], meta))
        doc.add(Field("category", record['category'], meta))
        doc.add(Field("position", record['Sentence Position'], meta))
        doc.add(Field("question", record['Question Text'], text))
        doc.add(Field("wiki_plain",
                      self.wiki_reader.get_text(record['Answer']), text))
        writer.addDocument(doc)
def indexDocs(self, root, writer):
    """Index every .html file under *root* (iso-8859-1): name/path stored,
    raw HTML contents tokenized for search.

    Modernized from Python 2; files are read via ``with`` so handles are
    closed even when reading fails.
    """
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.html'):
                continue
            print("adding", filename)
            try:
                path = os.path.join(root, filename)
                with open(path, encoding='iso-8859-1') as fh:
                    contents = fh.read()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs:", e)
def indexDocs(self, root, writer):
    """Index a zipped wiki dump: each data line becomes one document.

    Expected line layout: "<title> <numeric id> <body>"; lines whose second
    token is not a number are skipped.

    NOTE(review): the outer ``for line in wiki`` reads (and discards) the
    first raw line of each archive member before the inner iterdecode loop
    consumes the rest — presumably skipping a header line; confirm this is
    intended and not an accidental double loop.
    """
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    wikiFile = ZipFile(root, 'r')
    files = wikiFile.namelist()
    i = 0
    # files[0] is skipped — assumes the first archive entry is not data
    # (e.g. a directory entry); TODO confirm against the dump format.
    for file in files[1:]:
        i += 1
        wiki = wikiFile.open(file, 'r')
        for line in wiki:
            for line in codecs.iterdecode(wiki, 'utf8'):
                # Normalize to NFD and split into title / id / body.
                normailized = unicodedata.normalize('NFD', line).split(' ', 2)
                if not normailized[1].isdigit():
                    continue
                docname = normailized[0] + ' ' + normailized[1]
                # Title with non-alphanumerics blanked, for the 'name' field.
                name = re.sub(r'[^a-zA-Z0-9]', ' ', normailized[0])
                contents = normailized[2]
                doc = Document()
                doc.add(Field('docname', docname, t1))
                doc.add(Field('name', name, t1))
                doc.add(Field('contents', contents, t1))
                writer.addDocument(doc)
        print('File %d done indexing' % i, file)
def index_docs(self, tweets, writer):
    """Index tweet text with term vectors, dropping URLs first.

    Each element of *tweets* is indexed via ``tweet[1].text``; tweets that
    are empty after URL removal are skipped.

    Modernized from Python 2 (``except X, e``, print statement) and the
    word filter now uses a single ``startswith`` tuple and a comprehension.
    """
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    t1.setStoreTermVectors(True)
    t1.setStoreTermVectorOffsets(True)
    # add each tweet to the index
    for tweet in tweets:
        try:
            # strip out URLs because they provide false index matches
            words = [word for word in tweet[1].text.split()
                     if not word.startswith(("http://", "https://"))]
            contents = " ".join(words)
            if len(contents) == 0:
                continue
            doc = Document()
            doc.add(Field("contents", contents, t1))
            writer.addDocument(doc)
        except Exception as e:
            print("Failed in index_docs:", e)
def indexDocs(self, url, writer):
    """Index an RSS/Atom feed: link and title stored verbatim, the
    description (with HTML tags stripped) indexed with offsets.

    Modernized from Python 2; the py2-only ``.encode("utf-8")`` /
    ``''.join`` round-trip was dropped — the tag-stripping regex now runs
    directly on the description string.
    """
    type1 = FieldType()
    type1.setIndexed(True)
    type1.setStored(True)
    type1.setTokenized(False)
    type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    type2 = FieldType()
    type2.setIndexed(True)
    type2.setStored(True)
    type2.setTokenized(True)
    type2.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
    # Read Feeds
    feeds = feedparser.parse(url)
    for item in feeds["entries"]:
        print("adding", item["title"])
        try:
            link = item["link"]
            # Strip HTML tags from the description before indexing.
            contents = re.sub('<[^<]+?>', '', item["description"])
            title = item["title"]
            doc = Document()
            doc.add(Field("url", link, type1))
            doc.add(Field("title", title, type1))
            if len(contents) > 0:
                doc.add(Field("contents", contents, type2))
            else:
                print("warning: no content in %s" % item["title"])
            writer.addDocument(doc)
        except Exception as e:
            print("Failed in indexDocs:", e)
def indexDocs(self, sourceDir, writer):
    """Index every .txt file under *sourceDir*: the filename is stored
    under 'name', the UTF-8 contents are tokenized into the globally
    configured ``queryField``."""
    # Filename field: stored verbatim.
    name_type = FieldType()
    name_type.setStored(True)
    name_type.setTokenized(False)
    name_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    # Content field: tokenized with positions, not stored.
    body_type = FieldType()
    body_type.setStored(False)
    body_type.setTokenized(True)
    body_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for sourceDir, dirnames, filenames in os.walk(sourceDir):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print(filename)
            try:
                path = os.path.join(sourceDir, filename)
                handle = open(path, 'r', encoding="utf-8")
                text = handle.read()
                doc = Document()
                doc.add(Field("name", filename, name_type))  # filename (title)
                if len(text) > 0:
                    doc.add(Field(queryField, text, body_type))  # content
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
                handle.close()
            except NameError:
                print("Failed in indexDocs:")
def indexDocs(self, root, writer):
    """Index ``<imgurl>\\t<itemurl>\\t<content>`` lines from the file at
    *root*; the content is segmented with jieba into 'contents'.

    Modernized from Python 2 and the input file is now closed via ``with``.
    """
    t1 = FieldType()  # stored only, not indexed (urls, raw title)
    t1.setIndexed(False)
    t1.setStored(True)
    t1.setTokenized(False)
    t3 = FieldType()  # tokenized by the configured analyzer (on whitespace)
    t3.setIndexed(True)
    t3.setStored(False)
    t3.setTokenized(True)
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    total = 0
    with open(root, "r") as fh:
        for line in fh:
            try:
                imgurl, itemurl, content = line.split('\t')
                total += 1
                print(total)
                print("adding", content)
                contents = ' '.join(jieba.cut(content))
                doc = Document()
                doc.add(Field("imgurl", imgurl, t1))
                doc.add(Field("itemurl", itemurl, t1))
                doc.add(Field("title", content, t1))
                doc.add(Field("contents", contents, t3))
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs:", e)
def _createNoTermsFrequencyFieldType():
    """Return a frozen FieldType that indexes documents only (no term
    frequencies), tokenized, with norms omitted and nothing stored."""
    no_freq = FieldType()
    no_freq.setIndexed(True)
    no_freq.setTokenized(True)
    no_freq.setOmitNorms(True)
    no_freq.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
    # Freeze so the shared type can't be mutated after creation.
    no_freq.freeze()
    return no_freq
def indexDocs(self, root, writer, urlDic):
    """Index crawled HTML files (Python 2 code).

    *urlDic* maps a local filename to the URL it was fetched from; each
    page contributes its title, url, host ('site') and jieba-segmented
    visible text to the index.
    """
    t1 = FieldType()  # stored only, not indexed: name/path/title/url
    t1.setIndexed(False)
    t1.setStored(True)
    t1.setTokenized(False)
    t2 = FieldType()  # searchable page text, not stored
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    t3 = FieldType()  # host field: indexed as a single token and stored
    t3.setIndexed(True)
    t3.setStored(True)
    t3.setTokenized(False)
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            # Only crawled page files are indexed.
            if not filename.endswith('.htm') and not filename.endswith(
                    '.html') and not filename.endswith(
                    '.com') and not filename.endswith('.cn'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                # Files not present in urlDic raise KeyError, which is
                # reported by the broad except below.
                url = urlDic[filename]
                # Split scheme://host/rest to recover the site host
                # (py2-only urllib helpers).
                proto, rest = urllib.splittype(url)
                site, rest = urllib.splithost(rest)
                file = open(path)
                contents = file.read()
                file.close()
                soup = BeautifulSoup(contents, features='html.parser')
                title = soup.title.string
                title = unicode(title).encode('utf-8')
                title = title.replace("\n", '')
                # Visible page text, segmented with jieba for searching.
                contents = soup.get_text().encode('utf-8')
                seg_list = jieba.cut(contents)
                contents = " ".join(seg_list)
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", path, t1))
                doc.add(Field("title", title, t1))
                doc.add(Field("url", url, t1))
                doc.add(Field("site", site, t3))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
def indexDocs(self, root, indextxt, writer):
    """Index recipe files: each file holds url, name, collect count,
    image url, main ingredients and preparation steps on successive lines.

    Modernized from Python 2; files are read via ``with``, the redundant
    decode/encode round-trip was removed, and comments were translated to
    English. The ``indextxt`` parameter is kept (unused) for interface
    compatibility.
    """
    t1 = FieldType()  # t1: indexed, stored, tokenized
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    t2 = FieldType()  # t2: not indexed, stored, tokenized
    t2.setIndexed(False)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            print("adding", filename)
            try:
                path = os.path.join(root, filename)
                with open(path, encoding='utf-8') as fh:
                    list1 = fh.read().split('\n')
                # Populate the document from the file's fixed line layout.
                doc = Document()
                url = list1[0]
                print('url : ' + url)
                doc.add(Field("url", url, t1))
                name = list1[1]
                print('name : ' + name)
                doc.add(Field("name", name, t1))
                collectnum = list1[2]
                print('collect_num : ' + collectnum)
                doc.add(Field("collect_num", collectnum, t2))
                img_url = list1[3]
                print('img_url : ' + img_url)
                doc.add(Field("img_url", img_url, t2))
                zhuliao = list1[4]  # main ingredients
                print(zhuliao)
                doc.add(Field("zhuliao", zhuliao, t1))
                # Preparation steps: tab-separated in the file, newline-joined.
                zuofa = '\n'.join(list1[5].split('\t'))
                print('zuofa : ' + zuofa)
                doc.add(Field("zuofa", zuofa, t2))
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs:", e)
def indexDocs(self, root, writer):
    """Index txt/pdf/xml/doc/odt files under *root*, first converting
    PDF/DOC/ODT to text via external tools (pdftotext/antiword/odttotext).

    Modernized from Python 2; the converted file is read via ``with`` so
    its handle is closed even on error, and the extension check uses a
    single ``endswith`` tuple.
    """
    # t1 is used for filenames and t2 is used for contents
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            # We can index only a certain types of files
            if not filename.endswith(('.txt', '.pdf', '.xml', '.doc', '.odt')):
                continue
            try:
                file_path = os.path.join(root, filename)
                # First convert PDF and DOC files to text.
                # NOTE(review): the commands below are built from filenames
                # and run through the shell — a hostile filename could
                # inject shell code; consider shell=False with an arg list.
                if filename.endswith('.pdf'):
                    outfile_path = os.path.join(root, filename.replace('.pdf', '.txt'))
                    cmd = 'pdftotext ' + '-layout ' + "'" + file_path + "'" + ' ' + "'" + outfile_path + "'"
                    subprocess.check_output(cmd, shell=True)
                    file_path = outfile_path
                elif filename.endswith('.doc'):
                    outfile_path = os.path.join(root, filename.replace('.doc', '.txt'))
                    cmd = 'antiword ' + file_path + ' >> ' + outfile_path
                    subprocess.check_output(cmd, shell=True)
                    file_path = outfile_path
                elif filename.endswith('.odt'):
                    outfile_path = os.path.join(root, filename.replace('.odt', '.txt'))
                    cmd = 'odttotext ' + '-layout ' + "'" + file_path + "'" + ' ' + "'" + outfile_path + "'"
                    subprocess.check_output(cmd, shell=True)
                    file_path = outfile_path
                with open(file_path, encoding='iso-8859-1') as fh:
                    contents = fh.read()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    logging.debug('warning: no content in %s', filename)
                writer.addDocument(doc)
            except Exception as e:
                logging.debug('Failed in indexDocs: %s', e)
def indexDocs_playlist(self, writer):
    """Index playlists from ``data/playlist_details2.db``.

    Each usable line is comma-separated as
    ``id , <name parts...> , author, image, tags, songIDs, sharecount,
    playcount, subscribedcount, commentcount`` — the name itself may
    contain commas, so everything between the id and the trailing 8
    fields is re-joined as the name. Lines shorter than 20 characters are
    counted as having no info.

    Modernized from Python 2; the database file is now closed via ``with``.
    """
    t1 = FieldType()  # stored only
    t1.setIndexed(False)
    t1.setStored(True)
    t1.setTokenized(False)
    t2 = FieldType()  # indexed, stored, tokenized
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # playlist
    success = 0
    fail = 0
    noinfo = 0
    with open('data/playlist_details2.db', "r") as playlists:
        for line in playlists:
            if len(line) < 20:
                noinfo += 1
                continue
            information = line.split(',')
            try:
                playID = information[0]
                playname = ' '.join(information[1:-8])
                (author, playImage, tags, songIDs, sharecount, playcount,
                 subscribedcount, commentcount) = information[-8:]
            except Exception:
                fail += 1
                print("fail")
                continue
            # Segment the searchable text fields with jieba.
            playname = ' '.join(jieba.cut(playname))
            author = ' '.join(jieba.cut(author))
            tags = ' '.join(jieba.cut(tags))
            tags = tags.replace("|", " ")
            doc = Document()
            doc.add(Field("ID", playID, t2))
            doc.add(Field("name", playname, t2))
            doc.add(Field("author", author, t2))
            doc.add(Field("image", playImage, t1))
            doc.add(Field("tags", tags, t2))
            doc.add(Field("songIDs", songIDs, t1))
            doc.add(Field("sharecount", sharecount, t1))
            doc.add(Field("playcount", playcount, t1))
            doc.add(Field("subscribedcount", subscribedcount, t1))
            doc.add(Field("commentcount", commentcount, t1))
            writer.addDocument(doc)
            print("歌单", playname, "成功添加")
            success += 1
def indexDocs(self, root, writer):
    """Index crawled .shtml pages: name/path/url/title stored verbatim,
    the jieba-segmented page text searchable. Commits every 100 files."""
    # Metadata: stored only, never indexed.
    meta_type = FieldType()
    meta_type.setStored(True)
    meta_type.setTokenized(False)
    meta_type.setIndexOptions(IndexOptions.NONE)  # Not Indexed
    # Page text: indexed with frequencies and positions, not stored.
    body_type = FieldType()
    body_type.setStored(False)
    body_type.setTokenized(True)
    body_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    count = 0
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.shtml'):
                continue
            try:
                path = os.path.join(root, filename)
                with open(path, 'r') as handle:
                    raw = handle.read()
                soup = BeautifulSoup(raw, features="html.parser")
                doc = Document()
                doc.add(Field("name", filename, meta_type))
                doc.add(Field("path", path, meta_type))
                doc.add(Field("url", self.relation[filename], meta_type))
                if len(raw) > 0:
                    page_title = soup.find('title').text
                    # All visible text, segmented with jieba for search.
                    body = ' '.join(jieba.lcut("".join(soup.findAll(text=True))))
                    doc.add(Field("title", page_title, meta_type))
                    doc.add(Field("contents", body, body_type))
                else:
                    doc.add(Field("title", "", meta_type))
                    doc.add(Field("contents", "", body_type))
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs:", e)
            count += 1
            if count % 100 == 0:
                writer.commit()
                print(count)
def create_minidoc(termstring, field='text'):
    """Wrap *termstring* in a single-field Document whose field stores
    term vectors (required for query expansion)."""
    # Stored, tokenized field type with term vectors enabled.
    vector_field = FieldType()
    vector_field.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    vector_field.setStored(True)
    vector_field.setTokenized(True)
    vector_field.setStoreTermVectors(True)
    minidoc = Document()
    minidoc.add(Field(field, termstring, vector_field))
    return minidoc
class Indexer(object):
    """Thin wrapper around an IndexWriter: creates the index directory,
    configures the analyzer/writer pair, and prepares shared field types."""

    def __init__(self, **kwargs):
        """Create an indexer writing to the directory named by ``root``
        (default "index"), creating it if absent.

        :param root: The output directory of the underlying index
        :param analyzer: The overloaded analyzer to work with
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            os.mkdir(self.output)
        # Cap token count per field on top of whichever analyzer was given.
        base_analyzer = kwargs.get("analyzer",
                                   StandardAnalyzer(Version.LUCENE_CURRENT))
        self.analyzer = LimitTokenCountAnalyzer(base_analyzer, 1048576)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """Add *document* to the index, logging (not raising) failures."""
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """Close the underlying writer, logging (not raising) failures."""
        try:
            # self.writer.optimize()
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """Build the FieldTypes describing how supplied fields are treated."""
        # 'clean' fields: stored verbatim and indexed as a single token.
        self.field_clean = FieldType()
        self.field_clean.setIndexed(True)
        self.field_clean.setStored(True)
        self.field_clean.setTokenized(False)
        self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        # 'dirty' fields: tokenized with positions, not stored.
        self.field_dirty = FieldType()
        self.field_dirty.setIndexed(True)
        self.field_dirty.setStored(False)
        self.field_dirty.setTokenized(True)
        self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
def indexDocs(self, root, writer): t1 = FieldType() #t1 is used in URL fields t1.setIndexed(False) t1.setStored(True) t1.setTokenized(False) t2 = FieldType() #t2 is used to index contents t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) t3 = FieldType() #t3 is used to index titles t3.setIndexed(True) t3.setStored(True) t3.setTokenized(True) t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) indextxt = open(self.filedir, 'r') while True: t = indextxt.readline() if (len(t) == 0): indextxt.close() return filename = t.strip() # for root, dirnames, filenames in os.walk(root): # for filename in filenames: print "updating", filename try: path = os.path.join(root, filename) file = open(path) title = file.readline() print title page_URL = file.readline() while True: imgsrc = file.readline() if (imgsrc == 'EOF'): file.close() break contents = file.readline() doc = Document() doc.add(Field("name", filename, t1)) doc.add(Field("imgurl", imgsrc, t1)) doc.add(Field("url", page_URL, t1)) doc.add(Field("title",title, t3)) doc.add(Field("contents", contents, t2)) writer.addDocument(doc) except Exception, e: print "Failed in indexDocs:", e
def indexDocs(self, root, writer):
    """Index song files under *root*: each file holds 8 newline-separated
    fields (artist, title, album, image url, genre, date, intro, lyrics);
    a jieba-segmented concatenation becomes the searchable 'contents'.

    Modernized from Python 2; the song file is closed via ``with`` even
    when parsing fails (the original leaked the handle on error).
    """
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for filename in os.listdir(root):
        print("adding", filename)
        try:
            path = os.path.join(root, filename)
            with open(path) as fh:
                totallist = fh.read().split("\n")
            geshou = totallist[0]   # artist
            geming = totallist[1]   # song title
            zhuanji = totallist[2]  # album
            imgurl = totallist[3]
            liupai = totallist[4]   # genre
            shijian = totallist[5]  # release date
            jianjie = totallist[6]  # introduction
            geci = totallist[7]     # lyrics
            # Searchable text = title+artist+album+genre+lyrics, segmented.
            contents = geming + geshou + zhuanji + liupai + geci
            contents = ' '.join(jieba.cut(contents))
            doc = Document()
            doc.add(Field("contents", contents, t2))
            doc.add(Field("geming", geming, t1))
            doc.add(Field("geshou", geshou, t1))
            doc.add(Field("zhuanji", zhuanji, t1))
            doc.add(Field("liupai", liupai, t1))
            doc.add(Field("geci", geci, t1))
            doc.add(Field("imgurl", imgurl, t1))
            doc.add(Field("shijian", shijian, t1))
            doc.add(Field("jianjie", jianjie, t1))
            writer.addDocument(doc)
        except Exception as e:
            print("Failed in indexDocs:", e)
def indexDocs(self, root, writer):
    """Index pages listed in ``self.filedir`` ('<url> <filename>' per
    line): the first line of each page file is its title, the rest is the
    searchable content.

    Modernized from Python 2 (``unicode``, print statements,
    ``except X, e``); both the listing and the page files are closed via
    ``with`` (the originals leaked handles on error).
    """
    t1 = FieldType()  # t1 is used in path and URL fields
    t1.setIndexed(False)
    t1.setStored(True)
    t1.setTokenized(False)
    t2 = FieldType()  # t2 is used to index contents
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    t3 = FieldType()  # t3 is used to index titles
    t3.setIndexed(True)
    t3.setStored(True)
    t3.setTokenized(True)
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    with open(self.filedir, 'r') as indextxt:
        while True:
            t = indextxt.readline()
            if len(t) == 0:
                return
            parts = t.split()
            URL, filename = parts[0], parts[1]
            print("adding", filename)
            try:
                path = os.path.join(root, filename)
                with open(path) as fh:
                    title = fh.readline()
                    print(title)
                    contents = fh.read()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", path, t1))
                doc.add(Field("url", URL, t1))
                doc.add(Field("title", title, t3))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs:", e)
def indexDocs(self, root, writer): t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for root, dirnames, filenames in os.walk(root): for filename in filenames: # if not filename.endswith('.txt'): # continue print "adding", filename try: path = os.path.join(root, filename) pwd = os.path.join(os.getcwd() + '/' + root, filename) file = open(path) url = file.readline() title = file.readline() tag = file.readline() imgurl = file.readline() price = file.readline() wellrate = file.readline() comment = file.readline() contents = unicode(file.read(), 'utf8') file.close() doc = Document() doc.add(Field('url', url, t1)) doc.add(Field('title', title, t1)) doc.add(Field('imgurl', imgurl, t1)) doc.add(Field('price', price, t1)) doc.add(Field('wellrate', wellrate, t1)) doc.add(Field('comment', comment, t1)) if len(tag) > 2: doc.add(Field('tag', tag, t2)) else: doc.add(Field('tag', ' ', t2)) #doc.add(Field('comment', comment, t1)) writer.addDocument(doc) except Exception, e: print "Failed in indexDocs:", e
def indexDocs(self, dialog, root, writer):
    """Index HTML pages listed in the `dialog` manifest.

    Each manifest line is "<url>\t<filename>\t<title>"; the page body is
    loaded from `root`/<filename>, its title and visible text are
    extracted with BeautifulSoup, and the text is passed through
    analysis() before being indexed.

    NOTE(review): the outer `try:` below has no matching `except`/`finally`
    in this chunk -- the handler appears to have been lost or truncated;
    restore it before running, or this is a SyntaxError.
    """
    # t1: stored, untokenized -- name/path metadata.
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # t2: unstored, tokenized -- the searchable page text.
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    f = open(dialog)
    lines = f.readlines()
    for line in lines:
        info = line.split('\t')
        url = info[0]
        filename = info[1]
        title = info[2]  # manifest title; overwritten by <title> when parseable
        print "adding", filename
        try:
            path = os.path.join(root, filename)
            file = open(path)
            contents = unicode(file.read(), 'utf-8')
            file.close()  # NOTE(review): handle leaks if read()/decode raises
            doc = Document()
            doc.add(Field('name', filename, t1))
            doc.add(Field('path', path, t1))
            doc.add(
                Field('url', url, Field.Store.YES, Field.Index.NOT_ANALYZED))
            if len(contents) > 0:
                soup = BeautifulSoup(contents)
                try:
                    # Prefer the page's own <title>; fall back to 'none'.
                    title = soup.title.text
                    doc.add(
                        Field('title', title, Field.Store.YES, Field.Index.ANALYZED))
                except Exception, e:
                    doc.add(
                        Field('title', 'none', Field.Store.YES, Field.Index.ANALYZED))
                # Visible text only: concatenate every text node.
                contents = ''.join(soup.findAll(text=True))
                doc.add(Field("contents", analysis(contents), t2))
            else:
                print "warning: no content in %s" % filename
            writer.addDocument(doc)
def index_article(writer, art_id, art_body):
    """Add one article to the index.

    'art_id' is stored untokenized (exact-match lookup key);
    'art_body' is stored and analyzed (full-text search).
    """
    id_type = FieldType()
    id_type.setIndexed(True)
    id_type.setStored(True)
    id_type.setTokenized(False)
    id_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    body_type = FieldType()
    body_type.setIndexed(True)
    body_type.setStored(True)
    body_type.setTokenized(True)
    body_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    article = Document()
    article.add(Field("art_id", str(art_id), id_type))
    article.add(Field("art_body", art_body, body_type))
    writer.addDocument(article)
def setUp(self):
    """Fixture for PyLucene regression Test_Bug1842: write a one-document
    index whose 'all' field is tokenized and stores term vectors."""
    super(Test_Bug1842, self).setUp()
    self.analyzer = StandardAnalyzer()
    w1 = self.getWriter(analyzer=self.analyzer)
    doc1 = Document()
    ftype = FieldType()
    ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    ftype.setTokenized(True)
    ftype.setStoreTermVectors(True)  # term vectors are what the bug exercises
    ftype.freeze()  # lock the FieldType before handing it to Field
    doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
    doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))
    w1.addDocument(doc1)
    w1.close()
def lazyImport():
    """Import the Lucene classes on first use, after starting the JVM.

    These imports cannot run at module load time because the JVM must be
    initialised first.  The trailing globals().update(locals()) publishes
    every name imported here into the module namespace so the rest of the
    module can use them as if they were top-level imports.
    """
    global imported
    if imported:
        return  # already done on a previous call
    from meresco.pylucene import getJVM
    getJVM()  # ensure the JVM is running before any org.apache.* import
    from java.nio.file import Paths
    from org.apache.lucene.document import Document, StringField, Field, FieldType
    from org.apache.lucene.search import IndexSearcher, TermQuery
    from org.apache.lucene.index import DirectoryReader, Term, IndexWriter, IndexWriterConfig, IndexOptions
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from org.apache.lucene.analysis.core import WhitespaceAnalyzer
    # Field type for stored-only values (not searchable).
    UNINDEXED_TYPE = FieldType()
    UNINDEXED_TYPE.setIndexOptions(IndexOptions.NONE)
    UNINDEXED_TYPE.setStored(True)
    UNINDEXED_TYPE.setTokenized(False)
    imported = True
    globals().update(locals())  # export all of the above to module scope
def indexDocs(self, root, writer): #Create a new FieldType with default properties. t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False)#True if this field's value should be analyzed by the Analyzer. t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) #Create a new FieldType with default properties. t2 = FieldType() t2.setIndexed(True) t2.setStored(True) t2.setTokenized(True)#True if this field's value should be analyzed by the Analyzer. t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) for root, dirnames, filenames in os.walk(root): for filename in filenames: if not filename.endswith('.txt'): continue print 'adding', filename try: path = os.path.join(root, filename) file = open(path) contents = file.read() file.close() doc = Document() doc.add(Field('name', filename, t1)) doc.add(Field('path', root, t1)) if len(contents) > 0: doc.add(Field('contents', contents, t2)) print 'length of content is %d'%(len(contents)) else: print 'warning: no content in %s' % filename writer.addDocument(doc) except Exception, e: print 'Failed in indexDocs:', e
def indexDocs(self,root,writer): t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(True) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for root, dirnames,filenames in os.walk(root): # traverse through the doc directory for filename in filenames: # only if this file ends with '.c' if not filename.endswith('.c'): continue try: # only add the filename and path for indexing path = os.path.join(root,filename) print "adding file : ",path file = open(path) contents = unicode(file.read(),'utf-8') file.close() doc = Document() doc.add(Field("name",filename,t1)) doc.add(Field("path",root,t1)) # if len(contents) > 0: # doc.add(Field("contents",contents,t2)) # else: # print "warning: no content in ",filename writer.addDocument(doc) except Exception,e: print "failed in indexDocs:",e
def index_wiki(wiki_xmlfile, index_directory_name):
    """Build a fresh Lucene index from a wiki corpus XML dump.

    Every document from wikicorpusxml() contributes three stored fields:
    'contents' (element text), 'title' and 'url' (both pulled from the
    element's attributes).  The index at `index_directory_name` is
    (re)created from scratch.
    """
    lucene.initVM()
    # Writer setup: standard analyzer, CREATE mode (overwrite existing).
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    def make_fieldtype(tokenized):
        # Shared shape of all three field types: indexed + stored, full
        # position data; only tokenization differs.
        ft = FieldType()
        ft.setIndexed(True)
        ft.setStored(True)
        ft.setTokenized(tokenized)
        ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        return ft

    content_fieldtype = make_fieldtype(True)
    title_fieldtype = make_fieldtype(True)
    url_fieldtype = make_fieldtype(False)  # URLs stay a single term

    def attribute(xml, marker):
        # Value of the attribute introduced by `marker`, e.g. ' title="'.
        return xml.partition(marker)[2].partition('"')[0].strip()

    for xmldoc in wikicorpusxml((wiki_xmlfile)):
        element_text = xmldoc.partition('>')[2].partition('<')[0].strip()
        doc = Document()
        doc.add(Field("contents", element_text, content_fieldtype))
        doc.add(Field("title", attribute(xmldoc, ' title="'), title_fieldtype))
        doc.add(Field("url", attribute(xmldoc, ' url="'), url_fieldtype))
        writer.addDocument(doc)
    writer.commit()
    writer.close()
def indexDocs(self, writer):
    """Index douban subjects from <rawDir>/<folder>/<id>_res.json together
    with the matching adjective file <adjDir>/<folder>/<id>_adj.json."""
    # t1: stored, untokenized -- metadata fields.
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # t2: unstored, tokenized -- searchable adjective fields.
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for folder in os.listdir(rawDir):
        for fileName in os.listdir(rawDir+'/'+folder):
            if not fileName.endswith('.json'):
                continue
            print fileName
            # file names look like "<subject_id>_res.json"
            subject_id = fileName.split('_')[0]
            # print rawDict
            print 'id:'+subject_id
            print 'folder:'+str(folder)
            rawPath = rawDir+'/' +folder +'/'+subject_id+'_res.json'
            adjPath = adjDir+'/' +folder +'/'+subject_id+'_adj.json'
            #tfidfPath = tf_idfDir + '/' + subject_id+'_tfidf.json'
            print adjPath
            rawFile = open(rawPath,'r')
            raw = rawFile.read()
            if raw =='':
                # record subjects whose raw crawl came back empty
                with open(baseDir+'/'+'err_no_raw_content.txt','a') as err:
                    err.write(subject_id+'\n')
            # if subject_id =='6018943':
            # rawFile.seek(0)
            rawFile.close()
            adjFile = open(adjPath,'r')
            adj = adjFile.read()
            adjFile.close()
            # strip a possible UTF-8 BOM before json.loads
            raw = getRidOfBOM(raw)
            adj = getRidOfBOM(adj)
            if raw != '':
                rawDict = json.loads(raw)
                adjDict = json.loads(adj)
                # NOTE(review): rawAll is computed but never used below.
                rawAll = rawDict['summary'] +' '+ rawDict['user_tags'] + ' '+rawDict['comments']
                summary_adjs = adjDict['summary_adjs']
                comments_adjs = adjDict['comments_adjs']
                title = adjDict['title']
                rating_average = adjDict['rating_average']
                comments_count = adjDict['comments_count']
                doc = Document()
                doc.add(Field("folder",folder,t1))
                doc.add(Field("title", title, t1))
                doc.add(Field("subject_id", subject_id, t1))
                doc.add(IntField("comments_count", comments_count, Field.Store.YES))
                doc.add(FloatField("rating_average", rating_average, Field.Store.YES))
                if len(summary_adjs)>0:
                    print summary_adjs
                    # NOTE(review): debug leftover -- aborts the entire run at
                    # the first non-empty summary_adjs; confirm and remove.
                    exit()
                doc.add(Field("summary_adjs", summary_adjs, t2))
                #if len(comments_adjs)>0:
                doc.add(Field("comments_adjs", comments_adjs, t2))
                writer.addDocument(doc)
def tweetIndexer(self, writer):
    """Index tweets from json/tweets-0.json, json/tweets-1.json, ...
    (max 500 files, stopping at the first missing one; one JSON tweet
    object per line)."""
    # t1: stored, untokenized -- the source file name.
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # t2: stored + tokenized, with full term vectors
    # (offsets, payloads, positions) for highlighting.
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPayloads(True)
    t2.setStoreTermVectorPositions(True)
    t2.setStoreTermVectors(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    x = 0  # count of tweets successfully indexed
    for i in range(0,500):
        if not os.path.isfile("json/tweets-" + str(i) + ".json"):
            break  # files are numbered consecutively; stop at the first gap
        print "adding tweets-" + str(i) + ".json"
        tweets = open("json/tweets-" + str(i) + ".json", "r")
        for line in tweets.readlines():
            tweet = json.loads(line)
            if 'limit' in tweet:
                continue  # stream rate-limit notices are not tweets
            try:
                doc = Document()
                doc.add(Field("file", "json/tweets-" + str(i) + ".json", t1))
                sname = tweet['user']['screen_name']
                tid = str(tweet['id'])
                text = tweet['text']
                uname = tweet['user']['name']
                created = tweet['created_at']
                tstamp = tweet['timestamp_ms']
                place = ""
                if tweet['place']:
                    place = tweet['place']['full_name'] + ", " + tweet['place']['country']
                lat = ""
                lng = ""
                titles = ""
                urls = ""
                exist = "false"
                if tweet['coordinates']:
                    lat = str(tweet['coordinates']['coordinates'][1])
                    lng = str(tweet['coordinates']['coordinates'][0])
                else:
                    # no exact point: use the centre of the place bounding box
                    lat = str((tweet['place']['bounding_box']['coordinates'][0][0][1] + tweet['place']['bounding_box']['coordinates'][0][2][1])/2)
                    lng = str((tweet['place']['bounding_box']['coordinates'][0][0][0] + tweet['place']['bounding_box']['coordinates'][0][2][0])/2)
                if len(tweet['entities']['urls']) != 0:
                    exist = "true"
                    for index in range(len(tweet['entities']['urls'])):
                        title = tweet['entities']['urls'][index]['url_title']
                        if title == None:
                            titles += ",-"  # placeholder for a missing title
                        else:
                            title = title.encode('ascii','ignore')
                            titles += "," + str(title)
                        urls += " " + str(tweet['entities']['urls'][index]['expanded_url'])
                # one catch-all field so a single query can match any part of the tweet
                searchable = text + " " + urls + " " + uname + " " + sname + " " + place
                doc.add(Field("lookup", searchable, t2))
                doc.add(Field("text", text, t2))
                doc.add(Field("user_name", uname, t2))
                doc.add(Field("screen_name", sname, t2))
                doc.add(Field("tweet_id", tid, t2))
                doc.add(Field("created_at", created, t2))
                doc.add(Field("geo_lat", lat, t2))
                doc.add(Field("geo_lng", lng, t2))
                doc.add(Field("url_exist", exist, t2))
                doc.add(Field("url_url", urls, t2))
                doc.add(Field("url_title", titles, t2))
                doc.add(Field("timestamp", tstamp, t2))
                writer.addDocument(doc)
                x += 1
            except Exception, e:
                # NOTE(review): any malformed tweet (e.g. no coordinates AND
                # no place) is silently dropped; consider at least counting
                # or logging these.
                pass
        tweets.close()
def __init__(self):
    """Build the admin search engine: an IndexWriter configured from
    settings.ADMINS_ENGINE, with per-field analyzers and field types."""
    self.mDocumentDirectory = settings.ADMINS_ENGINE.mDocumentDirectory
    self.mIndexDirectory = settings.ADMINS_ENGINE.mIndexDirectory
    self.mAnalyzers = settings.ADMINS_ENGINE.getIndexingAnalyzers()

    ############################# Writer Configurattion #####################################
    # Route each field to its own analyzer; 'default' covers anything unmapped.
    map = HashMap()
    map.put('name', self.mAnalyzers['name'])
    map.put('parent', self.mAnalyzers['parent'])
    map.put('content', self.mAnalyzers['default'])
    map.put('id', self.mAnalyzers['id'])
    analyzerWrapper = PerFieldAnalyzerWrapper(self.mAnalyzers['default'], map)

    self.mWriterConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzerWrapper)
    self.mWriterConfig.setOpenMode(settings.ADMINS_ENGINE.mOpenMode)

    if settings.ADMINS_ENGINE.mSimilarity != None:
        self.mWriterConfig.setSimilarity(settings.ADMINS_ENGINE.mSimilarity)
    ########################################################################################

    directory = SimpleFSDirectory(File(self.mIndexDirectory))
    self.mIndexWriter = IndexWriter(directory, self.mWriterConfig)

    ############################# FieldType Prepration #####################
    # name/parent/id use DOCS_ONLY (no frequency/position data needed);
    # content keeps full positions+offsets for phrase search/highlighting.
    nameField = FieldType()
    nameField.setIndexed(True)
    nameField.setStored(True)
    nameField.setTokenized(True)
    nameField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

    parentField = FieldType()
    parentField.setIndexed(True)
    parentField.setStored(True)
    parentField.setTokenized(True)
    parentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

    contentField = FieldType()
    contentField.setIndexed(True)
    contentField.setStored(True)
    contentField.setTokenized(True)
    contentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    idField = FieldType()
    idField.setIndexed(True)
    idField.setStored(True)
    idField.setTokenized(False)  # ids are exact-match terms
    idField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

    self.mFieldTypes = {
        'name' : nameField,
        'parent' : parentField,
        'content' : contentField,
        'id' : idField
    }
    #######################################################################
    self.mLog = ""  # accumulated log output
def indexCranFull(path, writer):
    """
    This method reads in the cran.1400 file and creates an index out of it.
    These fields are used when storing documents:
    title: title of the document (in a line starting with .T)
    author: author of the document (in a line starting with .A)
    source: where this article has been published, not yet used (in a line starting with .B)
    content: body text of the article (in a line starting with .W)
    """
    # Title field type
    tft = FieldType()
    tft.setIndexed(True)
    tft.setStored(True)
    tft.setTokenized(TokenizeFields)
    tft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)  #only index the document and frequency data
    # Author field type
    aft = FieldType()
    aft.setIndexed(True)
    aft.setStored(True)
    aft.setTokenized(TokenizeFields)
    aft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)  #index the document, term frequency and position data
    # Content field type
    cft = FieldType()
    cft.setIndexed(True)
    cft.setStored(True)
    cft.setTokenized(True)
    cft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    txt = open(path)
    """
    cran documents are listed in this order:
    .I # -> ignore
    .T -> starting on the next line, multi-line title
    .A -> on the next line, multiple authored separated by 'and'
    .B -> on the next line, ignore
    .W -> starting on the next line, multi-line body (content)
    """
    docid = 0
    debug = False  # NOTE(review): set below but never read
    while True:
        # in each iteration, read all the lines corresponding to a document
        line = txt.readline()
        if line == '':
            break  # EOF
        if docid == 1400:
            debug = True
        if line.startswith('.I'):
            docid = int(line.split(' ')[1].strip())
            continue
        if line.startswith('.T'):
            # accumulate the (possibly multi-line) title until '.A'
            title = ''
            while True:
                line = txt.readline()
                if line.startswith('.A'):
                    break
                title = ' '.join([title, line]).strip()
            line = txt.readline()
            # authors = ' '.join(line.split('and'))
            authors = line.strip()
            #.B, its corresponding line and .W
            txt.readline()
            txt.readline()
            txt.readline()
            # accumulate the body until the next '.I' (or EOF)
            body = ''
            while True:
                line = txt.readline()
                if line.startswith('.I'):
                    # next document starts here: grab its id before emitting
                    docid = int(line.split(' ')[1].strip())
                    break
                if line == '':
                    break
                body = ' '.join([body, line]).strip()
            doc = Document()
            doc.add(Field(title_field, title, tft))
            doc.add(Field(author_field, authors, aft))
            doc.add(Field(content_field, body, cft))
            doc.add(IntField(docid_field, docid, Field.Store.YES))
            writer.addDocument(doc)
# NOTE(review): the names below are the tail of an import statement whose
# head (likely "from org.apache.lucene.index import ...") lies outside this
# chunk; left untouched.
IndexWriterConfig, IndexWriter, DirectoryReader, IndexOptions

if __name__ == '__main__':
    # Demo: index two sentences into an in-memory index with term vectors
    # enabled, then read each document's term vector back.
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = RAMDirectory()
    # cap each document at its first 100 tokens
    iconfig = IndexWriterConfig(LimitTokenCountAnalyzer(StandardAnalyzer(), 100))
    iwriter = IndexWriter(directory, iconfig)
    # stored + tokenized, with term vectors including offsets and positions
    ft = FieldType()
    ft.setStored(True)
    ft.setTokenized(True)
    ft.setStoreTermVectors(True)
    ft.setStoreTermVectorOffsets(True)
    ft.setStoreTermVectorPositions(True)
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
    ts = ["this bernhard is the text to be index text",
          "this claudia is the text to be indexed"]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t, ft))
        iwriter.addDocument(doc)
    iwriter.commit()
    iwriter.close()
    # Re-open the index and walk the stored term vectors.
    ireader = DirectoryReader.open(directory)
    for doc in xrange(0, len(ts)):
        tv = ireader.getTermVector(doc, "fieldname")
        termsEnum = tv.iterator()
def indexDocs(self, root, writer):
    """Walk `root`, parse each HTML file with HTMLDocumentParser, and index
    filename, path, title, description, month, year, authors, keywords,
    timestamp and full contents per document."""
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            print "adding", filename
            doc_parser = HTMLDocumentParser()
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'iso-8859-1')
                doc_parser.feed(contents)
                contents = doc_parser.contents
                html_doc = HTMLDocument(contents)
                flag = False  # flip to True for verbose per-document debug output
                if flag:
                    print '=============='
                    print 'Title: ' + html_doc.title
                    print 'Description: ' + html_doc.description
                    print 'Month: ' + html_doc.month
                    print 'Year: ' + html_doc.year
                    print 'Authors: ' + str(html_doc.authors)
                    print 'Keywords: ' + str(html_doc.keywords)
                    print 'Timestamp: ' + str(html_doc.timestamp)
                    print ' '
                file.close()  # NOTE(review): handle leaks if read/feed raises
                doc = Document()
                # filename: stored, untokenized; '.html' suffix stripped
                field_filename = FieldType()
                field_filename.setIndexed(True)
                field_filename.setStored(True)
                field_filename.setTokenized(False)
                field_filename.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
                doc.add(Field("filename", filename.replace('.html', ''), field_filename))
                # path: stored + analyzed
                field_path = FieldType()
                field_path.setIndexed(True)
                field_path.setStored(True)
                field_path.setTokenized(True)
                field_path.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
                doc.add(Field("path", root, field_path))
                # title: stored + analyzed
                field_title = FieldType()
                field_title.setIndexed(True)
                field_title.setStored(True)
                field_title.setTokenized(True)
                field_title.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
                doc.add(Field("title", html_doc.title, field_title))
                field_description = FieldType()
                # NOTE(review): both branches below are identical -- the else
                # branch was presumably meant to be setIndexed(False) when no
                # description exists; confirm intent before changing.
                if html_doc.has_description():
                    field_description.setIndexed(True)
                else:
                    field_description.setIndexed(True)
                field_description.setStored(True)
                field_description.setTokenized(True)
                field_description.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
                doc.add(Field("description", html_doc.description, field_description))
                # month/year: stored, exact terms
                field_month = FieldType()
                field_month.setIndexed(True)
                field_month.setStored(True)
                field_month.setTokenized(False)
                field_month.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
                doc.add(Field("month", html_doc.month, field_month))
                field_year = FieldType()
                field_year.setIndexed(True)
                field_year.setStored(True)
                field_year.setTokenized(False)
                field_year.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
                doc.add(Field("year", html_doc.year, field_year))
                # authors/keywords: one multivalued field entry per item
                if html_doc.has_authors():
                    field_author = FieldType()
                    field_author.setIndexed(True)
                    field_author.setStored(True)
                    field_author.setTokenized(True)
                    field_author.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
                    for author in html_doc.authors:
                        doc.add(Field("author", author, field_author))
                if html_doc.has_keywords():
                    field_keyword = FieldType()
                    field_keyword.setIndexed(True)
                    field_keyword.setStored(True)
                    field_keyword.setTokenized(True)
                    field_keyword.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
                    for keyword in html_doc.keywords:
                        doc.add(Field("keyword", keyword, field_keyword))
                # timestamp: stored only (setIndexed(False))
                field_timestamp = FieldType()
                field_timestamp.setIndexed(False)
                field_timestamp.setStored(True)
                field_timestamp.setTokenized(False)
                field_timestamp.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
                doc.add(Field("timestamp", html_doc.timestamp, field_timestamp))
                if len(contents) > 0:
                    # full parsed text: stored + analyzed
                    field_source = FieldType()
                    field_source.setIndexed(True)
                    field_source.setStored(True)
                    field_source.setTokenized(True)
                    field_source.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
                    doc.add(Field("contents", contents, field_source))
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
def indexTable(self, writer):
    """Index every row of MySQL table movie_items (database 'moviedata')."""
    #connection
    con = None
    #define the index of all the fields
    #---------step 2----------
    con = mdb.connect('localhost','root','testgce','moviedata')
    #t_num = FieldType.NumericType it is wrong!!
    # NOTE(review): t_num is configured but never used below.
    t_num = FieldType()
    t_num.setStored(False)
    # t1: stored, untokenized -- exact values (e.g. image URL)
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # t2: unstored, tokenized free text
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    # t3: stored AND tokenized -- facet-like text shown in results
    t3 = FieldType()
    t3.setIndexed(True)
    t3.setStored(True)
    t3.setTokenized(True)
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    with con:
        # Careful with codecs
        con.set_character_set('utf8')
        cur = con.cursor()
        # Aagin the codecs
        cur.execute('SET NAMES utf8;')
        cur.execute('SET CHARACTER SET utf8;')
        cur.execute('SET character_set_connection=utf8;')
        #------step 3------
        cur.execute("SELECT * FROM movie_items")
        numrows = int(cur.rowcount)
        print 'numrows:',numrows
        for i in range(numrows):
            row = cur.fetchone()
            #------step 4------
            summary = row[SUMMARY]
            subject_id = row[SUBJECT_ID]
            print 'id'+subject_id
            #print 'summary'+summary+'end'
            doc = Document()
            #fields which should not be analyzed
            doc.add(FloatField("rating_average",float(row[RATING_AVERAGE]),Field.Store.NO))
            doc.add(FloatField("rating_stars", float(row[RATING_STARS]), Field.Store.NO))
            doc.add(IntField("reviews_count", int(row[REVIEWS_COUNT]), Field.Store.NO))
            #doc.add(FloatField("year", float(row[YEAR]), Field.Store.NO))
            doc.add(IntField("collect_count", int(row[COLLECT_COUNT]), Field.Store.NO))
            doc.add(IntField("subject_id", int(subject_id), Field.Store.YES))
            doc.add(IntField("comments_count", int(row[COMMENTS_COUNT]), Field.Store.NO))
            doc.add(IntField("ratings_count", int(row[RATINGS_COUNT]), Field.Store.NO))
            doc.add(Field("image_small", row[IMAGE_SMALL], t1))
            #fields which should be analyzed with WhitespaceAnalyzer
            # delimiter-separated lists become space-separated terms
            doc.add(Field("countries", row[COUNTRIES].replace(delim,' '), t3))
            doc.add(Field("casts", row[CASTS].replace(delim,' '), t3))
            doc.add(Field("genres", row[GENRES].replace(delim,' '), t3))
            doc.add(Field("subtype", row[SUBTYPE].replace(delim,' '), t2))
            doc.add(Field("directors", row[DIRECTORS].replace(delim,' '), t3))
            user_tags_str = ''
            others_like_str = ''
            # print 'user_tags'+row[USER_TAGS]
            # print 'others_like'+row[OTHERS_LIKE]
            if row[USER_TAGS]!='':
                for tag_pair in row[USER_TAGS].split(delim):
                    # the string ends with the delimiter, so split() leaves a
                    # trailing empty element -- skip it
                    if tag_pair!='':
                        user_tags_str = user_tags_str +' '+tag_pair.split(delim_uo)[0]
            if row[OTHERS_LIKE]!='':
                for like_pair in row[OTHERS_LIKE].split(delim):
                    if like_pair!='':
                        others_like_str = others_like_str +' '+like_pair.split(delim_uo)[1]
            # print user_tags_str
            # print others_like_str
            doc.add(Field("user_tags", user_tags_str, t3))
            doc.add(Field("others_like", others_like_str, t3))
            #fields which should be analyzed with good analyzer
            doc.add(Field("title", row[TITLE], t3))
            doc.add(Field("original_title", row[ORIGINAL_TITLE], t2))
            doc.add(Field("summary_segmentation", row[SUMMARY_SEGMENTATION], t2))
            doc.add(Field("aka", row[AKA], t2))
            if len(summary) > 0:
                print subject_id +'--->'+':\n '+ row[TITLE]
                try:
                    # NOTE(review): summary_unicoded is unused -- the decode
                    # only verifies the encoding; failures are just printed.
                    summary_unicoded = unicode(summary, 'utf-8') #test the encoding
                except Exception,e:
                    print "Decode Failed: ", e
                doc.add(Field('summary', summary, t2))
            else:
                print "warning:\n" + subject_id +'---> No content!'
            writer.addDocument(doc)
def reindex(self):
    ''' Re-indexes the entire database into Index file'''
    start = time.time()
    # get all posts
    posts = self._tuples_to_dict(self._fetch_all_questions(), self._posts_fields)
    if not posts:
        raise Exception("FATAL Error: Could not fetch posts from Database")
    # open indexer
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # print 'lucene', lucene.VERSION
    store = SimpleFSDirectory(File(self.index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(store, config)
    # indexedField: searchable AND returned with results
    indexedField = FieldType()
    indexedField.setIndexed(True)
    indexedField.setStored(True)
    indexedField.setTokenized(True)
    indexedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    # storedField: returned with results but not searchable
    storedField = FieldType()
    storedField.setIndexed(False)
    storedField.setStored(True)
    storedField.setTokenized(False)
    storedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    fieldTypes = {
        'type' : storedField,
        'id' : storedField,
        'title' : indexedField,
        'question' : indexedField,
        'answer' : indexedField,
        # 'comment' : indexedField,
        'tag' : indexedField,
        'extra' : indexedField,
    }
    # get their comments
    num_docs = 0  # one document is emitted per (post, answer) pair
    for post in posts:
        # NOTE(review): 142627.0 is a hard-coded expected total used only
        # for the progress display.
        if self.status_mode:
            print "\r {0:.2f} %complete".format(((num_docs/142627.0)*100)),
        if self.debug : print "\n","*"*20,"\nIndexing post: ", post['id'], "from ", post['extra']
        if self.debug and self.verbose_values: print post
        answers = self._tuples_to_dict(self._fetch_all_answers(post['id'], post['extra']), self._answer_fields)
        # add comment field
        for answer in answers:
            num_docs += 1
            if self.debug: print "\n","+"*10, "\nMaking new Document"
            doc = Document()
            if self.debug: print "Adding doc type"
            doc.add(Field("type", self.doctype, fieldTypes['type']))
            # make fields: copy every post field into the document, boosted
            if self.debug: print "Adding post fields"
            for i in xrange(len(self._posts_fields)):
                f = Field(self._posts_fields[i], self._cleanup_tag(post[self._posts_fields[i]]), fieldTypes[self._posts_fields[i]])
                f.setBoost(self._fields_boost[self._posts_fields[i]])
                doc.add(f)
            if self.status_mode: print "\t Indexing answer: ", answer['answer_id']
            if self.debug and self.verbose_values: print answer
            # answered_doc = copy.deepcopy(doc)
            # make comment field
            f = Field("answer", self._cleanup_tag(answer['answer']), fieldTypes['answer'])
            f.setBoost(self._fields_boost['answer'])
            doc.add(f)
            # calculate paths
            # commented_doc = copy.deepcopy(answered_doc)
            # comments = self._comments_to_comment_string(self._tuples_to_dict(self._fetch_all_comments(answer['id']), self._comment_fields))
            # if self.debug: print "\t\tAdding comments: ", comments
            # commented_doc.add(Field("comment", self._cleanup_tag(comments), fieldTypes['comment']))
            # write index
            if self.debug: print "\tAdding document {doc_id} to index".format(doc_id=post['id'])
            writer.addDocument(doc)
            # del answered_doc
            # del commented_doc
    if self.debug: print "Commiting document to index"
    writer.commit()
    # close index
    if self.status_mode: print "Closing index write"
    writer.close()
    end = time.time() - start
    if self.status_mode:
        print "\n","-"*20, \
            "\nTotal time spent in indexing: ", end, "seconds" \
            "\nIndexed {num_docs} documents".format(num_docs=num_docs)
def indexTable(self, writer): #connection con = None #define the index of all the fields #---------step 2:connect to mysql---------- con = mdb.connect('localhost','root','testgce','douban_movie_v3') #t_num = FieldType.NumericType it is wrong!! t_num = FieldType() t_num.setStored(False) t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) t3 = FieldType() t3.setIndexed(True) t3.setStored(True) t3.setTokenized(True) t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) maxDict = utils.maxDict #加权数值范围 base = DOC_BOOST_RANGE[0] upper = DOC_BOOST_RANGE[1] with con: # Careful with codecs con.set_character_set('utf8') cur = con.cursor() # Aagin the codecs cur.execute('SET NAMES utf8;') cur.execute('SET CHARACTER SET utf8;') cur.execute('SET character_set_connection=utf8;') #------step 3: choose the right table------ cur.execute("SELECT * FROM movie_items") numrows = int(cur.rowcount) print 'numrows:',numrows for i in range(numrows): print row = cur.fetchone() #------step 4:Index your field------ summary = row[SUMMARY] subject_id = row[SUBJECT_ID] print 'id'+subject_id year = utils.formatYear(row[YEAR]) try: date = DateTools.stringToDate(year.replace('-',' ')) wtfFile = open('wtf.txt','a') dateStr = DateTools.dateToString(date,DateTools.Resolution.DAY) except: wtfFile.write(year+'\n') doc = Document() #boosting boostProb = utils.calcBoostProb(row,maxDict,dateStr) boost = base + boostProb*(upper-base) doc.add(FloatField("boost",boost,Field.Store.YES)) doc.add(StringField("year",dateStr,Field.Store.YES)) print 'dateStr:'+dateStr #A text field is a sequence of terms that has been tokenized while a string field is a single term (although it can also be multivalued.) 
do_count = row[DO_COUNT] if row[DO_COUNT] != None else 0 wish_count = row[COLLECT_COUNT] if row[WISH_COUNT] != None else 0 #fields which should not be analyzed doc.add(FloatField("rating_average",float(row[RATING_AVERAGE]),Field.Store.YES)) doc.add(FloatField("rating_stars", float(row[RATING_STARS]), Field.Store.YES)) doc.add(IntField("reviews_count", int(row[REVIEWS_COUNT]), Field.Store.YES)) #doc.add(FloatField("year", float(row[YEAR]), Field.Store.YES).setBoost(boost)) doc.add(IntField("collect_count", int(row[COLLECT_COUNT]), Field.Store.YES)) doc.add(IntField("do_count", int(do_count), Field.Store.YES)) doc.add(IntField("wish_count", int(wish_count), Field.Store.YES)) doc.add(IntField("subject_id", int(row[SUBJECT_ID]), Field.Store.YES)) doc.add(IntField("comments_count", int(row[COMMENTS_COUNT]), Field.Store.YES)) doc.add(IntField("ratings_count", int(row[RATINGS_COUNT]), Field.Store.YES)) doc.add(StringField("image_small", row[IMAGE_SMALL], Field.Store.YES)) #fields which should be analyzed with WhitespaceAnalyzer #attention!!! 
dont use a long sentence like : #doc.add(Field("genres", row[GENRES].replace(delim,' '), t3).setBoost(boost)) #or you'll get a null pointer error f = Field("countries", row[COUNTRIES].replace(delim,' '), t3) f.setBoost(boost) doc.add(f) #process casts raw_casts = row[CASTS].replace(delim,' ') f = Field("raw_casts", raw_casts , t1) f.setBoost(boost) doc.add(f) #将英文人名中的 · raw_casts = raw_casts.replace('·',' ') if len(raw_casts.split(' '))<CASTS_LEN: #平局人名长度是4 casts = raw_casts + ' ¥¥¥¥'*(CASTS_LEN-len(raw_casts.split(' '))) f = Field("casts", casts , t3) f.setBoost(boost) doc.add(f) #process directors raw_directors = row[DIRECTORS].replace(delim,' ') f = Field("raw_directors",raw_directors, t1) f.setBoost(boost) doc.add(f) #将英文人名中的 · 替换 raw_directors = raw_directors.replace('·',' ') if len(raw_directors.split(' '))<DIRECTORS_LEN: #平局人名长度是4 directors = raw_directors + ' ¥¥¥¥'*(DIRECTORS_LEN-len(raw_directors.split(' '))) f = Field("directors", directors, t3) f.setBoost(boost) doc.add(f) Field("genres", row[GENRES].replace(delim,' '), t3) f.setBoost(boost) doc.add(f) Field("subtype", row[SUBTYPE].replace(delim,' '), t3) f.setBoost(boost) doc.add(f) #it is wrong cause indexable field has no method setBoost # fieldList = doc.getFields() # is not a python 'list' , but a 'List' which is unindexable # for eachField in fieldList: # eachField.setBoost(boost) #user_tags 原始字符串要存,reRank要用: doc.add(StringField("raw_user_tags",row[USER_TAGS],Field.Store.YES)) doc.add(StringField("raw_others_like",row[OTHERS_LIKE],Field.Store.YES)) user_tags_str = '' others_like_str = '' tags_len = 0 if row[USER_TAGS]!='': user_tags_list = row[USER_TAGS].split(delim) for tag_pair in user_tags_list: if tag_pair!='':#字符串的最后一个字符是¥,这样split之后最后一个元素是空字符 #print 'tag_pair'+tag_pair+'hhe' tag_name = tag_pair.split(delim_uo)[0]+' ' # dont forget this space !! 
tag_num = tag_pair.split(delim_uo)[1] tag_num_processed = int(int(tag_num)/TAG_SPAN)+1 #最小为1 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! user_tags_str = user_tags_str +' '+ tag_name * tag_num_processed tags_len = tags_len + tag_num_processed #最后得到总共词的个数 if tags_len<TAGS_AVER_LEN: #填充tags,目测3是平均长度,所以使用 ¥¥¥ user_tags_str = user_tags_str +' ¥¥¥'*(TAGS_AVER_LEN - tags_len) # if row[OTHERS_LIKE]!='': for like_pair in row[OTHERS_LIKE].split(delim): if like_pair!='': others_like_str = others_like_str +' '+like_pair.split(delim_uo)[1] #start process adjs if row[ADJS] != None: raw_adjs = row[ADJS][:-1] adjs_str = '' adjs_len = 0 if row[ADJS] != '' and row[ADJS] != '\n': #'重要=4.0,特殊=4.0' adjs_str = row[ADJS] adjs_list = adjs_str.split(',') for adj_pair in adjs_list: #print 'adj_pair:'+adj_pair+'hhe' adj_name = adj_pair.split('=')[0] adj_num = adj_pair.split('=')[1] #去换行符,转换int if adj_num[-1] == '\n': adj_num = adj_num[0:-1] adj_num = int(float(adj_num)) add_adj='' # #同义词 # adj_name_bro = searchDictValue(adjMap,adj_name) # if adj_name_bro == -1: #表示没有结果,即未找到近义词,不添加 # add_adj = '' # else: # add_adj = (adj_name_bro+' ')*adj_num # raw_adjs = raw_adjs + ',' + adj_name_bro+'='+str(adj_num) adjs_str = adjs_str + ' ' + (adj_name+' ') * adj_num +add_adj adjs_len = adjs_len + adj_num #最后得到总共tags的个数 #print raw_adjs doc.add(StringField("raw_adjs",raw_adjs,Field.Store.YES)) if adjs_len<ADJS_AVER_LEN: #填充 adjs_str,目测2是平均长度,所以使用 "¥¥" adjs_str = adjs_str +' ¥¥'*(ADJS_AVER_LEN - adjs_len) f = Field("adjs", adjs_str, t3) f.setBoost(boost) doc.add(f) f = Field("user_tags", user_tags_str, t3) f.setBoost(boost) doc.add(f) f = Field("others_like", others_like_str, t3) f.setBoost(boost) doc.add(f) #fields which should be analyzed with good analyzer f = Field("title", row[TITLE], t3) f.setBoost(boost) doc.add(f) f = Field("original_title", row[ORIGINAL_TITLE], t3) f.setBoost(boost) doc.add(f) f = Field("summary_segmentation", row[SUMMARY_SEGMENTATION], t3) 
f.setBoost(boost) doc.add(f) f = Field("aka", row[AKA], t2) f.setBoost(boost) doc.add(f) if len(summary) > 0: print subject_id +'--->'+':\n '+ row[TITLE] try: summary_unicoded = unicode(summary, 'utf-8') #test the encoding except Exception,e: print "Decode Failed: ", e f = Field('summary', summary, t2) f.setBoost(boost) doc.add(f) else: print "warning:\n" + subject_id +'---> No content!' print 'boosting:' + str(boost) #for debug if boost>upper: print boostProb print maxDict exit(0) writer.addDocument(doc)
def indexDocs(root, writer): """ indexed: name title content stored: date name tilte sumary :param root: :param writer: :return: """ #index and store t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) #only index, but not store t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) #only store t3 = FieldType() t3.setIndexed(False) t3.setStored(True) t3.setTokenized(False) t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) for root, dirnames, filenames in os.walk(root): print filenames for filename in filenames: if not filename.endswith('.md'): continue print "adding", filename try: path = os.path.join(root, filename) file = open(path) contents = unicode(file.read(), 'utf-8') file.close() date, name = get_date_name(filename) title, content = get_post_title_content(contents) summary = content[:200] if content else '' print date, name, title doc = Document() doc.add(Field('date', date, t3)) doc.add(Field('name', name, t1)) doc.add(Field('title', title, t1)) doc.add(Field('content', content, t2)) doc.add(Field('summary', summary, t3)) # doc.add(Field("name", filename, t1)) # doc.add(Field("path", root, t1)) # if len(contents) > 0: # doc.add(Field("contents", contents, t2)) # else: # print "warning: no content in %s" % filename writer.addDocument(doc) except Exception, e: print "Failed in indexDocs:", e