コード例 #1
0
ファイル: indexfiles.py プロジェクト: mefagan/artsearch
	def indexDocs(self, root, writer):
		for root, dirnames, filenames in os.walk(root):
			for filename in filenames:
				if filename.endswith('.DS_Store'):
					continue
				print("adding" + filename)
				try:
					path = os.path.join(root, filename)
					file = open(path)
					contents = unicode(file.read(), 'iso-8859-1')
					#contents = file.read()
					print(contents)
					file.close()
					doc = lucene.Document()
					doc.add(lucene.Field("name", filename,
						lucene.Field.Store.YES,
						lucene.Field.Index.NOT_ANALYZED))
					doc.add(lucene.Field("path", path,
						lucene.Field.Store.YES,
						lucene.Field.Index.NOT_ANALYZED))
					if len(contents) > 0:
						doc.add(lucene.Field("contents", contents,
							lucene.Field.Store.NO,
							lucene.Field.Index.ANALYZED))
					else:
						print("no content")
						print("warning: no content in %s" % filename)
					writer.addDocument(doc)
				except Exception, e:
					print("Failed in indexDocs:" + e)
コード例 #2
0
ファイル: IndexFiles.py プロジェクト: Riolu/Project_Set
    def indexDocs(self, root, writer):
        for root, dirnames, filenames in os.walk(root): #遍历testfolder下的文件
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    contents = unicode(file.read(), 'gbk') #将文件转为unicode再处理,假设原doc编码为GBK。
                    print contents                                        #文件内容存放在contents中
                    file.close()
                    doc = lucene.Document() #创建一个Document代表我们要索引的文档
                    doc.add(lucene.Field("name", filename,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.NOT_ANALYZED))
                    doc.add(lucene.Field("path", path,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.NOT_ANALYZED))
                    if len(contents) > 0:
                        doc.add(lucene.Field("contents", contents,
                                             lucene.Field.Store.NO,
                                             lucene.Field.Index.ANALYZED))
                        #将不同的Field加入到文档中。一篇文档有多种信息,如题目,作者,修改时间,内容等。
                        #不同类型的信息用不同的Field来表示,在本例子中,一共有三类信息进行了索引,一个是
                        #文件路径,一个是文件名,一个是文件内容。

                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc) #IndexWriter调用函数addDocument将索引写到索引文件夹中
                except Exception, e:
                    print "Failed in indexDocs:", e
コード例 #3
0
ファイル: index.py プロジェクト: nihaofuyue0617/pythia
 def add_document(self, document):
     '''
     Adds a new document in the index.
     '''
     doc = lucene.Document()
     try:
         #All fields are converted to string since Lucene accepts only textual fields (and binary)
         doc.add(
             lucene.Field("id", str(document.id), lucene.Field.Store.YES,
                          lucene.Field.Index.NOT_ANALYZED))
         doc.add(
             lucene.Field("content", ' '.join(document.content['tokens']),
                          lucene.Field.Store.YES,
                          lucene.Field.Index.ANALYZED))
         doc.add(
             lucene.Field("author", document.author_screen_name,
                          lucene.Field.Store.YES,
                          lucene.Field.Index.NOT_ANALYZED))
         formatted_date = lucene.SimpleDateFormat("yyyyMMddHHmmss").parse(
             str(document.date))
         doc.add(
             lucene.Field(
                 "date",
                 lucene.DateTools.dateToString(
                     formatted_date, lucene.DateTools.Resolution.MINUTE),
                 lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
         self.writer.addDocument(doc)
     except Exception, e:
         print "Failed in indexDocs:", e
コード例 #4
0
ファイル: social.lucene.py プロジェクト: shunsunsun/Research
 def indexHKForumPost(self, pid, uid, tid, title, content, floor, time):
     try:
         doc = lucene.Document()
         doc.add(
             lucene.NumericField("pid", 8, lucene.Field.Store.YES,
                                 True).setLongValue(long(pid)))
         doc.add(
             lucene.NumericField("uid", 8, lucene.Field.Store.YES,
                                 True).setLongValue(long(uid)))
         doc.add(
             lucene.NumericField("tid", 8, lucene.Field.Store.YES,
                                 True).setLongValue(long(tid)))
         doc.add(
             lucene.Field("title", title, lucene.Field.Store.NO,
                          lucene.Field.Index.ANALYZED))
         doc.add(
             lucene.Field("content", content, lucene.Field.Store.NO,
                          lucene.Field.Index.ANALYZED))
         doc.add(
             lucene.NumericField("floor", lucene.Field.Store.YES,
                                 True).setIntValue(floor))
         doc.add(
             lucene.NumericField("time", lucene.Field.Store.YES,
                                 True).setIntValue(time))
         self.writer.addDocument(doc)
     except Exception, e:
         print "Failed in indexWeibos:", e
コード例 #5
0
 def testAdd(self, filepath):
     writer = lucene.IndexWriter(self.dir, self.getAnalyzer(), False,
                                 lucene.IndexWriter.MaxFieldLength.UNLIMITED)
     #True,建立新索引,False,建立增量索引
     file = open(filepath)
     contents = unicode(file.read(), 'gbk')
     file.close()
     doc = lucene.Document()
     doc.add(lucene.Field("name", os.path.basename(filepath),
                          lucene.Field.Store.YES,
                          lucene.Field.Index.NOT_ANALYZED))
     doc.add(lucene.Field("path", filepath,
                          lucene.Field.Store.YES,
                          lucene.Field.Index.NOT_ANALYZED))
     if len(contents) > 0:
         title = self.getTxtAttribute(contents, 'Title')
         author = self.getTxtAttribute(contents, 'Author')
         language = self.getTxtAttribute(contents, 'Language')
         doc.add(lucene.Field("Title", title,
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED))
         doc.add(lucene.Field("Author", author,
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED))
         doc.add(lucene.Field("Language", language,
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED))
         doc.add(lucene.Field("contents", contents,
                              lucene.Field.Store.NO,
                              lucene.Field.Index.ANALYZED))
     else:
         print "warning: no content in %s" % filename
     writer.addDocument(doc)
     writer.optimize()
     writer.close()
コード例 #6
0
def build_index():
    """Build a Lucene index from the CSV file at *data_dir*.

    Each CSV row is (english, chinese): the English side is analyzed for
    search, the Chinese side stored verbatim. Progress is printed every
    100k rows (including row 0, as before).
    """
    print("开始创建索引")

    indx = 0

    writer = lucene.IndexWriter(directory, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength.UNLIMITED)

    # BUG FIX: the CSV handle was opened but never closed; `with`
    # guarantees it is released.
    with open(data_dir) as f:
        for line in csv.reader(f):
            eng, zh = line[0], line[1]

            doc = lucene.Document()
            doc.add(
                lucene.Field('eng', eng, lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
            doc.add(
                lucene.Field('zh', zh, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))

            writer.addDocument(doc)

            if indx % 100000 == 0:
                print("%sK" % (indx / 1000))

            indx += 1

    print("写引擎优化")
    writer.optimize()
    writer.close()
コード例 #7
0
ファイル: IndexFiles.py プロジェクト: Riolu/Project_Set
    def indexDocs(self, root, writer):
        """Index crawled product pages listed in index.txt.

        Each line of index.txt is "<url> <doc_name>"; the page is read from
        <root>/<doc_name> as GBK and parsed with BeautifulSoup. For the main
        product image (found under the element with id 'p-box') one document
        is added with the image URL, its alt text (analyzed), the page URL
        and the page title.
        """
        t = open('index.txt', "r")
        while True:  
            line = t.readline()  
            if line:
                try:
                    line = line.strip().split()  
                    url = line[0]
                    doc_name = line[1]
                    print "adding", url
                    path = os.path.join(root, doc_name)
                    f = open(path)
                    tmp = f.read() #str
                    f.close()
                    contents = tmp.decode('gbk', 'ignore')
                    soup = BeautifulSoup(contents)
                    # Fall back to "None" when the page has no usable <title>.
                    try:
                        title = soup.title.text
                        tmp = str(title).replace('\n','')
                        title = tmp.decode('utf-8')
                    except:
                        title = "None"
                    print title

                    collection = [] # holds {imgurl, discription} pairs
                    dic = {}
                    p_box = soup.find(id='p-box') # block with the big top-left picture and the small ones below it
                    #print p_box.get('id','')
                    # NOTE(review): this fixed nextSibling chain encodes the
                    # exact page markup — it breaks silently (caught below)
                    # if the site layout changes.
                    sub_p_box = p_box.div.nextSibling.nextSibling.nextSibling.nextSibling
                    #print sub_p_box.get('class','')
                    #print sub_p_box
                    big_pic = sub_p_box.div.div.div.img
                    dic['imgurl'] = urlparse.urljoin(url, big_pic.get('src',''))
                    dic['discription'] = big_pic.get('alt','')
                    #print dic
                    collection.append(dic) # guard against duplicates (list only ever holds one entry here)

                    doc = lucene.Document()
                    for i in collection:
                        imgurl = i['imgurl']
                        discription = i['discription']
                        # segment the Chinese alt text with jieba so the
                        # analyzer can tokenize on spaces
                        discription = " ".join(jieba.cut(discription))
                        doc.add(lucene.Field("imgurl", imgurl,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.NOT_ANALYZED))
                        doc.add(lucene.Field("discription", discription,
                                             lucene.Field.Store.NO,
                                             lucene.Field.Index.ANALYZED))
                        doc.add(lucene.Field("url", url,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.NOT_ANALYZED))
                        doc.add(lucene.Field("urltitle", title,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.NOT_ANALYZED))
                    writer.addDocument(doc) # IndexWriter.addDocument writes the document into the index folder
                    print "----------------------------------------------------"
                except Exception, e:
                    print "Failed in indexDocs:", e
            else:
                break
コード例 #8
0
    def indexDocs(self, root, writer):
        f = open('q_index.txt', 'r')
        for line in f:
            qst_num = line[:-2]
            #        for root, dirnames, filenames in os.walk(root):
            #            for filename in filenames:
            #                if not filename.endswith('.txt'):
            #                    continue
            print "adding question", qst_num
            try:
                path = os.path.join(root, 'Question_' + qst_num, 'q.txt')
                file = open(path)
                contents_read = unicode(file.read(), 'gbk')
                file.close()

                contents = contents_read.split('\r\n|||\r\n')
                qst_name = contents[0]
                qst_detail = contents[1]
                qst_topic_blur = contents[2]
                qst_topic_accu = contents[3]
                qst_browse = contents[4]
                qst_follow = contents[5]
                qst_ans = contents[6]

                qestion = lucene.Document()
                qestion.add(
                    lucene.Field("qst_name", qst_name, lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
                qestion.add(
                    lucene.Field("qst_detail", qst_detail,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
                qestion.add(
                    lucene.Field("qst_topic_blur", qst_topic_blur,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
                qestion.add(
                    lucene.Field("qst_topic_accu", qst_topic_accu,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
                qestion.add(
                    lucene.Field("qst_browse", qst_browse,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
                qestion.add(
                    lucene.Field("qst_follow", qst_follow,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
                qestion.add(
                    lucene.Field("qst_ans", qst_ans, lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
                qestion.add(
                    lucene.Field("qst_num", qst_num, lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
                writer.addDocument(qestion)
            except Exception, e:
                print "Failed in indexDocs:", e
コード例 #9
0
ファイル: index_ans.py プロジェクト: elicassion/Zhi-Searcher
    def indexDocs(self, root, writer):
        f = open('a_index.txt', 'r')
        for line in f:
            line_num = line[:-2]
            qst_num = line_num.split('|||')[0]
            ans_num = line_num.split('|||')[1]
            print "adding answer", ans_num
            try:
                path = os.path.join(root, 'Question_' + qst_num, 'q.txt')
                file = open(path)
                contents_read = unicode(file.read(), 'gbk')
                file.close()
                contents = contents_read.split('\r\n|||\r\n')
                qst_name = contents[0]

                path = os.path.join(root, 'Question_' + qst_num, 'Answer',
                                    ans_num + '.txt')
                file = open(path)
                contents_read = unicode(file.read(), 'gbk')
                file.close()
                contents = contents_read.split('\r\n|||\r\n')

                ans_contents = contents[0]
                if (ans_contents != 'None'):
                    ans_author = contents[1]
                    ans_like = contents[2]

                    answer = lucene.Document()
                    answer.add(
                        lucene.Field("qst_name", qst_name,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.ANALYZED))
                    answer.add(
                        lucene.Field("ans_contents", ans_contents,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.ANALYZED))
                    answer.add(
                        lucene.Field("ans_author", ans_author,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                    answer.add(
                        lucene.Field("ans_like", ans_like,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                    answer.add(
                        lucene.Field("qst_num", qst_num,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                    answer.add(
                        lucene.Field("ans_num", ans_num,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                    writer.addDocument(answer)
                else:
                    print "there is no contents in answer", ans_num
            except Exception, e:
                print "Failed in indexDocs:", e
コード例 #10
0
def main1():
    print "started indexing sample files......"
    direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    config = lucene.IndexWriterConfig(lucene.Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(lucene.IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = lucene.IndexWriter(direc, config)

    #fix this later.....FieldType not defined
    #field_type=lucene.FieldType()
    #field_type.setIndexed(True)
    #field_type.setStored(False)
    #field_type.setTokenized(False)

    file1 = open("nitin.json")
    data = file1.read()
    contents = json.loads(data)
    doc = lucene.Document()
    field = lucene.Field("name", contents['name'], lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    field = lucene.Field("data", data, lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
    file1.close()

    file1 = open("nitin2.json")
    data = file1.read()
    contents = json.loads(data)
    doc = lucene.Document()
    field = lucene.Field("name", contents['name'], lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    field = lucene.Field("data", data, lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
    file1.close()

    writer.optimize()
    print "Indexed and optimized %d documents" % writer.numDocs()
    writer.close()
コード例 #11
0
    def indexDocs(self, root2, writer):
        """Index product files found one category-directory level under
        *root2*.

        Each file holds lines of the form "<kind>:<content>" where kind is
        one of http / title / imgurl / price; the category directory name
        is indexed as the "kind" field.

        NOTE(review): url/title/imgurl/price are plain locals — if a file
        is missing one of the kinds, the value left over from the previous
        file is silently reused (or, for the first file, a NameError is
        raised and swallowed by the except below). Confirm every input
        file carries all four keys.
        """
        root2 = unicode(root2, "utf8")
        for r, d, f in os.walk(root2):
            for dir in d:
                leibie = dir  # category name taken from the directory
                # NOTE(review): path joined with a literal '\\' — this is
                # Windows-only; confirm, or it should be os.path.join.
                root3 = root2 + '\\' + dir
                for root, dirs, files in os.walk(root3):
                    for filename in files:
                        # skip pathologically long filenames
                        if len(filename) > 180: continue
                        path = os.path.join(root, filename)
                        f = open(path, 'r')
                        for lines in f:
                            lines = unicode(lines, 'utf-8')
                            # split "kind:content" on the first colon
                            start = lines.find(':')
                            kind = lines[0:start]
                            content = lines[start + 1::]
                            if kind == 'http':
                                url = 'http:' + content
                            elif kind == 'title':
                                title = content
                            elif kind == 'imgurl':
                                imgurl = content
                            elif kind == 'price':
                                # drops the leading character (currency
                                # symbol, presumably — confirm)
                                price = content[1::]
                            #print lines[0:start]
                            #print lines
                        #print url, title, imgurl, price
                        f.close()

                        try:
                            doc = lucene.Document()
                            doc.add(
                                lucene.Field("url", url,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.NOT_ANALYZED))
                            doc.add(
                                lucene.Field("title", title,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.ANALYZED))
                            doc.add(
                                lucene.Field("imgurl", imgurl,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.NOT_ANALYZED))
                            doc.add(
                                lucene.Field("price", price,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.NOT_ANALYZED))
                            doc.add(
                                lucene.Field("kind", leibie,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.ANALYZED))

                            writer.addDocument(doc)
                        except Exception, e:
                            print "Failed in indexDocs:", e
コード例 #12
0
 def indexDocs(self, root, writer):
     """Index crawled HTML pages listed in index.txt.

     Each line of index.txt is "<url> <doc_name>". The page is read from
     <root>/<doc_name>, its encoding sniffed with chardet (utf-8
     fallback), parsed with BeautifulSoup, and indexed with the page
     title plus the jieba-segmented body text.
     """
     t = open('index.txt', "r")
     while True:  
         line = t.readline()  
         if line:
             try:
                 line = line.strip().split()  
                 url = line[0]
                 doc_name = line[1]
                 print "adding", url
                 path = os.path.join(root, doc_name)
                 f = open(path)
                 tmp = f.read() #str
                 f.close()
                 # chardet may return None for undecidable input; fall
                 # back to utf-8 in that case.
                 charset = (chardet.detect(tmp))['encoding']
                 if charset==None:
                     charset = 'utf-8'
                 #print charset
                 #contents = unicode(tmp, charset)
                 contents = tmp.decode(charset, 'ignore')
                 soup = BeautifulSoup(contents)
                 # Fall back to "None" when the page has no usable <title>.
                 try:
                     title = soup.title.text
                     tmp = str(title).replace('\n','')
                     title = tmp.decode('utf-8')
                 except:
                     title = "None"
                 print title
                 # Strip markup, then segment the Chinese text with jieba
                 # so the analyzer can tokenize on spaces.
                 contents = soup.get_text()
                 contents = " ".join(jieba.cut(contents))
                 #print contents
                 doc = lucene.Document()
                 doc.add(lucene.Field("name", doc_name,
                                      lucene.Field.Store.YES,
                                      lucene.Field.Index.NOT_ANALYZED))
                 doc.add(lucene.Field("path", path,
                                      lucene.Field.Store.YES,
                                      lucene.Field.Index.NOT_ANALYZED))
                 doc.add(lucene.Field("url", url,
                                      lucene.Field.Store.YES,
                                      lucene.Field.Index.NOT_ANALYZED))
                 doc.add(lucene.Field("title", title,
                                      lucene.Field.Store.YES,
                                      lucene.Field.Index.NOT_ANALYZED))
                 if len(contents) > 0:
                     doc.add(lucene.Field("contents", contents,
                                          lucene.Field.Store.NO,
                                          lucene.Field.Index.ANALYZED))
                 else:
                     print "warning: no content in %s" % doc_name
                 writer.addDocument(doc) # IndexWriter.addDocument writes the document into the index folder
             except Exception, e:
                 print "Failed in indexDocs:", e    
         else:
             break
コード例 #13
0
    def UpdateIndex(self):
        # 인덱스를 최신 내용으로 갱신
        writer = lucene.IndexWriter(self.indexDir, self.analyzer, True,
                                    lucene.IndexWriter.MaxFieldLength(512))

        try:
            # DB에서 내용 가져오기
            for row in self.rows:
                doc = lucene.Document()

                doc.add(
                    lucene.Field("bookUrl", row[0], lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
                doc.add(
                    lucene.Field("thumbUrl", row[1], lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
                doc.add(
                    lucene.Field("price", row[2], lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
                doc.add(
                    lucene.Field("title", row[3], lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
                doc.add(
                    lucene.Field("subTitle", row[4], lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
                doc.add(
                    lucene.Field("author", row[5], lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
                doc.add(
                    lucene.Field("publisher", row[6], lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
                doc.add(
                    lucene.Field("publishDate", row[7], lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
                doc.add(
                    lucene.Field("offcode", row[8], lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
                date = str(row[9]).split('-')
                date = ''.join(date)
                print 'regDate : ' + date + ' ' + str(type(date))
                doc.add(
                    lucene.Field("regDate", date, lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))

                date = str(row[10]).split('-')
                date = ''.join(date)
                print 'updateDate : ' + date
                doc.add(
                    lucene.Field("updateDate", date, lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))

                writer.addDocument(doc)
        except Exception, e:
            print "Failed in adding index : %s" % e
            exit(1)
コード例 #14
0
    def indexWeibo(self, tid, text, user_id, created_at):
	try:
	    doc = lucene.Document()
	    doc.add(lucene.NumericField("id", 8, lucene.Field.Store.YES, True).setLongValue(long(tid)))
	    doc.add(lucene.Field("text", text,
				 lucene.Field.Store.NO,
				 lucene.Field.Index.ANALYZED))
	    doc.add(lucene.NumericField("user_id", lucene.Field.Store.YES, True).setIntValue(int(user_id)))
	    doc.add(lucene.NumericField("created_at", lucene.Field.Store.YES, True).setIntValue(created_at))
	    self.writer.addDocument(doc)
	except Exception, e:
	    print "Failed in indexWeibos:", e
コード例 #15
0
def build_index(data, num_doc):
    """Add one document per (word, page_num) entry of *data* to the
    module-level writer and return the updated document count."""
    for word, page_num in data.items():
        entry = lucene.Document()
        entry.add(lucene.Field('word', word, lucene.Field.Store.YES,
                               lucene.Field.Index.ANALYZED))
        entry.add(lucene.Field('page_num', page_num, lucene.Field.Store.YES,
                               lucene.Field.Index.NOT_ANALYZED))
        writer.addDocument(entry)
    # one document was added per entry
    return num_doc + len(data)
コード例 #16
0
 def indexDocs(self, root, writer, startDate, endDate):
     doc_num = 0
     docindex_num = 0
     for root, dirnames, filenames in os.walk(root):
         for filename in filenames:
             doc_num += 1
             if doc_num % 1000 == 0:
                 print "Index Searched " + str(doc_num) + " files..."
             if not filename.endswith('.txt'):
                 continue
             filedate = datetime.strptime(filename[:8], '%Y%m%d')
             if not (filedate >= startDate and filedate <= endDate):
                 continue
             #print "adding", filename
             docindex_num += 1
             try:
                 path = os.path.join(root, filename)
                 file = open(path)
                 contents = unicode(file.read(), 'utf8')
                 file.close()
                 #替换逗号,句号为空格,并以空格为分割符切割句子
                 #print "contents:" + conteits.encode('utf8')
                 contents = contents.replace('\n', '')
                 contents = contents.replace(unicode("。", 'utf8'), '###')
                 sentence_num = 0
                 for sentence in contents.split('###'):
                     #print "sentence:" + sentence.encode("gbk",'ignore')
                     #time.sleep(1)
                     doc = lucene.Document()
                     doc.add(
                         lucene.Field("name", filename,
                                      lucene.Field.Store.YES,
                                      lucene.Field.Index.NOT_ANALYZED))
                     doc.add(
                         lucene.Field("path", path, lucene.Field.Store.YES,
                                      lucene.Field.Index.NOT_ANALYZED))
                     doc.add(
                         lucene.Field("sentence_num", str(sentence_num),
                                      lucene.Field.Store.YES,
                                      lucene.Field.Index.NOT_ANALYZED))
                     if len(sentence) > 0:
                         doc.add(
                             lucene.Field("sentence", sentence,
                                          lucene.Field.Store.YES,
                                          lucene.Field.Index.ANALYZED))
                     else:
                         print "warning: no content in sentence %d of file %s" % sentence_num, filename
                     writer.addDocument(doc)
                     sentence_num += 1
             except Exception, e:
                 #print "Failed in indexDocs:", e
                 error = 1
コード例 #17
0
 def store_terms(self, session, index, terms, rec):
     """Accumulate *terms* into the document for record *rec*.

     When the record changes, the previous document (if any) is flushed to
     the writer and a fresh one is started; otherwise the token stream is
     appended to the current document.

     NOTE(review): self.currRec is read but never updated here — confirm
     the caller maintains it.
     """
     strm = C3TokenStream(terms)
     if rec != self.currRec:
         if self.currDoc:
             # flush the finished document for the previous record
             self.writer.addDocument(self.currDoc)
         doc = lucene.Document()
         self.currDoc = doc
         doc.add(lucene.Field(index.id, strm))
         doc.add(
             lucene.Field('id', str(rec), lucene.Field.Store.YES,
                          lucene.Field.Index.UN_TOKENIZED))
     else:
         # BUG FIX: `doc` was undefined on this branch (it is only bound
         # above), raising NameError; append to the current document.
         self.currDoc.add(lucene.Field(index.id, strm))
コード例 #18
0
def index_files(board, time_delta):
    """Index the recent posts of *board* (within *time_delta*) into the
    board's RECENT_INDEX directory.

    Posts whose title/owner/body cannot be decoded as GBK are skipped.
    """
    store = lucene.SimpleFSDirectory(
        lucene.File(BOARDSPATH + board + '/' + RECENT_INDEX))
    writer = lucene.IndexWriter(
        store, lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT), True,
        lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    #  writer.setMaxFieldLength(1048576) # 1MB

    for name, owner, title in get_all_files(board, time_delta):
        path = BOARDSPATH + board + '/' + name
        if not os.path.exists(path):
            continue

        fp = open(path, 'r')
        body = filter_file(fp)
        debug(body)
        try:
            title = title.decode('gbk')
            owner = owner.decode('gbk')
            body = unicode(body, 'gbk')
        except UnicodeDecodeError:
            # undecodable post: close, log, and move on
            fp.close()
            debug(name)
            continue
        fp.close()

        if not body:
            continue
        doc = lucene.Document()
        doc.add(lucene.Field("name", name, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
        doc.add(lucene.Field("owner", owner, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
        doc.add(lucene.Field("title", title, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
        doc.add(lucene.Field("contents", body, lucene.Field.Store.NO,
                             lucene.Field.Index.ANALYZED))
        writer.addDocument(doc)
        debug('adding ' + name)
    writer.optimize()
    writer.close()
コード例 #19
0
 def indexDocs(self, root, writer):
     for root, dirnames, filenames in os.walk(root):
         for filename in filenames:
             if not filename.endswith('.txt'):
                 continue
             print "adding", filename
             try:
                 path = os.path.join(root, filename)
                 file = open(path)
                 contents = file.read().decode('utf8', 'ignore')
                 file.close()
                 doc = lucene.Document()
                 doc.add(
                     lucene.Field("name", filename, lucene.Field.Store.YES,
                                  lucene.Field.Index.NOT_ANALYZED))
                 doc.add(
                     lucene.Field("path", path, lucene.Field.Store.YES,
                                  lucene.Field.Index.NOT_ANALYZED))
                 if len(contents) > 0:
                     title = self.getTxtAttribute(contents, 'Title')
                     author = self.getTxtAttribute(contents, 'Author')
                     language = self.getTxtAttribute(contents, 'Language')
                     doc.add(
                         lucene.Field("title", title,
                                      lucene.Field.Store.YES,
                                      lucene.Field.Index.ANALYZED))
                     doc.add(
                         lucene.Field("author", author,
                                      lucene.Field.Store.YES,
                                      lucene.Field.Index.ANALYZED))
                     doc.add(
                         lucene.Field("language", language,
                                      lucene.Field.Store.YES,
                                      lucene.Field.Index.ANALYZED))
                     doc.add(
                         lucene.Field("contents", contents,
                                      lucene.Field.Store.NO,
                                      lucene.Field.Index.ANALYZED))
                 else:
                     print "warning: no content in %s" % filename
                 writer.addDocument(doc)
             except Exception, e:
                 print "Failed in indexDocs:", e
コード例 #20
0
def get_word_list(text, is_list=False, field_name='fieldname'):
    """Index *text* with the Korean analyzer and return its distinct terms.

    Args:
        text: a string, or (when ``is_list`` is True) a list of strings that
            are joined with trailing newlines before indexing.
        is_list: whether ``text`` is a list of strings.
        field_name: name of the in-memory Lucene field the text is indexed
            under; only terms of this field are returned.

    Returns:
        A list of ``{'text': term, 'freq': doc_frequency}`` dicts, one per
        distinct term produced by the analyzer.
    """
    if is_list:
        # join() is O(n); the previous `new_text += i + "\n"` loop was
        # quadratic. Each element keeps its trailing newline, matching the
        # original concatenation exactly.
        text = "".join(item + "\n" for item in text)

    lucene.initVM(lucene.CLASSPATH)
    analyzer = lucene.KoreanAnalyzer()

    # Build a throwaway index in RAM; nothing is persisted to disk.
    directory = lucene.RAMDirectory()
    writer = lucene.IndexWriter(directory, analyzer)
    doc = lucene.Document()
    doc.add(lucene.Field(field_name, text,
                         lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    # Enumerate every term of the field and record its document frequency.
    ireader = lucene.IndexReader.open(directory, False)
    termenum = ireader.terms(lucene.Term(field_name, ''))
    term = termenum.term()

    word_list = []
    while term and term.field() == field_name:
        termDocs = ireader.termDocs(term)
        termDocs.next()
        word_list.append({'text': term.text(), 'freq': termDocs.freq()})
        # `next() and term()`: advance, then fetch; yields None at the end.
        term = termenum.next() and termenum.term()

    ireader.close()
    directory.close()

    return word_list
コード例 #21
0
def IndexCreate(fileDir, indexDir):
    """Index every .txt file under *fileDir* into a Lucene index at *indexDir*.

    Args:
        fileDir: root directory walked recursively for ``.txt`` files.
        indexDir: filesystem path where the Lucene index is stored.
    """
    analyzer = lucene.StandardAnalyzer()            # tokenizer for the index
    store = lucene.FSDirectory.getDirectory(indexDir)
    writer = lucene.IndexWriter(store, analyzer)

    for root, dirnames, filenames in os.walk(fileDir):
        for filename in filenames:
            # Only plain-text files are indexed.
            if not filename.endswith('.txt'):
                continue

            print("Adding: %s" % filename)
            try:
                path = os.path.join(root, filename)
                # 'with' closes the handle even if decoding/indexing fails;
                # the old open()/read()/close() leaked on exception.
                with open(path) as f:
                    content = f.read()

                # Re-encode the content to UTF-8 before indexing.
                # NOTE(review): assumes source files are CP949 (Korean
                # Windows encoding) — confirm against the actual data.
                content = content.decode('cp949').encode('utf-8')

                doc = lucene.Document()
                doc.add(lucene.Field("name",            # file name: stored, not indexed
                                     filename,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NO))
                doc.add(lucene.Field("path",            # file path: stored, not indexed
                                     path,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NO))
                if len(content) > 0:
                    doc.add(lucene.Field("content",     # body: indexed only, not stored
                                         content,
                                         lucene.Field.Store.NO,
                                         lucene.Field.Index.TOKENIZED))
                else:
                    print("Warning: No contents in %s" % filename)
                writer.addDocument(doc)
            # `except Exception as e` replaces the Python-2-only comma form.
            except Exception as e:
                print("Failed in adding index: %s" % e)

    # The writer was previously never closed, so additions were not
    # guaranteed to be committed; close it to flush and release the lock.
    writer.close()
コード例 #22
0
	def UpdateIndex(self):
		"""Refresh the index with blog entries newer than the last run.

		Walks ``self.blogDir`` for ``.txt`` files and indexes only those
		whose mtime is newer than the recorded last indexing time.
		"""
		# Time of the last indexing run (None means never indexed before).
		self.lastIndexingTime = self.__ReadLatestUpdateTime()
		writer = lucene.IndexWriter(self.store, self.analyzer, lucene.IndexWriter.MaxFieldLength(1048576))

		for root, dirnames, filenames in os.walk(self.blogDir):
			for filename in filenames:
				if not filename.endswith('.txt'):	# only .txt files are indexed
					continue

				path = os.path.join(root, filename)
				if (self.lastIndexingTime is not None and self.lastIndexingTime >= int(os.stat(path).st_mtime)):
					continue		# file is already in the index

				print("Adding: %s" % filename)
				try:
					# 'with' closes the handle even if indexing fails below;
					# the old open()/read()/close() leaked on exception.
					with open(path) as f:
						content = f.read()

					doc = lucene.Document()
					doc.add(lucene.Field(	"bloger",
											path.rsplit("\\", 2)[1],		# directory holding the file names the blogger
											lucene.Field.Store.YES,
											lucene.Field.Index.UN_TOKENIZED))
					doc.add(lucene.Field(	"path",
											path,
											lucene.Field.Store.YES,
											lucene.Field.Index.UN_TOKENIZED))
					doc.add(lucene.Field(	"contents",
											content,
											lucene.Field.Store.NO,
											lucene.Field.Index.TOKENIZED))
					writer.addDocument(doc)
				# `except Exception as e` replaces the Python-2-only comma form.
				except Exception as e:
					print("Failed in adding index: %s" % e)

		# The writer was previously never closed, so additions were not
		# guaranteed to be committed; close it to flush and release the lock.
		writer.close()
コード例 #23
0
ファイル: index.py プロジェクト: yangtina/reviewboard
    def index_review_request(self, writer, request):
        """Add one review request to the search index.

        Several request fields are indexed individually; a combined text
        blob is also indexed as the default field so a plain query string
        still finds results.
        """
        # Map the tokenization flags onto whichever Lucene generation is
        # loaded (constants were renamed between 2.x and 3.x).
        if lucene_is_2x:
            lucene_tokenized = lucene.Field.Index.TOKENIZED
            lucene_un_tokenized = lucene.Field.Index.UN_TOKENIZED
        elif lucene_is_3x:
            lucene_tokenized = lucene.Field.Index.ANALYZED
            lucene_un_tokenized = lucene.Field.Index.NOT_ANALYZED
        else:
            assert False

        # Remove commas, since lucene won't tokenize it right with them
        bugs = ' '.join(request.bugs_closed.split(','))
        name = ' '.join([request.submitter.username,
                         request.submitter.get_full_name()])

        # FIXME: index reviews
        # FIXME: index dates

        files = []
        if request.diffset_history:
            for diffset in request.diffset_history.diffsets.all():
                for filediff in diffset.files.all():
                    if filediff.source_file:
                        files.append(filediff.source_file)
                    if filediff.dest_file:
                        files.append(filediff.dest_file)
        aggregate_files = '\n'.join(set(files))
        # FIXME: this tokenization doesn't let people search for files
        # in a really natural way.  It'll split on '/' which handles the
        # majority case, but it'd be nice to be able to drill down
        # (main.cc, vmuiLinux/main.cc, and player/linux/main.cc)

        text = '\n'.join([request.summary,
                          request.description,
                          unicode(request.changenum),
                          request.testing_done,
                          bugs,
                          name,
                          aggregate_files])

        doc = lucene.Document()
        # 'id' is the only stored field; everything else is index-only.
        doc.add(lucene.Field('id', str(request.id),
                             lucene.Field.Store.YES,
                             lucene.Field.Index.NO))

        # (field name, value, index mode) triples, added in this order.
        unstored_fields = [('summary', request.summary, lucene_tokenized)]
        if request.changenum:
            unstored_fields.append(('changenum', unicode(request.changenum),
                                    lucene_tokenized))
        unstored_fields.extend([
            ('bug', bugs, lucene_tokenized),
            ('author', name, lucene_tokenized),
            ('username', request.submitter.username, lucene_un_tokenized),
            ('file', aggregate_files, lucene_tokenized),
            ('text', text, lucene_tokenized),
        ])
        for field_name, value, index_mode in unstored_fields:
            doc.add(lucene.Field(field_name, value,
                                 lucene.Field.Store.NO, index_mode))

        writer.addDocument(doc)
コード例 #24
0
def index_ontology_files(oboFile, outDir, xref_map):
    """
    Iterates over our list of ontology files and creates an index for each file.

    Args:
        oboFile: path to the .obo ontology file to index.
        outDir: directory under which a per-file index directory is created.
        xref_map: mapping used to normalize xref identifiers.

    Raises:
        ExistingIndexDirectoryException: if an index directory for this
            file name already exists under *outDir*.
    """
    lucene.initVM()
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Handle a little bit of lucene setup
    filename, _ext = os.path.splitext(os.path.basename(oboFile))

    indexDir = os.path.join(outDir, filename)
    if os.path.exists(indexDir):
        raise ExistingIndexDirectoryException(
            'Error, attempted to index same file twice or index two files named the same'
        )

    # Renamed from `dir`, which shadowed the builtin of the same name.
    index_store = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = lucene.IndexWriter(index_store, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    for term in oboparser.parse(oboFile, ['is_a']):
        if term.obsolete:
            continue

        doc = lucene.Document()
        add_field_to_document(doc, "term id", term.id, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED)
        add_field_to_document(doc, "name", term.name, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 4.0)

        # Frequently in the definition text we will run into URLs or some sort of hyperlinks that could
        # query hits that we would not want to occur thus errantly increasing the score of the field.
        # We will strip out these hyperlinks and index just the text.
        add_field_to_document(doc, "definition",
                              strip_urls_from_text(term.definition),
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 0.4)

        # Synonyms, relationships, xrefs, subsets, and alternate ID's are all represented as lists
        # in our Ontology object and need to be entered in one at a time
        add_fields_to_document(doc, "synonym",
                               [x[0] for x in term.synonyms if x],
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED, 0.7)

        add_fields_to_document(doc, "alt_id", term.alternateIds,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "xref",
            [replace_xref_identifier(x, xref_map) for x in term.xrefs],
            lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "relationship",
            [" ".join(list(x)) for x in list(term.relationships)],
            lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)
        add_fields_to_document(doc, "subset", term.subsets,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
コード例 #25
0
    def addMessage(self, username, xprotocol, xfriend_chat, who_sent,
                   timestamp, text):
        """Append one message to the user's data file and index it in Lucene.

        Args:
            username: owner of the log and index.
            xprotocol: IM protocol name; lowercased before use.
            xfriend_chat: buddy/chat name; lowercased before use.
            who_sent: sender identifier for the message.
            timestamp: message timestamp.
            text: message body; markup tags are stripped for the indexed
                field, but the raw text is written to the data file.
        """
        # For some unknown reason, PyLucene (and probably Lucene as well)
        # seems to have problems searching for things like SoAndSo but
        # has no problems searching for soandso. To prevent headaches in
        # the future we simply set it all to lowercase since the case
        # does not matter for these fields.
        protocol = xprotocol.lower()
        friend_chat = xfriend_chat.lower()

        # Determine index and data paths.
        index_dir = self.indexdir + username
        data_dir = self.datadir + username + PATH_SEP + protocol + PATH_SEP
        data_file = data_dir + friend_chat

        # If the index doesn't exist, a special constructor creates it.
        # Opening the index before writing to the file gives us a lock
        # on the index. As long as writing to data files occurs only
        # through this function, this is guaranteed to be an atomic
        # operation. Closing the writer releases the lock.
        if not os.path.isdir(index_dir):
            os.makedirs(index_dir)
            luc_index = lucene.FSDirectory.getDirectory(index_dir, True)
            luc_writer = lucene.IndexWriter(luc_index,
                                            lucene.StandardAnalyzer(), True)
        else:
            luc_index = lucene.FSDirectory.getDirectory(index_dir)
            luc_writer = lucene.IndexWriter(luc_index,
                                            lucene.StandardAnalyzer())

        if not os.path.isdir(data_dir):
            os.makedirs(data_dir)
        # The current file size becomes this record's offset in the file.
        if os.path.isfile(data_file):
            filesize = os.path.getsize(data_file)
        else:
            filesize = 0

        # BUG FIX: the handle was previously never closed; 'with' flushes
        # the buffered record and releases the descriptor deterministically.
        with open(data_file, 'a') as datahandle:
            datahandle.write(str(who_sent))
            datahandle.write("\n")
            datahandle.write(str(timestamp))
            datahandle.write("\n")
            datahandle.write(str(len(str(text))))  # what a mess
            datahandle.write("\n")
            datahandle.write(str(text))
            datahandle.write("\n")

        doc = lucene.Document()
        doc.add(self.__makeKeywordField('protocol', str(protocol)))
        doc.add(self.__makeKeywordField('friend_chat', str(friend_chat)))
        clean_timestamp = self.__padTimestamp(timestamp)
        doc.add(self.__makeKeywordField('timestamp', clean_timestamp))
        doc.add(self.__makeKeywordField('who_sent', str(who_sent)))
        doc.add(self.__makeUnIndexedField('file_offset', str(filesize)))
        # Strip markup tags so only the visible text is indexed.
        clean_text = re.sub("<[^>]*>", " ", str(text))
        doc.add(self.__makeUnStoredField('text', clean_text))

        luc_writer.addDocument(doc)
        luc_writer.close()
コード例 #26
0
ファイル: index.py プロジェクト: Andrewpqc/search_engine_app
    # Start the Java VM embedded by PyLucene before any lucene.* call.
    lucene.initVM()

    print("lucene version is:", lucene.VERSION)

    # Get the analyzer
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Get index storage
    # NOTE(review): INDEX_DIR is defined outside this fragment — verify it
    # is set before this code runs.
    store = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))

    # Get index writer (True = create/overwrite the index).
    writer = lucene.IndexWriter(store, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength.LIMITED)

    try:
        # create a document that would we added to the index
        doc = lucene.Document()

        # Add a field to this document
        # NOTE(review): this Field(...) call passes only a name and the two
        # flag constants — the field's value string appears to be missing,
        # so this likely raises at runtime; confirm the intended arguments.
        field = lucene.Field("titlendia", lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED)

        # Add this field to the document
        doc.add(field)

        # Add the document to the index
        writer.addDocument(doc)

    except Exception as e:
        print("Failed in indexDocs:", e)