Beispiel #1
0
def add_field_to_document(document,
                          field_name,
                          value,
                          store,
                          analyzed,
                          boost=None):
    """
    Create a Lucene field and append it to ``document``.

    The field is built as ``lucene.Field(field_name, value, store,
    analyzed)``.

    :param document: object exposing ``add(field)``; receives the new field.
    :param field_name: name handed to ``lucene.Field``.
    :param value: value handed to ``lucene.Field``.
    :param store: store option handed to ``lucene.Field``.
    :param analyzed: index option handed to ``lucene.Field``.
    :param boost: optional boost; applied through ``setBoost`` whenever it
        is not ``None``.
    """
    field = lucene.Field(field_name, value, store, analyzed)

    # Compare against None instead of relying on truthiness so that an
    # explicitly passed boost of 0 is applied rather than silently dropped.
    if boost is not None:
        field.setBoost(boost)

    document.add(field)
Beispiel #2
0
def get_word_list(text, is_list=False, field_name='fieldname'):
    """
    Analyze ``text`` with ``lucene.KoreanAnalyzer`` and list its terms.

    The text is indexed as a single document inside a throw-away
    ``lucene.RAMDirectory``; the terms recorded for ``field_name`` are then
    enumerated from that index.

    :param text: the text to analyze, or an iterable of strings when
        ``is_list`` is true (each item gets a trailing newline and the
        pieces are concatenated).
    :param is_list: whether ``text`` is an iterable of strings.
    :param field_name: name of the Lucene field used for indexing.
    :returns: list of ``{'text': <term text>, 'freq': <termDocs.freq()>}``
        dicts in the order the term enumeration yields them.
    """
    if is_list:
        text = "".join(item + "\n" for item in text)

    lucene.initVM(lucene.CLASSPATH)
    analyzer = lucene.KoreanAnalyzer()

    directory = lucene.RAMDirectory()
    try:
        # Index the text as one document.
        writer = lucene.IndexWriter(directory, analyzer)
        try:
            doc = lucene.Document()
            doc.add(lucene.Field(field_name, text,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
            writer.addDocument(doc)
        finally:
            # Close the writer even if indexing failed.
            writer.close()

        word_list = []
        ireader = lucene.IndexReader.open(directory, False)
        try:
            # Enumerate terms starting from the empty term of field_name
            # and stop as soon as the enumeration leaves that field.
            termenum = ireader.terms(lucene.Term(field_name, ''))
            term = termenum.term()
            while term and term.field() == field_name:
                termDocs = ireader.termDocs(term)
                # Advance to the first posting before reading freq().
                termDocs.next()
                word_list.append({'text': term.text(),
                                  'freq': termDocs.freq()})
                term = termenum.next() and termenum.term()
        finally:
            ireader.close()
    finally:
        directory.close()

    return word_list
Beispiel #3
0
 def indexDocs(self, root, writer):
     """
     Add one Lucene document per ``.txt`` file found under ``root``.

     Every document stores the file's ``name`` and ``path`` un-analyzed.
     Non-empty files additionally get stored, analyzed ``title``,
     ``author`` and ``language`` fields (obtained through
     ``self.getTxtAttribute``) and an analyzed, unstored ``contents``
     field.  A failure on one file is printed and does not stop the walk.

     :param root: directory handed to ``os.walk``.
     :param writer: object whose ``addDocument`` receives each document.
     """
     store = lucene.Field.Store
     index = lucene.Field.Index

     # ``dirpath`` keeps the walk's current directory separate from the
     # ``root`` argument instead of rebinding the parameter.
     for dirpath, dirnames, filenames in os.walk(root):
         for filename in filenames:
             if not filename.endswith('.txt'):
                 continue
             print("adding %s" % filename)
             try:
                 path = os.path.join(dirpath, filename)
                 # ``with`` closes the handle even if read/decode raises.
                 with open(path) as txt_file:
                     contents = txt_file.read().decode('utf8', 'ignore')

                 doc = lucene.Document()
                 doc.add(lucene.Field("name", filename, store.YES,
                                      index.NOT_ANALYZED))
                 doc.add(lucene.Field("path", path, store.YES,
                                      index.NOT_ANALYZED))
                 if contents:
                     for attribute in ('Title', 'Author', 'Language'):
                         doc.add(lucene.Field(
                             attribute.lower(),
                             self.getTxtAttribute(contents, attribute),
                             store.YES, index.ANALYZED))
                     doc.add(lucene.Field("contents", contents, store.NO,
                                          index.ANALYZED))
                 else:
                     print("warning: no content in %s" % filename)
                 writer.addDocument(doc)
             except Exception as e:
                 print("Failed in indexDocs: %s" % e)
0
    def index_review_request(self, writer, request):
        """
        Build a Lucene document for ``request`` and add it to ``writer``.

        The document carries a stored, non-indexed ``id`` plus unstored
        ``summary``, ``changenum`` (only when set), ``bug``, ``author``,
        ``username``, ``file`` and ``text`` fields.  ``text`` aggregates
        the other values so it can be used as the default search field.

        :param writer: object whose ``addDocument`` receives the document.
        :param request: review request supplying the indexed attributes.
        :raises AssertionError: if neither ``lucene_is_2x`` nor
            ``lucene_is_3x`` is true.
        """
        if lucene_is_2x:
            lucene_tokenized = lucene.Field.Index.TOKENIZED
            lucene_un_tokenized = lucene.Field.Index.UN_TOKENIZED
        elif lucene_is_3x:
            lucene_tokenized = lucene.Field.Index.ANALYZED
            lucene_un_tokenized = lucene.Field.Index.NOT_ANALYZED
        else:
            # Raised explicitly: a bare ``assert False`` is stripped under
            # ``python -O`` and would leave the names above undefined.
            raise AssertionError('unsupported Lucene version')

        # There are several fields we want to make available to users.
        # We index them individually, but also create a big hunk of text
        # to use for the default field, so people can just type in a
        # string and get results.
        doc = lucene.Document()

        def add_unstored(field_name, field_value, index_mode):
            # Every field except 'id' is indexed without being stored.
            doc.add(lucene.Field(field_name, field_value,
                                 lucene.Field.Store.NO, index_mode))

        doc.add(lucene.Field('id', str(request.id),
                             lucene.Field.Store.YES,
                             lucene.Field.Index.NO))
        add_unstored('summary', request.summary, lucene_tokenized)

        # Stays empty when there is no change number so that the literal
        # text "None" never ends up in the aggregate 'text' field.
        changenum_text = u''
        if request.changenum:
            changenum_text = unicode(request.changenum)
            add_unstored('changenum', changenum_text, lucene_tokenized)

        # Remove commas, since lucene won't tokenize it right with them
        bugs = ' '.join(request.bugs_closed.split(','))
        add_unstored('bug', bugs, lucene_tokenized)

        name = ' '.join([request.submitter.username,
                         request.submitter.get_full_name()])
        add_unstored('author', name, lucene_tokenized)
        add_unstored('username', request.submitter.username,
                     lucene_un_tokenized)

        # FIXME: index reviews
        # FIXME: index dates

        # Collect each distinct source/destination file name once.
        files = set()
        if request.diffset_history:
            for diffset in request.diffset_history.diffsets.all():
                for filediff in diffset.files.all():
                    if filediff.source_file:
                        files.add(filediff.source_file)
                    if filediff.dest_file:
                        files.add(filediff.dest_file)
        aggregate_files = '\n'.join(files)
        # FIXME: this tokenization doesn't let people search for files
        # in a really natural way.  It'll split on '/' which handles the
        # majority case, but it'd be nice to be able to drill down
        # (main.cc, vmuiLinux/main.cc, and player/linux/main.cc)
        add_unstored('file', aggregate_files, lucene_tokenized)

        text = '\n'.join([request.summary,
                          request.description,
                          changenum_text,
                          request.testing_done,
                          bugs,
                          name,
                          aggregate_files])
        add_unstored('text', text, lucene_tokenized)
        writer.addDocument(doc)
Beispiel #5
0
    def UpdateIndex(self):
        """
        Write one Lucene document per entry of ``self.rows``.

        A new ``lucene.IndexWriter`` is opened on ``self.indexDir`` with
        ``self.analyzer``.  Columns 0-8 of each row are stored under fixed
        field names; columns 9 and 10 are stringified, stripped of ``-``
        and stored as ``regDate`` / ``updateDate``.  On any exception the
        error is printed and the process exits with status 1.  The writer
        is closed in every case.
        """
        # Refresh the index with the latest content.
        writer = lucene.IndexWriter(self.indexDir, self.analyzer, True,
                                    lucene.IndexWriter.MaxFieldLength(512))

        stored = lucene.Field.Store.YES
        not_indexed = lucene.Field.Index.NO
        analyzed = lucene.Field.Index.ANALYZED

        # (field name, index option) for row columns 0..8, in column order.
        columns = (
            ("bookUrl", not_indexed),
            ("thumbUrl", not_indexed),
            ("price", not_indexed),
            ("title", analyzed),
            ("subTitle", not_indexed),
            ("author", analyzed),
            ("publisher", analyzed),
            ("publishDate", not_indexed),
            ("offcode", analyzed),
        )

        try:
            # Fetch the content from the DB rows.
            for row in self.rows:
                doc = lucene.Document()

                for position, (field_name, index_mode) in enumerate(columns):
                    doc.add(lucene.Field(field_name, row[position], stored,
                                         index_mode))

                date = str(row[9]).replace('-', '')
                print('regDate : ' + date + ' ' + str(type(date)))
                doc.add(lucene.Field("regDate", date, stored, analyzed))

                date = str(row[10]).replace('-', '')
                print('updateDate : ' + date)
                doc.add(lucene.Field("updateDate", date, stored, analyzed))

                writer.addDocument(doc)
        except Exception as e:
            print("Failed in adding index : %s" % e)
            exit(1)
        finally:
            # The writer is local to this method, so nothing else can
            # close it; do so even when the failure path exits.
            writer.close()
Beispiel #6
0
 def __makeUnStoredField(self, fieldname, fielddata):
     """Return a tokenized field for ``fielddata`` that is not stored."""
     store_mode = lucene.Field.Store.NO
     index_mode = lucene.Field.Index.TOKENIZED
     return lucene.Field(fieldname, fielddata, store_mode, index_mode)
Beispiel #7
0
 def __makeUnIndexedField(self, fieldname, fielddata):
     """Return a stored field for ``fielddata`` that is not indexed."""
     store_mode = lucene.Field.Store.YES
     index_mode = lucene.Field.Index.NO
     return lucene.Field(fieldname, fielddata, store_mode, index_mode)
Beispiel #8
0
 def __makeKeywordField(self, fieldname, fielddata):
     """Return a stored field for ``fielddata`` indexed un-tokenized."""
     store_mode = lucene.Field.Store.YES
     index_mode = lucene.Field.Index.UN_TOKENIZED
     return lucene.Field(fieldname, fielddata, store_mode, index_mode)
Beispiel #9
0
    lucene.initVM()

    print("lucene version is:", lucene.VERSION)

    # Get the analyzer
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Get index storage
    store = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))

    # Get index writer
    writer = lucene.IndexWriter(store, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength.LIMITED)

    try:
        # Create a document that will be added to the index
        doc = lucene.Document()

        # Add a field to this document.  Field takes
        # (name, value, store, index); the value must not be omitted.
        field = lucene.Field("title", "India", lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED)

        # Add this field to the document
        doc.add(field)

        # Add the document to the index
        writer.addDocument(doc)

    except Exception as e:
        print("Failed in indexDocs:", e)
    finally:
        # Close the writer whether or not indexing succeeded.
        writer.close()
Beispiel #10
0
 def indexDocs(self, root, writer):
     """
     Index the pages listed in ``index.txt`` into ``writer``.

     Each line of ``index.txt`` (opened relative to the working
     directory) is split on whitespace; the first item is used as the
     page URL and the second as a file name below ``root``.  The file is
     decoded with the charset reported by ``chardet`` (``utf-8`` when
     none is reported), parsed with ``BeautifulSoup`` and its text is
     segmented with ``jieba`` before being indexed.  A failure on one
     line is printed and the loop continues with the next line.

     :param root: directory the listed file names are joined onto.
     :param writer: object whose ``addDocument`` receives each document.
     """
     store = lucene.Field.Store
     index = lucene.Field.Index

     # ``with`` closes the listing file once every line has been handled.
     with open('index.txt', "r") as listing:
         for line in listing:
             try:
                 parts = line.strip().split()
                 url = parts[0]
                 doc_name = parts[1]
                 print("adding %s" % url)
                 path = os.path.join(root, doc_name)
                 with open(path) as page_file:
                     raw = page_file.read()
                 try:
                     domain = get_tld(url)
                 except Exception:
                     domain = "Unknown"
                 print(domain)
                 charset = chardet.detect(raw)['encoding']
                 if charset is None:
                     charset = 'utf-8'
                 contents = raw.decode(charset, 'ignore')
                 soup = BeautifulSoup(contents)
                 try:
                     # Work on the unicode title directly: passing it
                     # through str() raises for non-ASCII titles under
                     # Python 2's default ASCII codec, which turned every
                     # such title into "None".
                     title = soup.title.text.replace('\n', '')
                 except Exception:
                     title = "None"
                 print(title)
                 contents = soup.get_text()
                 contents = " ".join(jieba.cut(contents))
                 doc = lucene.Document()
                 doc.add(lucene.Field("name", doc_name, store.YES,
                                      index.NOT_ANALYZED))
                 doc.add(lucene.Field("path", path, store.YES,
                                      index.NOT_ANALYZED))
                 doc.add(lucene.Field("url", url, store.YES,
                                      index.NOT_ANALYZED))
                 doc.add(lucene.Field("title", title, store.YES,
                                      index.NOT_ANALYZED))
                 doc.add(lucene.Field("site", domain, store.YES,
                                      index.ANALYZED))
                 if contents:
                     doc.add(lucene.Field("contents", contents, store.NO,
                                          index.ANALYZED))
                 else:
                     print("warning: no content in %s" % doc_name)
                 # IndexWriter's addDocument writes the index entry into
                 # the index folder.
                 writer.addDocument(doc)
             except Exception as e:
                 print("Failed in indexDocs: %s" % e)
Beispiel #11
0
    def indexDocs(self, root, writer):
        """
        Index the images of the pages listed in ``index.txt``.

        Each line of ``index.txt`` (opened relative to the working
        directory) is split on whitespace; the first item is used as the
        page URL and the second as a file name below ``root``.  The file
        is decoded as GBK and parsed with ``BeautifulSoup``.  The image
        URL and ``alt`` text of the large picture inside the element with
        id ``p-box`` and of every ``li`` picture in the list next to it
        are collected, and one document holding the fields of all those
        images is added to ``writer``.  A failure on one line is printed
        and the loop continues with the next line.

        :param root: directory the listed file names are joined onto.
        :param writer: object whose ``addDocument`` receives each document.
        """
        store = lucene.Field.Store
        index = lucene.Field.Index

        # ``with`` closes the listing file once every line has been handled.
        with open('index.txt', "r") as listing:
            for line in listing:
                try:
                    parts = line.strip().split()
                    url = parts[0]
                    doc_name = parts[1]
                    print("adding %s" % url)
                    path = os.path.join(root, doc_name)
                    with open(path) as page_file:
                        raw = page_file.read()
                    contents = raw.decode('gbk', 'ignore')
                    soup = BeautifulSoup(contents)
                    try:
                        # Work on the unicode title directly: passing it
                        # through str() raises for non-ASCII titles under
                        # Python 2's default ASCII codec, which turned
                        # every such title into "None".
                        title = soup.title.text.replace('\n', '')
                    except Exception:
                        title = "None"
                    print(title)

                    # Handle the large picture at the top left first,
                    # then the small pictures below it.
                    p_box = soup.find(id='p-box')
                    sub_p_box = (p_box.div.nextSibling.nextSibling
                                 .nextSibling.nextSibling)
                    big_pic = sub_p_box.div.div.div.img

                    # Holds each imgurl with its description.  Every image
                    # gets its own dict; appending one shared dict would
                    # make all entries alias the last image.
                    collection = [{
                        'imgurl': urlparse.urljoin(url,
                                                   big_pic.get('src', '')),
                        'discription': big_pic.get('alt', ''),
                    }]

                    small_pic_group = (big_pic.parent.nextSibling
                                       .nextSibling.div.ul)
                    for item in small_pic_group.findAll('li'):
                        small_pic = item.img
                        collection.append({
                            'imgurl': urlparse.urljoin(
                                url, small_pic.get('src', '')),
                            'discription': " ".join(
                                jieba.cut(small_pic.get('alt', ''))),
                        })

                    doc = lucene.Document()
                    for entry in collection:
                        doc.add(lucene.Field("imgurl", entry['imgurl'],
                                             store.YES,
                                             index.NOT_ANALYZED))
                        doc.add(lucene.Field("discription",
                                             entry['discription'],
                                             store.NO,
                                             index.ANALYZED))
                        doc.add(lucene.Field("url", url,
                                             store.YES,
                                             index.NOT_ANALYZED))
                        doc.add(lucene.Field("urltitle", title,
                                             store.YES,
                                             index.NOT_ANALYZED))
                    # IndexWriter's addDocument writes the index entry
                    # into the index folder.
                    writer.addDocument(doc)
                    print("----------------------------------------------------")
                except Exception as e:
                    print("Failed in indexDocs: %s" % e)
    lucene.initVM()
    indexDir = "D:/Downloads/index"
    dir_ = lucene.SimpleFSDirectory(lucene.File(indexDir))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = lucene.IndexWriter(dir_, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    print("Currently there are %d documents in the index..." %
          writer.numDocs())

    # First sample document: stored, analyzed content plus its file path.
    content = (
        "Strategische Konzeption, Umsetzung und Betreuung von langfristig " +
        "hochwirksamen und messbar erfolgreichen Maßnahmen in Social Media.")
    doc = lucene.Document()
    doc.add(
        lucene.Field("content", content, lucene.Field.Store.YES,
                     lucene.Field.Index.ANALYZED))
    doc.add(
        lucene.Field("filePath", "Projekte/bericht.txt",
                     lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
    writer.addDocument(doc)

    # Second sample document, built the same way.
    content = (
        "Design von Marken, Screens und Interfaces sowie Entwicklung von " +
        "individuellen Facebook Apps, iPhone Apps und Webauftritten.")
    doc = lucene.Document()
    doc.add(
        lucene.Field("content", content, lucene.Field.Store.YES,
                     lucene.Field.Index.ANALYZED))
    # Index option referenced through ``lucene.Field`` like every other
    # field in this script; no bare ``Field`` name is bound here.
    doc.add(
        lucene.Field("filePath", "Projekte/implementierung.txt",
                     lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))