Example #1
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setStoreTermVectors(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPositions(True)
        t2.setIndexOptions(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

        file_path = root + 'r52-train-all-terms.txt'
        fd = open(file_path)
        contents = fd.readlines()
        fd.close()
        contents_list = [x.strip() for x in contents]
        for i in xrange(len(contents_list)):
            try:
                [topic, content] = contents_list[i].split('\t')
                doc = Document()
                doc.add(Field("id", str(i), t1))
                doc.add(Field("topic", topic, t1))
                doc.add(Field("contents", content, t2))
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
Example #2
def wikipedia_indexer(storage, wikipedia_file):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open(wikipedia_file)

    for i, line in enumerate(f):
        text = line.strip().decode('utf-8').split('\t')
        title = text[0]
        if 'disambigu' in text[0] or len(text) < 2:
            continue
        text = text[1]
        doc = Document()
        doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
        if writer.numDocs() % 1000 == 0:
            print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #3
    def indexsents(self, sentences, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for i, sent in enumerate(sentences):
            # print 'adding',i, sent
            try:
                root = os.getcwd()
                # contents = unicode(sent, 'iso-8859-1')
                doc = Document()
                doc.add(Field('name', str(i), t1))
                doc.add(Field('path', root, t1))

                if len(sent) > 0:
                    doc.add(Field('contents', sent.lower(), t2))
                else:
                    print('warning: no content in %s' % str(i))

                writer.addDocument(doc)
            except Exception as e:
                print('Failed in indexsents:', e)

        writer.commit()
        writer.close()
Example #4
    def add_doc(self, doc_id, title, txt, add_terms):

        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            if prm.top_tfidf > 0:
                words_idx = []
                words, _ = utils.top_tfidf(txt.lower(), self.idf,
                                           prm.top_tfidf, prm.min_term_freq)

                if len(words) == 0:
                    words.append('unk')

                for w in words:
                    if w in self.vocab:
                        words_idx.append(self.vocab[w])
                    else:
                        words_idx.append(-1)  # unknown words.

            else:
                txt_ = txt.lower()
                words_idx, words = utils.text2idx2([txt_], self.vocab,
                                                   prm.max_terms_per_doc)
                words_idx = words_idx[0]
                words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)
Example #5
 def publish_services(self, service_list):
     transformer = WSDLTransformer()
     current_document = 1
     indexDir = SimpleFSDirectory(File("index/"))
     writerConfig = IndexWriterConfig(
         Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT))
     writerConfig.setSimilarity(BM25Similarity())
     index_writer = IndexWriter(indexDir, writerConfig)
     for wsdl in service_list:
         if self._document_expansion:
             #bag_of_words = ' '.join(self._preprocessor(self._semantic_transformer.transform(transformer.transform(wsdl))))
             bag_of_words = ' '.join(
                 self._semantic_transformer.transform(
                     transformer.transform(wsdl)))
         else:
             #bag_of_words = ' '.join(self._preprocessor(transformer.transform(wsdl)))
             bag_of_words = ' '.join(transformer.transform(wsdl))
         doc = Document()
         doc.add(
             Field("content", bag_of_words, Field.Store.YES,
                   Field.Index.ANALYZED))
         doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
         index_writer.addDocument(doc)
         current_document += 1
     index_writer.close()
Example #6
    def index_image(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        with open(os.path.join(root, "index.txt"), mode="r",
                  encoding="utf8") as index:
            count = 0
            for line in index:
                print("\r", count + 1, end="", sep="")
                try:
                    image_url, content = line.strip().split()[:2]
                except ValueError as e:
                    print(e)
                    continue
                doc = Document()
                doc.add(Field("raw_content", content, t1))
                content = " ".join(
                    word for word in jieba.cut_for_search(content)
                    if word.strip() and word not in self.stop_words)

                doc.add(Field("url", image_url, t1))
                doc.add(Field("content", content, t2))
                writer.addDocument(doc)
                count += 1
            print("\n{count} image(s) added.".format(count=count))
Example #7
def indexMovie(movie):
    doc = Document()
    doc.add(Field('id', str(movie), StringField.TYPE_STORED))
    at_least_one_field = False

    maybe_tags = movies_tags.query('item == @movie')
    if not maybe_tags.empty:
        tags = maybe_tags[['tags']].values.flatten()[0]
        doc.add(Field('tags', tags, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    maybe_description = movies_descriptions.query('item == @movie')
    if not maybe_description.empty:
        description = maybe_description[['description']].values.flatten()[0]
        doc.add(Field('description', description, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    maybe_genres = movies_genres.query('item == @movie')
    if not maybe_genres.empty:
        genres = maybe_genres[['genres']].values.flatten()[0]
        doc.add(Field('genres', genres, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    if at_least_one_field:
        writer.addDocument(doc)
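Note: indexMovie relies on module-level state not shown here: three pandas DataFrames (movies_tags, movies_descriptions, movies_genres), each with an 'item' column, plus an open writer. A hypothetical setup, just to make the contract concrete:

    import pandas as pd

    movies_tags = pd.DataFrame({'item': [1], 'tags': ['action adventure']})
    movies_descriptions = pd.DataFrame({'item': [1], 'description': ['A hero saves the day.']})
    movies_genres = pd.DataFrame({'item': [1], 'genres': ['Action']})
    # writer = IndexWriter(...)  # opened elsewhere at module level
    indexMovie(1)  # adds a doc only if at least one field was found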
Example #8
def addDoc(w, data):
    doc = Document()
    for field in data:
        value, type = data[field][0], data[field][1]
        '''
        if type!='INTEGER_STORED':
           #print ('field=%s  len=%d'%(field,len(value)))
           print ('field=%s  value=%s'%(field,value))
        else:
           print ('field=%s  value=%d'%(field,value))
        '''

        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    try:
        w.addDocument(doc)
    except:
        #print ('error cat=%s'%(data['category'][0]))
        print('-----------------------------------')
        for field in data:
            value, type = data[field][0], data[field][1]
            print('field=%s\nvalue=%s' % (field, str(value)))
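Note: the shape of the data argument is implied by the branches above; a hypothetical call, where w is an open IndexWriter and each value is a (value, type_name) pair:

    data = {
        'title':    ('A sample title', 'TextField'),
        'category': ('books', 'StringField'),
        'views':    (42, 'INTEGER_STORED'),
    }
    addDoc(w, data)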
Example #9
    def indexer(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        def replacer(text):
            chars = '\\`*_{}[]()>#+-.!$‘'
            for c in chars:
                if c in text:
                    text = text.replace(c, ' ')
            return text

        for root, dirnames, filenames in os.walk(root):
            i = 0
            for filename in filenames:
                i += 1
                with open(os.path.join(root, filename)) as f:
                    for line in f.readlines():
                        line = line.split(' ', 2)
                        docname = line[0] + ' ' + line[1]
                        name = replacer(line[0])
                        contents = line[2]
                        doc = Document()
                        doc.add(Field('docname', docname, t1))
                        doc.add(Field('name', name, t1))
                        doc.add(Field('contents', contents, t1))
                        writer.addDocument(doc)
                print('File %d done indexing' % i)
Example #10
    def indexDocs(self, root, writer):
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            # traverse through the doc directory
            for filename in filenames:
                #	if not filename.endswith('.cdc'):
                #		continue
                try:
                    # only add the filename and path for indexing
                    path = os.path.join(root, filename)
                    print "adding file : ", path
                    file = open(path)
                    contents = unicode(file.read(), 'utf-8')
                    file.close()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in ", filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "failed in indexDocs:", e
Example #11
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        inFile = open(str(args["inputFile"]))
        indexName = inFile.readline()
        while (indexName != ''):
            print "adding", indexName
            doc = Document()
            doc.add(Field("name", indexName, t1))
            #doc.add(Field("path", root, t1))
            text = inFile.readline()
            if (len(text) > 0):
                print("contents: %s\n" % text)
                doc.add(Field("contents", text, t2))
            else:
                print "warning: no content in %s" % indexName
            indexName = inFile.readline()
            writer.addDocument(doc)
        inFile.close()
Example #12
    def indexDocs(self, root, writer):

        f = codecs.open('picIndex.txt', 'r', encoding='utf-8')
        picDict = {}
        for line in f:  # iterate the codecs reader directly so lines are decoded
            ls = line.split('seg^*')
            url = ls[0]
            title = ls[1]
            src = ls[2]
            alt = ls[3]
            picDict[src] = [url, title, alt]
        f.close()
        for src in picDict:
            doc = Document()
            doc.add(
                Field("src", src, Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(
                Field("url", picDict[src][0], Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(
                Field("title", picDict[src][1], Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(
                Field("alt", picDict[src][2], Field.Store.YES,
                      Field.Index.ANALYZED))
            writer.addDocument(doc)
Example #13
def create_index():

    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (
        n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #14
    def build(self, index):

        writer = self.getWriter(directory=index.index,
                                analyzer=SimpleAnalyzer(
                                    Version.LUCENE_CURRENT))

        seed(101)
        for d in xrange(self.minId, self.maxId + 1):
            doc = Document()
            doc.add(Field("id", self.pad(d), StringField.TYPE_STORED))
            if index.allowNegativeRandomInts:
                r = randint(~self.MAX_INT, self.MAX_INT)
            else:
                r = randint(0, self.MAX_INT)

            if index.maxR < r:
                index.maxR = r

            if r < index.minR:
                index.minR = r

            doc.add(Field("rand", self.pad(r), StringField.TYPE_STORED))
            doc.add(Field("body", "body", StringField.TYPE_STORED))
            writer.addDocument(doc)

        writer.commit()
        writer.close()
Example #15
    def Indexing(self, writer):
        print("Indexing Segmented File [", SEGMENTATION_FILE, "]")
        with open(SEGMENTATION_FILE, 'r') as f:
            line_count = 0
            for line in f:
                # build the FieldType for context: indexed, stored, and tokenized
                fieldtype_context = FieldType()
                fieldtype_context.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
                fieldtype_context.setStored(True)
                fieldtype_context.setTokenized(True)

                # build the FieldType for phrase: stored only
                fieldtype_phrase = FieldType()
                fieldtype_phrase.setStored(True)

                # process the segmented content, storing words and POS tags separately
                processed_context, processed_phrase = self.process_line(line)

                doc = Document()
                # the context field records the article's content
                doc.add(Field('context', processed_context, fieldtype_context))
                # the phrase field records the POS tag of each word in the article
                doc.add(Field('phrase', processed_phrase, fieldtype_phrase))

                # write the document to the index
                writer.addDocument(doc)

                # for tracking progress
                print("\r", str(line_count), " lines", end="", flush=True)
                line_count = line_count + 1
                if line_count > self.index_limit and not self.training:
                    break

        writer.close()
        print()
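Note: process_line is not shown in this example; from its use above it evidently splits a segmented line of word/POS tokens into two parallel strings. A minimal sketch of that contract (an assumption, not the original code):

    def process_line(self, line):
        words, tags = [], []
        for token in line.split():
            word, _, tag = token.partition('/')
            words.append(word)
            tags.append(tag)
        return ' '.join(words), ' '.join(tags)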
Example #16
    def setUp(self):
        super(PyLuceneThreadTestCase, self).setUp()

        self.classLoader = Thread.currentThread().getContextClassLoader()

        writer = self.getWriter(analyzer=StandardAnalyzer())

        doc1 = Document()
        doc2 = Document()
        doc3 = Document()
        doc4 = Document()
        doc1.add(Field("field", "one", TextField.TYPE_STORED))
        doc2.add(Field("field", "two", TextField.TYPE_STORED))
        doc3.add(Field("field", "three", TextField.TYPE_STORED))
        doc4.add(Field("field", "one", TextField.TYPE_STORED))

        writer.addDocument(doc1)
        writer.addDocument(doc2)
        writer.addDocument(doc3)
        writer.addDocument(doc4)
        writer.commit()
        writer.close()

        self.testData = [('one', 2), ('two', 1), ('three', 1),
                         ('five', 0)] * 500
        self.lock = threading.Lock()
        self.totalQueries = 0
Example #17
    def build_index(self, dict_data):
        print("loading data...")
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for k, v in dict_data.items():
            doc = Document()
            doc.add(Field("id", k, t1))
            doc.add(Field("content", v, t2))
            self.writer.addDocument(doc)

        ticker = Ticker()
        print("commit index")
        threading.Thread(target=ticker.run).start()
        self.writer.commit()
        self.writer.close()
        ticker.tick = False
        print("done")
Example #18
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    contents = unicode(file.read(), 'gbk')
                    file.close()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
Example #19
    def indexsents(self, sentences, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for i, sent in enumerate(sentences):
            #print "adding",i, sent
            try:
                root = os.getcwd()
                #contents = unicode(sent, 'iso-8859-1')
                doc = Document()
                doc.add(Field("name", str(i), t1))
                doc.add(Field("path", root, t1))
                if len(sent) > 0:
                    doc.add(Field("contents", sent.lower(), t2))
                else:
                    print "warning: no content in %s" % str(i)
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexsents:", e
Example #20
def get_document(fname, split_by=None):
    docs = []
    _name = os.path.split(fname)[-1]
    with open(fname) as f:
        contents = f.read()
        if split_by:
            paragraphs = contents.split(split_by)
            for ix, par in enumerate(paragraphs):
                if not par:
                    continue
                doc = Document()
                name = "{}_{}".format(_name, ix)
                doc.add(
                    Field('filename', name, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                doc.add(
                    Field('content', par, Field.Store.YES,
                          Field.Index.ANALYZED))
                docs.append(doc)
        else:
            doc = Document()
            doc.add(
                Field('filename', _name, Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(
                Field('content', contents, Field.Store.YES,
                      Field.Index.ANALYZED))
            docs.append(doc)
    return docs
Example #21
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        index_file = open("index.txt", 'r')
        for line in index_file.readlines():

            try:
                src = line.strip().split('\t')[0]
                filename = line.strip().split('\t')[1]
                tag = line.strip().split('\t')[2]
                path = os.path.join(root, filename)

                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                doc.add(Field("src", src, t1))

                if len(tag) > 0:
                    doc.add(Field("tag", tag, t2))
                else:
                    print "warning: no tag in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
Example #22
    def setUp(self):
        super(Test_Bug1763, self).setUp()

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.d1 = RAMDirectory()
        self.d2 = RAMDirectory()

        w1, w2 = [
            self.getWriter(directory=d, analyzer=self.analyzer)
            for d in [self.d1, self.d2]
        ]
        doc1 = Document()
        doc2 = Document()
        doc1.add(
            Field("all", "blah blah double blah Gesundheit",
                  TextField.TYPE_NOT_STORED))
        doc1.add(Field('id', '1', StoredField.TYPE))
        doc2.add(
            Field("all", "a quick brown test ran over the lazy data",
                  TextField.TYPE_NOT_STORED))
        doc2.add(Field('id', '2', StoredField.TYPE))
        w1.addDocument(doc1)
        w2.addDocument(doc2)
        for w in [w1, w2]:
            w.close()
Example #23
    def indexDocs(self, sourceDir, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for sourceDir, dirnames, filenames in os.walk(sourceDir):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print(filename)
                try:
                    path = os.path.join(sourceDir, filename)
                    file = open(path, 'r', encoding="utf-8")
                    contents = file.read()
                    #contents = str(filecontent, 'utf-8')
                    #contents = filecontent.encode('utf-8')
                    #print('path', path, len(contents))
                    doc = Document()
                    doc.add(Field("name", filename, t1))  # filename (title)
                    #doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field(queryField, contents, t2))  # content
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
                    file.close()
                except NameError:
                    print("Failed in indexDocs:")
Example #24
def addDoc(w, data):
    doc = Document()
    #print ('----------------------------')
    for field in data:
        value, type = data[field][0], data[field][1]
        '''
        print ('field:%s  type:%s'%(field,type))
        print (value+'\n')
        '''
        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'CUSTOM_FIELD_TEXT_DF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_DF))
        elif type == 'CUSTOM_FIELD_TEXT_BF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_BF))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    w.addDocument(doc)
Example #25
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(False)
        t1.setStored(True)
        t1.setTokenized(False)

        t3 = FieldType()
        t3.setIndexed(True)
        t3.setStored(False)
        t3.setTokenized(True)  # tokenize with the preconfigured analyzer (here, by whitespace)
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        total = 0
        file = open(root, "r")
        for line in file.readlines():
            try:
                imgurl, itemurl, content = line.split('\t')
                total += 1
                print total
                print "adding", content
                contents = ' '.join(jieba.cut(content))
                doc = Document()
                doc.add(Field("imgurl", imgurl, t1))
                doc.add(Field("itemurl", itemurl, t1))
                doc.add(Field("title", content, t1))
                doc.add(Field("contents", contents, t3))
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
Example #26
def indexDictionary(d, writer):
    for k, v in d.iteritems():
        doc = Document()
        doc.add(Field('filename', k, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', v, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    return writer.numDocs()
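Note: an illustrative call (Python 2, matching the iteritems() above), where writer is an open IndexWriter:

    docs = {'a.txt': u'first document', 'b.txt': u'second document'}
    print indexDictionary(docs, writer)  # prints the resulting doc count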
Example #27
    def addDocument(self, writer, new_doc, metadata, fields_to_process, bow_info):
        """
            Add a document to the index. Does this using direct Lucene access.

            :param new_doc: dict of fields with values
            :type new_doc:dict
            :param metadata: ditto
            :type metadata:dict
            :param fields_to_process: only add these fields from the doc dict
            :type fields_to_process:list
            :param bow_info: bag-of-words statistics; must include "total_numterms"
        """
        doc = Document()
        total_numTerms = bow_info["total_numterms"]
        # each BOW now comes with its field
        for field in fields_to_process:
            field_object = Field(field, new_doc[field], Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)
##            boost = math.sqrt(numTerms[field]) / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
            boost = 1 / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
            field_object.setBoost(float(boost))
            doc.add(field_object)

        json_metadata = json.dumps(metadata)
        # note: `guid` is not defined in this snippet; presumably set in the enclosing scope
        doc.add(Field("guid", guid, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("bow_info", json.dumps(bow_info), Field.Store.YES, Field.Index.NO))
        doc.add(Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
        doc.add(Field("year_from", metadata["year"], Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
Example #28
def index_docs(root, writer):
    # metadata: name and path
    metadata = FieldType()
    metadata.setStored(True)  # as is value
    metadata.setTokenized(False)
    metadata.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    # content: abstract and body
    content_type = FieldType()
    content_type.setStored(True)  # to highlight on search results
    content_type.setTokenized(True)  # tokenize words
    content_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for directory, _, file_names in walk(root):
        for file_name in file_names:
            name, extension = splitext(file_name)
            if extension not in DOC_FORMATS:
                continue  # skip unsupported formats

            file_path = join(directory, file_name)
            print ' ', file_path

            # Build indexed document
            doc = Document()
            doc.add(Field('name', file_name, metadata))
            doc.add(Field('path', directory, metadata))

            # Read file contents
            content = process(file_path, 'utf-8', method='pdfminer')
            abstract = extract_abstract(content)
            doc.add(Field('content', content, content_type))
            doc.add(Field('abstract', abstract, content_type))

            writer.addDocument(doc)
Example #29
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        wikiFile = ZipFile(root, 'r')
        files = wikiFile.namelist()

        i = 0
        for file in files[1:]:
            i += 1
            wiki = wikiFile.open(file, 'r')
            for line in codecs.iterdecode(wiki, 'utf8'):
                normalized = unicodedata.normalize('NFD', line).split(' ', 2)
                if not normalized[1].isdigit():
                    continue
                docname = normalized[0] + ' ' + normalized[1]
                name = re.sub(r'[^a-zA-Z0-9]', ' ', normalized[0])
                contents = normalized[2]
                doc = Document()
                doc.add(Field('docname', docname, t1))
                doc.add(Field('name', name, t1))
                doc.add(Field('contents', contents, t1))
                writer.addDocument(doc)
            print('File %d done indexing' % i, file)
Example #30
def index(indexdir):
  lucene.initVM()
  indexDir = SimpleFSDirectory(File(indexdir))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)

  f = open('data/docid.documento-xml.txt')
  st = PorterStemmer()
  for i, line in enumerate(f.readlines()):
    id, xmltext = line.split('\t')
    xmltext = xmltext.rstrip('\n')
    xmldoc = minidom.parseString(xmltext)
    title = xmldoc.getElementsByTagName("TITLE")
    title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
    authors = xmldoc.getElementsByTagName("AUTHORS")
    authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
    abstract = xmldoc.getElementsByTagName("ABSTRACT")
    abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
    doc = Document()
    doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)
    print "indexed %s docs" % (i+1)

  writer.close()
Example #31
def build_index(document_path, dir_path):
    lucene.initVM()
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(index_dir, config)

    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(False)

    with open(document_path) as input_file:
        for line in input_file:
            segs = line.strip().split(" ")
            music_path, music_tags = segs[0], segs[1].split(",")

            document = Document()
            document.add(Field("content", " ".join(music_tags), t1))
            document.add(Field("url", music_path, t2))
            index_writer.addDocument(document)

    index_writer.close()
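Note: the input format assumed by build_index is one entry per line, "<music_path> <tag1,tag2,...>", separated by a single space. A hypothetical invocation:

    # music_tags.txt:
    #   /music/a.mp3 rock,guitar
    #   /music/b.mp3 jazz,piano
    build_index("music_tags.txt", "music_index/")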
Example #32
 def __init__(self, path):
     lazyImport()
     self._writer, self._reader, self._searcher = self._getLucene(path)
     self._latestModifications = {}
     self._doc = Document()
     self._keyField = StringField("key", "", Field.Store.NO)
     self._valueField = Field("value", "", UNINDEXED_TYPE)
     self._doc.add(self._keyField)
     self._doc.add(self._valueField)
Example #33
    def add_article(self, article):
        # constructing a document
        doc = Document()

        title = Field('title', article.title, Field.Store.YES, Field.Index.ANALYZED)
        title.setBoost(10.0)
        doc.add(title)

        description = Field('description', article.description, Field.Store.YES, Field.Index.ANALYZED)
        description.setBoost(5.0)
        doc.add(description)

        doc.add(Field('keywords', article.keywords, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', article.content, Field.Store.YES, Field.Index.ANALYZED))
        if article.date:
            doc.add(Field('date', article.date, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if article.last_modified:
            doc.add(Field('last_modified', article.last_modified, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if article.images:
            doc.add(Field('image_url', article.images[0][0], Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field('image_text', article.images[0][1], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field('url', article.url, Field.Store.YES, Field.Index.NOT_ANALYZED))

        # creates document or updates if already exists
        self.writer.updateDocument(Term("url", article.url), doc)
Example #34
    def testDocBoost(self):

        writer = self.getWriter(analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT))
    
        f1 = Field("field", "word", TextField.TYPE_STORED)
        f2 = Field("field", "word", TextField.TYPE_STORED)
        f2.setBoost(2.0)
    
        d1 = Document()
        d2 = Document()
    
        d1.add(f1)                                 # boost = 1
        d2.add(f2)                                 # boost = 2
    
        writer.addDocument(d1)
        writer.addDocument(d2)
        writer.close()

        scores = [0.0] * 2

        class collector(PythonCollector):
            def __init__(_self, scores):
                super(collector, _self).__init__()
                _self.scores = scores
                _self.base = 0
            def collect(_self, doc, score):
                _self.scores[doc + _self.base] = score
            def setNextReader(_self, context):
                _self.base = context.docBase
            def acceptsDocsOutOfOrder(_self):
                return True

        self.getSearcher().search(TermQuery(Term("field", "word")),
                                  collector(scores))
    
        lastScore = 0.0
        for score in scores:
            self.assert_(score > lastScore)
            lastScore = score
Example #35
def create_index(index) :
	indexDir = SimpleFSDirectory(File(index))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open('f:/nlp/data/questions/combine.txt')
	for line in f :
		line = get_data_from_text(line.decode('utf-8'))
		doc = Document()
		field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
		field.setBoost(2.0)
		doc.add(field)
		writer.addDocument(doc)
	
	print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
Example #36
File: views.py Project: kevkid/YIF
def survey(request):
    ipAddr = get_client_ip(request)
    instances = (Classes.objects.values_list('image_class_desc'))
    instances = [i[0] for i in instances]
    #cnt = len(instances)
    #lets get out choice
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)

    try:
        #image_class = image.objects.get(pk=request.POST['survey'])
        s = request.POST['survey']  # get from post
    except (KeyError, Classes.DoesNotExist):
        return render(request, 'web/index.html',{
            'error_message': "You didn't select a choice.",
        })
    else:
        image_class = instances[int(s)]
        docNum = request.POST['imageID']#get document id
        doc = reader.document(int(docNum))
        fname = doc.get("filename")
        print(fname)
        #SimpleFSDirectory(File(location)).clearLock(IndexWriter.WRITE_LOCK_NAME);
        fileClassField = doc.get("Classification")
        if str(fileClassField) == "None":#check if the field exists####NEED TO CHECK THIS
            fileClassField = str(ipAddr + ":" + image_class)#I think we must add an ip address to this
        else:
            fileClassField = str(ipAddr + ":" + fileClassField) + ", " + image_class
            
        #doc.removeField("Classification")
        
        #doc.add(StringField("Classification", fileClassField, Field.Store.YES))
        #t = doc.get("Classification")
        #reader.close()
        indexDir = SimpleFSDirectory(File(location))
        writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
        writer = IndexWriter(indexDir, writerConfig)
        fields = doc.getFields()#get all fields
        doc2 = Document()
        classificationFieldFlag = False
        for f in fields:
            field = Field.cast_(f)
            (k, v) = field.name(), field.stringValue()
            if k == "Classification":
                classificationFieldFlag = True
                field = StringField("Classification", fileClassField, Field.Store.YES)
                doc2.add(field)
            else:
                doc2.add(field)

        if classificationFieldFlag == False:  # the field does not exist in the document, so it must be added
            doc2.add(StringField("Classification", fileClassField, Field.Store.YES))
#         doc2.add(StringField("Classification", fileClassField, Field.Store.YES))
#         doc2.add(StringField("fid", doc.get("fid"), Field.Store.YES))
#         doc2.add(StringField("articleid", doc.get("articleid"), Field.Store.YES))
#         doc2.add(StringField("caption", doc.get("caption"), Field.Store.YES))
#         doc2.add(StringField("figureid", doc.get("figureid"), Field.Store.YES))
#         doc2.add(StringField("filename", doc.get("filename"), Field.Store.YES))
#         doc2.add(StringField("filepath", doc.get("filepath"), Field.Store.YES))
#         doc2.add(StringField("label", doc.get("label"), Field.Store.YES))
        
        #writer.updateDocument(Term("fid","f000000000023"), doc2)#If field exists update
        writer.updateDocument(Term("fid", doc.get("fid")), doc2)#If field exists update
        writer.commit()
        #writer.optimize()
        writer.close()
        #writer.unlock(SimpleFSDirectory(File(location)))
        
    return HttpResponseRedirect(reverse('web:index', args=()))
Example #37
    def _getIndex(self, even, odd):

        mergePolicy = LogDocMergePolicy()
        mergePolicy.setMergeFactor(1000)
        directory = RAMDirectory()
        self.dirs.append(directory)

        writer = self.getWriter(directory=directory,
                                analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT),
                                maxBufferedDocs=2, mergePolicy=mergePolicy)

        if self.dvStringSorted:
            # Index sorted
            stringDVType = FieldInfo.DocValuesType.SORTED
        elif self.notSorted:
            # Index non-sorted
            stringDVType = FieldInfo.DocValuesType.BINARY
        else:
            # sorted anyway
            stringDVType = FieldInfo.DocValuesType.SORTED

        ft1 = FieldType()
        ft1.setStored(True)
        ft2 = FieldType()
        ft2.setIndexed(True)

        for i in xrange(len(self.data)):
            if (i % 2 == 0 and even) or (i % 2 == 1 and odd):
                doc = Document()
                doc.add(Field("tracer", self.data[i][0], ft1))
                doc.add(TextField("contents", self.data[i][1], Field.Store.NO))
                if self.data[i][2] is not None:
                    doc.add(StringField("int", self.data[i][2], Field.Store.NO))
                    if self.supportsDocValues:
                        doc.add(NumericDocValuesField("int_dv", Long.parseLong(self.data[i][2])))
                if self.data[i][3] is not None:
                    doc.add(StringField("float", self.data[i][3], Field.Store.NO))
                    if self.supportsDocValues:
                        doc.add(FloatDocValuesField("float_dv", Float.parseFloat(self.data[i][3])))

                if self.data[i][4] is not None:
                    doc.add(StringField("string", self.data[i][4], Field.Store.NO))
                    if self.supportsDocValues:
                        if stringDVType == FieldInfo.DocValuesType.SORTED:
                            doc.add(SortedDocValuesField("string_dv", BytesRef(self.data[i][4])))
                        elif stringDVType == FieldInfo.DocValuesType.BINARY:
                            doc.add(BinaryDocValuesField("string_dv", BytesRef(self.data[i][4])))
                        else:
                            raise ValueError("unknown type " + stringDVType)

                if self.data[i][5] is not None:
                    doc.add(StringField("custom", self.data[i][5], Field.Store.NO))
                if self.data[i][6] is not None:
                    doc.add(StringField("i18n", self.data[i][6], Field.Store.NO))
                if self.data[i][7] is not None:
                    doc.add(StringField("long", self.data[i][7], Field.Store.NO))
                if self.data[i][8] is not None:
                    doc.add(StringField("double", self.data[i][8], Field.Store.NO))
                    if self.supportsDocValues:
                        doc.add(NumericDocValuesField("double_dv", Double.doubleToRawLongBits(Double.parseDouble(self.data[i][8]))))
                if self.data[i][9] is not None:
                    doc.add(StringField("short", self.data[i][9], Field.Store.NO))
                if self.data[i][10] is not None:
                    doc.add(StringField("byte", self.data[i][10], Field.Store.NO))
                if self.data[i][11] is not None:
                    doc.add(StringField("parser", self.data[i][11], Field.Store.NO))

                for f in doc.getFields():
                    if f.fieldType().indexed() and not f.fieldType().omitNorms():
                        Field.cast_(f).setBoost(2.0)

                writer.addDocument(doc)

        reader = writer.getReader()
        writer.close()

        return self.getSearcher(reader=reader)
Example #38
    def _getFullStrings(self):

        mergePolicy = LogDocMergePolicy()
        mergePolicy.setMergeFactor(97)
        directory = RAMDirectory()
        self.dirs.append(directory)

        writer = self.getWriter(directory=directory,
                                analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT),
                                maxBufferedDocs=4, mergePolicy=mergePolicy)
        
        onlyStored = FieldType()
        onlyStored.setStored(True)
        fixedLen = self.getRandomNumber(2, 8)
        fixedLen2 = self.getRandomNumber(1, 4)

        for i in xrange(NUM_STRINGS):
            doc = Document()

            num = self.getRandomCharString(self.getRandomNumber(2, 8), 48, 52)
            doc.add(Field("tracer", num, onlyStored))
            doc.add(StringField("string", num, Field.Store.NO))
            if self.supportsDocValues:
                if self.dvStringSorted:
                    doc.add(SortedDocValuesField("string_dv", BytesRef(num)))
                else:
                    doc.add(BinaryDocValuesField("string_dv", BytesRef(num)))

            num2 = self.getRandomCharString(self.getRandomNumber(1, 4), 48, 50)
            doc.add(StringField("string2", num2, Field.Store.NO))
            if self.supportsDocValues:
                if self.dvStringSorted:
                    doc.add(SortedDocValuesField("string2_dv", BytesRef(num2)))
                else:
                    doc.add(BinaryDocValuesField("string2_dv", BytesRef(num2)))
            doc.add(Field("tracer2", num2, onlyStored))
            for f2 in doc.getFields():
                if f2.fieldType().indexed() and not f2.fieldType().omitNorms():
                    Field.cast_(f2).setBoost(2.0)

            numFixed = self.getRandomCharString(fixedLen, 48, 52)
            doc.add(Field("fixed_tracer", numFixed, onlyStored))
            doc.add(StringField("string_fixed", numFixed, Field.Store.NO))
            if self.supportsDocValues:
                if self.dvStringSorted:
                    doc.add(SortedDocValuesField("string_fixed_dv", BytesRef(numFixed)))
                else:
                    doc.add(BinaryDocValuesField("string_fixed_dv", BytesRef(numFixed)))

            num2Fixed = self.getRandomCharString(fixedLen2, 48, 52)
            doc.add(StringField("string2_fixed", num2Fixed, Field.Store.NO))
            if self.supportsDocValues:
                if self.dvStringSorted:
                    doc.add(SortedDocValuesField("string2_fixed_dv", BytesRef(num2Fixed)))
                else:
                    doc.add(BinaryDocValuesField("string2_fixed_dv", BytesRef(num2Fixed)))
            doc.add(Field("tracer2_fixed", num2Fixed, onlyStored))
            for f2 in doc.getFields():
                if f2.fieldType().indexed() and not f2.fieldType().omitNorms():
                    Field.cast_(f2).setBoost(2.0)

            writer.addDocument(doc)

        writer.close()

        return self.getSearcher(directory=directory)
Example #39
	def reindex(self):
		''' Re-indexes the entire database into Index file'''
		start = time.time()

		# get all posts
		posts = self._tuples_to_dict(self._fetch_all_questions(), self._posts_fields)
		if not posts:
			raise Exception("FATAL Error: Could not fetch posts from Database")

		# open indexer
		# lucene.initVM(vmargs=['-Djava.awt.headless=true'])
		# print 'lucene', lucene.VERSION

		store = SimpleFSDirectory(File(self.index_dir))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
		writer = IndexWriter(store, config)

		indexedField = FieldType()
		indexedField.setIndexed(True)
		indexedField.setStored(True)
		indexedField.setTokenized(True)
		indexedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

		storedField = FieldType()
		storedField.setIndexed(False)
		storedField.setStored(True)
		storedField.setTokenized(False)
		storedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

		fieldTypes = {
						'type'		: storedField,
						'id'		: storedField,
						'title'		: indexedField,
						'question'	: indexedField,
						'answer'	: indexedField,
						# 'comment'	: indexedField,
						'tag'		: indexedField,
						'extra'		: indexedField,
		}

		# get their comments
		num_docs = 0
		for post in posts:
			if self.status_mode: print "\r {0:.2f} %complete".format(((num_docs/142627.0)*100)),
			if self.debug : print "\n","*"*20,"\nIndexing post: ", post['id'], "from ", post['extra']
			if self.debug and self.verbose_values: print post
			answers = self._tuples_to_dict(self._fetch_all_answers(post['id'], post['extra']), self._answer_fields)


			# add comment field
			for answer in answers:
				num_docs += 1
				if self.debug: print "\n","+"*10, "\nMaking new Document"
				doc = Document()
				if self.debug: print "Adding doc type"
				doc.add(Field("type", self.doctype, fieldTypes['type']))
				
				# make fields
				if self.debug: print "Adding post fields"
				for i in xrange(len(self._posts_fields)):
					f = Field(self._posts_fields[i], self._cleanup_tag(post[self._posts_fields[i]]), fieldTypes[self._posts_fields[i]])
					f.setBoost(self._fields_boost[self._posts_fields[i]])
					doc.add(f)


				if self.status_mode: print "\t Indexing answer: ", answer['answer_id']
				if self.debug and self.verbose_values: print answer
				# answered_doc = copy.deepcopy(doc)
				# make comment field
				f = Field("answer", self._cleanup_tag(answer['answer']), fieldTypes['answer'])
				f.setBoost(self._fields_boost['answer'])
				doc.add(f)
				# calculate paths
				# commented_doc = copy.deepcopy(answered_doc)
				# comments = self._comments_to_comment_string(self._tuples_to_dict(self._fetch_all_comments(answer['id']), self._comment_fields))

				# if self.debug: print "\t\tAdding comments: ", comments
				# commented_doc.add(Field("comment", self._cleanup_tag(comments), fieldTypes['comment']))

				# write index
				if self.debug: print "\tAdding document {doc_id} to index".format(doc_id=post['id'])
				writer.addDocument(doc)

				# del answered_doc
				# del commented_doc

			if self.debug: print "Commiting document to index"
			writer.commit()

		# close index
		if self.status_mode: print "Closing index write"
		writer.close()
		end = time.time() - start

		if self.status_mode: print "\n","-"*20, \
			"\nTotal time spent in indexing: ", end, "seconds" \
			"\nIndexed {num_docs} documents".format(num_docs=num_docs)
Example #40
class LuceneKeyValueStore(object):
    def __init__(self, path):
        lazyImport()
        self._writer, self._reader, self._searcher = self._getLucene(path)
        self._latestModifications = {}
        self._doc = Document()
        self._keyField = StringField("key", "", Field.Store.NO)
        self._valueField = Field("value", "", UNINDEXED_TYPE)
        self._doc.add(self._keyField)
        self._doc.add(self._valueField)

    def get(self, key, default=None):
        try:
            return self[key]
        except KeyError:
            return default

    def __setitem__(self, key, value):
        key = str(key)
        value = str(value)
        self._maybeReopen()
        self._keyField.setStringValue(key)
        self._valueField.setStringValue(value)
        self._writer.updateDocument(Term("key", key), self._doc)
        self._latestModifications[key] = value

    def __getitem__(self, key):
        key = str(key)
        value = self._latestModifications.get(key)
        if value is DELETED_RECORD:
            raise KeyError(key)
        if not value is None:
            return value
        self._maybeReopen()
        topDocs = self._searcher.search(TermQuery(Term("key", key)), 1)
        if topDocs.totalHits == 0:
            raise KeyError(key)
        return self._searcher.doc(topDocs.scoreDocs[0].doc).get("value")

    def __delitem__(self, key):
        key = str(key)
        self._writer.deleteDocuments(Term("key", key))
        self._latestModifications[key] = DELETED_RECORD

    def __len__(self):
        raise NotImplementedError

    def __iter__(self):
        raise NotImplementedError

    def items(self):
        raise NotImplementedError

    def keys(self):
        raise NotImplementedError

    def values(self):
        raise NotImplementedError

    def _getLucene(self, path):
        directory = FSDirectory.open(Paths.get(path))
        config = IndexWriterConfig(None)
        config.setRAMBufferSizeMB(256.0) # faster
        config.setUseCompoundFile(False) # faster, for Lucene 4.4 and later
        writer = IndexWriter(directory, config)
        reader = writer.getReader()
        searcher = IndexSearcher(reader)
        return writer, reader, searcher

    def _maybeReopen(self):
        if len(self._latestModifications) > 10000:
            newReader = DirectoryReader.openIfChanged(self._reader, self._writer, True)
            if not newReader is None:
                self._reader.close()
                self._reader = newReader
                self._searcher = IndexSearcher(self._reader)
                self._latestModifications.clear()

    def commit(self):
        self._writer.commit()

    def close(self):
        self._writer.close()
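Note: a usage sketch for the key-value store above, assuming lazyImport() and UNINDEXED_TYPE are defined in the surrounding module as the class requires (the path is illustrative):

    store = LuceneKeyValueStore('/tmp/kvstore')
    store['answer'] = 42
    print(store.get('answer'))  # values round-trip as strings: '42'
    del store['answer']
    store.commit()
    store.close()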
Example #41
    def indexTable(self, writer):

        #connection 
        con = None

        #define the index of all the fields
        #---------step 2:connect to mysql----------
        con = mdb.connect('localhost','root','testgce','douban_movie_v3')

        #t_num = FieldType.NumericType it is wrong!!
        t_num = FieldType()
        t_num.setStored(False)

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        t3 = FieldType()
        t3.setIndexed(True)
        t3.setStored(True)
        t3.setTokenized(True)
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        maxDict = utils.maxDict
        # range of the document boost values
        base = DOC_BOOST_RANGE[0]
        upper = DOC_BOOST_RANGE[1]

        with con:
            # Careful with codecs
            con.set_character_set('utf8')

            cur = con.cursor()
            # Again, the codecs
            cur.execute('SET NAMES utf8;')
            cur.execute('SET CHARACTER SET utf8;')
            cur.execute('SET character_set_connection=utf8;')
            
            #------step 3: choose the right table------
            cur.execute("SELECT * FROM movie_items")

            numrows = int(cur.rowcount)
            print 'numrows:',numrows
            for i in range(numrows):
                print
                row = cur.fetchone()

                #------step 4:Index your field------
                summary = row[SUMMARY]  
                subject_id = row[SUBJECT_ID]


                print 'id'+subject_id
                year = utils.formatYear(row[YEAR])
                wtfFile = open('wtf.txt', 'a')  # opened before the try so the except clause can write to it
                try:
                    date = DateTools.stringToDate(year.replace('-', ' '))
                    dateStr = DateTools.dateToString(date, DateTools.Resolution.DAY)
                except:
                    wtfFile.write(year + '\n')

                        

                doc = Document()

                #boosting
                boostProb = utils.calcBoostProb(row,maxDict,dateStr)
                boost = base + boostProb*(upper-base)

                doc.add(FloatField("boost",boost,Field.Store.YES))
                doc.add(StringField("year",dateStr,Field.Store.YES))
                print 'dateStr:'+dateStr
                #A text field is a sequence of terms that has been tokenized while a string field is a single term (although it can also be multivalued.)
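                #e.g. StringField("topic", "machine learning", ...) indexes the single
                #term "machine learning", while a TextField run through an analyzer
                #indexes the two terms "machine" and "learning"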

                do_count = row[DO_COUNT] if row[DO_COUNT] is not None else 0
                wish_count = row[WISH_COUNT] if row[WISH_COUNT] is not None else 0

                #fields which should not be analyzed
                doc.add(FloatField("rating_average",float(row[RATING_AVERAGE]),Field.Store.YES))
                doc.add(FloatField("rating_stars", float(row[RATING_STARS]), Field.Store.YES))
                doc.add(IntField("reviews_count", int(row[REVIEWS_COUNT]), Field.Store.YES))
                #doc.add(FloatField("year", float(row[YEAR]), Field.Store.YES).setBoost(boost))
                doc.add(IntField("collect_count", int(row[COLLECT_COUNT]), Field.Store.YES))
                doc.add(IntField("do_count", int(do_count), Field.Store.YES))
                doc.add(IntField("wish_count", int(wish_count), Field.Store.YES))
                doc.add(IntField("subject_id", int(row[SUBJECT_ID]), Field.Store.YES))
                doc.add(IntField("comments_count", int(row[COMMENTS_COUNT]), Field.Store.YES))
                doc.add(IntField("ratings_count", int(row[RATINGS_COUNT]), Field.Store.YES))
                doc.add(StringField("image_small", row[IMAGE_SMALL], Field.Store.YES))

                #fields which should be analyzed with WhitespaceAnalyzer
                #attention: don't chain the calls like
                #doc.add(Field("genres",    row[GENRES].replace(delim,' '),    t3).setBoost(boost))
                #setBoost returns nothing, so doc.add gets a null pointer error
                f = Field("countries", row[COUNTRIES].replace(delim,' '), t3)
                f.setBoost(boost)
                doc.add(f)

                #process casts
                raw_casts = row[CASTS].replace(delim,' ')
                f = Field("raw_casts", raw_casts , t1)
                f.setBoost(boost)
                doc.add(f)

                #replace the '·' separator inside Western names with spaces
                raw_casts = raw_casts.replace('·',' ')
                
                if len(raw_casts.split(' '))<CASTS_LEN:
                    #pad to CASTS_LEN; the average name length is about 4
                    casts = raw_casts + ' ¥¥¥¥'*(CASTS_LEN-len(raw_casts.split(' ')))
                else:
                    casts = raw_casts
                f = Field("casts", casts , t3)
                f.setBoost(boost)
                doc.add(f)

                #process directors
                raw_directors = row[DIRECTORS].replace(delim,' ')
                f = Field("raw_directors",raw_directors, t1)
                f.setBoost(boost)
                doc.add(f)

                #replace the '·' separator inside Western names with spaces
                raw_directors = raw_directors.replace('·',' ')

                if len(raw_directors.split(' '))<DIRECTORS_LEN:
                    #pad to DIRECTORS_LEN; the average name length is about 4
                    directors = raw_directors + ' ¥¥¥¥'*(DIRECTORS_LEN-len(raw_directors.split(' ')))
                else:
                    directors = raw_directors
                f = Field("directors", directors, t3)
                f.setBoost(boost)
                doc.add(f)

                Field("genres",    row[GENRES].replace(delim,' '),    t3)
                f.setBoost(boost)
                doc.add(f)

                Field("subtype",   row[SUBTYPE].replace(delim,' '),   t3)
                f.setBoost(boost)
                doc.add(f)

                #wrong approach: IndexableField has no setBoost method
                # fieldList = doc.getFields()  # a java 'List', not a python 'list', so it cannot be indexed
                # for eachField in fieldList:
                #     eachField.setBoost(boost)


                #store the raw user_tags string; reRank needs it later:
                doc.add(StringField("raw_user_tags",row[USER_TAGS],Field.Store.YES))
                doc.add(StringField("raw_others_like",row[OTHERS_LIKE],Field.Store.YES))
                

                user_tags_str = ''
                others_like_str = ''
                tags_len = 0
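                #note: repeating a tag name in proportion to its count inflates that
                #term's frequency (and hence its tf score) in the field, while the
                #'¥' padding below keeps the field length roughly constant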
                

                if row[USER_TAGS]!='':
                    user_tags_list = row[USER_TAGS].split(delim)
                    for tag_pair in user_tags_list:
                        if tag_pair!='': #the string ends with '¥', so split leaves a trailing empty element
                            #print 'tag_pair'+tag_pair+'hhe'
                            tag_name = tag_pair.split(delim_uo)[0]+' ' # don't forget this space!!
                            tag_num = tag_pair.split(delim_uo)[1]
                            tag_num_processed = int(int(tag_num)/TAG_SPAN)+1 #minimum is 1
                            user_tags_str = user_tags_str +' '+ tag_name * tag_num_processed
                            tags_len = tags_len + tag_num_processed #running total of emitted words


                if tags_len<TAGS_AVER_LEN:
                    #pad the tags; 3 is roughly the average length, hence ' ¥¥¥'
                    user_tags_str = user_tags_str +' ¥¥¥'*(TAGS_AVER_LEN - tags_len)


                if row[OTHERS_LIKE]!='':
                    for like_pair in row[OTHERS_LIKE].split(delim):
                        if like_pair!='':
                            others_like_str = others_like_str +' '+like_pair.split(delim_uo)[1]


                #start processing adjs
                if row[ADJS] is not None:
                    raw_adjs = row[ADJS][:-1]

                    adjs_str = ''
                    adjs_len = 0
                    if row[ADJS] != '' and row[ADJS] != '\n':
                        #pairs like '重要=4.0,特殊=4.0' (adjective=weight)
                        adjs_str = row[ADJS]
                        adjs_list = adjs_str.split(',')
                        for adj_pair in adjs_list:
                            #print 'adj_pair:'+adj_pair+'hhe'
                            adj_name = adj_pair.split('=')[0]
                            adj_num = adj_pair.split('=')[1]

                            #strip the trailing newline, then convert to int
                            if adj_num[-1] == '\n':
                                adj_num = adj_num[0:-1]
                            adj_num = int(float(adj_num))

                            add_adj=''
                            # #synonyms
                            # adj_name_bro = searchDictValue(adjMap,adj_name)
                            # if adj_name_bro == -1: #-1 means no synonym was found, so add nothing
                            #     add_adj = ''
                            # else:
                            #     add_adj = (adj_name_bro+' ')*adj_num
                            #     raw_adjs = raw_adjs + ',' + adj_name_bro+'='+str(adj_num)
                                
                            adjs_str = adjs_str + ' ' + (adj_name+' ') * adj_num +add_adj
                            adjs_len = adjs_len + adj_num #running total of adj words

                    #print raw_adjs
                    doc.add(StringField("raw_adjs",raw_adjs,Field.Store.YES))

                    if adjs_len<ADJS_AVER_LEN:
                        #pad adjs_str; 2 is roughly the average length, hence ' ¥¥'
                        adjs_str = adjs_str +' ¥¥'*(ADJS_AVER_LEN - adjs_len)

                    f = Field("adjs", adjs_str, t3)
                    f.setBoost(boost)
                    doc.add(f)

                f = Field("user_tags", user_tags_str, t3)
                f.setBoost(boost)
                doc.add(f)

                f = Field("others_like", others_like_str, t3)
                f.setBoost(boost)
                doc.add(f)



                #fields which should be analyzed with good analyzer
                f = Field("title", row[TITLE], t3)                
                f.setBoost(boost)
                doc.add(f)

                f = Field("original_title", row[ORIGINAL_TITLE], t3)
                f.setBoost(boost)
                doc.add(f)

                f = Field("summary_segmentation", row[SUMMARY_SEGMENTATION], t3)
                f.setBoost(boost)
                doc.add(f)

                f = Field("aka", row[AKA], t2)
                f.setBoost(boost)
                doc.add(f)

                if len(summary) > 0:
                    print subject_id + '--->:\n    ' + row[TITLE]
                    try:
                        summary_unicoded = unicode(summary, 'utf-8') #test the encoding 
                    except Exception,e:
                        print "Decode Failed: ", e
                    f = Field('summary', summary, t2)
                    f.setBoost(boost)
                    doc.add(f)
                else:
                    print "warning:\n" + subject_id +'---> No content!'
                print 'boosting:' + str(boost)

                #for debug
                if boost>upper:
                    print boostProb
                    print maxDict
                    
                    exit(0)

                writer.addDocument(doc)
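
A minimal driver sketch for indexTable above (assuming PyLucene 4.x, to match the setIndexed/setBoost calls; the index path, the MovieIndexer class name, and the analyzer choice are placeholders -- the comments in indexTable suggest a WhitespaceAnalyzer):

import lucene
from java.io import File
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

lucene.initVM()
directory = SimpleFSDirectory(File('douban_index'))   # placeholder path
analyzer = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
config = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
writer = IndexWriter(directory, config)

indexer = MovieIndexer()   # hypothetical owner of indexTable
indexer.indexTable(writer)
writer.commit()
writer.close()
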
    def testSimple(self):
        writer = self.getWriter(analyzer=SimpleAnalyzer())

        doc = Document()
        field = Field("foo", "", TextField.TYPE_NOT_STORED)
        doc.add(field)

        dvField = FloatDocValuesField("foo_boost", 0.0)
        doc.add(dvField)

        field2 = Field("bar", "", TextField.TYPE_NOT_STORED)
        doc.add(field2)

        field.setStringValue("quick brown fox")
        field2.setStringValue("quick brown fox")
        dvField.setFloatValue(2.0)  # boost x2
        writer.addDocument(doc)

        field.setStringValue("jumps over lazy brown dog")
        field2.setStringValue("jumps over lazy brown dog")
        dvField.setFloatValue(4.0)  # boost x4
        writer.addDocument(doc)

        reader = writer.getReader()
        writer.close()

        # no boosting
        searcher1 = self.getSearcher(reader=reader)
        base = searcher1.getSimilarity(True)

        # boosting
        searcher2 = self.getSearcher(reader=reader)

        class _similarity(PythonPerFieldSimilarityWrapper):

            def __init__(_self, base):
                super(_similarity, _self).__init__()
                _self.base = base
                _self.fooSim = BoostingSimilarity(base, "foo_boost")

            def get(_self, field):
                return _self.fooSim if "foo" == field else _self.base

        searcher2.setSimilarity(_similarity(base))

        # in this case, we searched on field "foo". first document should have
        # 2x the score.
        tq = TermQuery(Term("foo", "quick"))
        noboost = searcher1.search(tq, 10)
        boost = searcher2.search(tq, 10)

        self.assertEqual(1, noboost.totalHits)
        self.assertEqual(1, boost.totalHits)

        self.assertAlmostEqual(
            boost.scoreDocs[0].score, noboost.scoreDocs[0].score * 2.0,
            delta=SCORE_EPSILON)

        # this query matches only the second document, which should have 4x
        # the score.
        tq = TermQuery(Term("foo", "jumps"))
        noboost = searcher1.search(tq, 10)
        boost = searcher2.search(tq, 10)
        self.assertEqual(1, noboost.totalHits)
        self.assertEqual(1, boost.totalHits)

        self.assertAlmostEqual(
            boost.scoreDocs[0].score, noboost.scoreDocs[0].score * 4.0,
            delta=SCORE_EPSILON)

        # search on field bar just for kicks; nothing should happen, since we
        # set up our sim provider to only use foo_boost for field foo.
        tq = TermQuery(Term("bar", "quick"))
        noboost = searcher1.search(tq, 10)
        boost = searcher2.search(tq, 10)
        self.assertEqual(1, noboost.totalHits)
        self.assertEqual(1, boost.totalHits)

        self.assertAlmostEqual(
            boost.scoreDocs[0].score, noboost.scoreDocs[0].score,
            delta=SCORE_EPSILON)

        reader.close()