Example #1
    def indexer(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        def replacer(text):
            chars = '\\`*_{}[]()>#+-.!$‘'
            for c in chars:
                if c in text:
                    text = text.replace(c, ' ')
            return text

        for root, dirnames, filenames in os.walk(root):
            i = 0
            for filename in filenames:
                i += 1
                with open(os.path.join(root, filename)) as f:
                    for line in f.readlines():
                        line = line.split(' ', 2)
                        docname = line[0] + ' ' + line[1]
                        name = replacer(line[0])
                        contents = line[2]
                        doc = Document()
                        doc.add(Field('docname', docname, t1))
                        doc.add(Field('name', name, t1))
                        doc.add(Field('contents', contents, t1))
                        writer.addDocument(doc)
                print('File %d done indexing' % i)
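Most of these examples expect an already-configured writer. For reference, a minimal sketch of the PyLucene boilerplate that would produce one (the index path and analyzer choice are assumptions, following the modern-API pattern of Example #17 below):

    # Sketch only: set up an IndexWriter to pass into the indexer methods.
    # "index/" and StandardAnalyzer are placeholder choices.
    import lucene
    from java.nio.file import Paths
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.index import IndexWriter, IndexWriterConfig
    from org.apache.lucene.store import SimpleFSDirectory

    lucene.initVM()
    directory = SimpleFSDirectory(Paths.get("index/"))
    writer = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))
    # ... run one of the indexer methods, then:
    writer.commit()
    writer.close()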
Example #2
    def indexsents(self, sentences, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for i, sent in enumerate(sentences):
            #print "adding",i, sent
            try:
                root = os.getcwd()
                #contents = unicode(sent, 'iso-8859-1')
                doc = Document()
                doc.add(Field("name", str(i), t1))
                doc.add(Field("path", root, t1))
                if len(sent) > 0:
                    doc.add(Field("contents", sent.lower(), t2))
                else:
                    print "warning: no content in %s" % str(i)
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexsents:", e)
Example #3
    def build_index(self, dict_data):
        print("loading data...")
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for k, v in dict_data.items():
            doc = Document()
            doc.add(Field("id", k, t1))
            doc.add(Field("content", v, t2))
            self.writer.addDocument(doc)

        ticker = Ticker()
        print("commit index")
        threading.Thread(target=ticker.run).start()
        self.writer.commit()
        self.writer.close()
        ticker.tick = False
        print("done")
Example #4
    def build(self, index):

        writer = self.getWriter(directory=index.index,
                                analyzer=SimpleAnalyzer(
                                    Version.LUCENE_CURRENT))

        seed(101)
        for d in range(self.minId, self.maxId + 1):
            doc = Document()
            doc.add(Field("id", self.pad(d), StringField.TYPE_STORED))
            if index.allowNegativeRandomInts:
                r = randint(~self.MAX_INT, self.MAX_INT)
            else:
                r = randint(0, self.MAX_INT)

            if index.maxR < r:
                index.maxR = r

            if r < index.minR:
                index.minR = r

            doc.add(Field("rand", self.pad(r), StringField.TYPE_STORED))
            doc.add(Field("body", "body", StringField.TYPE_STORED))
            writer.addDocument(doc)

        writer.commit()
        writer.close()
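The pad helper used here is not shown; for the stored "rand" and "id" strings to support range comparisons, it has to zero-pad integers so lexicographic order matches numeric order. A hypothetical version (the width is an assumption, and negative values would additionally need a bias to sort correctly):

    # Hypothetical pad(): fixed-width zero padding so that string order on
    # the stored field agrees with numeric order, e.g. pad(42) -> '000000000042'.
    def pad(n, width=12):
        return str(n).zfill(width)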
Example #5
def indexMovie(movie):
    doc = Document()
    doc.add(Field('id', str(movie), StringField.TYPE_STORED))
    at_least_one_field = False

    maybe_tags = movies_tags.query('item == @movie')
    if not maybe_tags.empty:
        tags = maybe_tags[['tags']].values.flatten()[0]
        doc.add(Field('tags', tags, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    maybe_description = movies_descriptions.query('item == @movie')
    if not maybe_description.empty:
        description = maybe_description[['description']].values.flatten()[0]
        doc.add(Field('description', description, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    maybe_genres = movies_genres.query('item == @movie')
    if not maybe_genres.empty:
        genres = maybe_genres[['genres']].values.flatten()[0]
        doc.add(Field('genres', genres, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    if at_least_one_field:
        writer.addDocument(doc)
Example #6
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        inFile = open(str(args["inputFile"]))
        indexName = inFile.readline()
        while (indexName != ''):
            print "adding", indexName
            doc = Document()
            doc.add(Field("name", indexName, t1))
            #doc.add(Field("path", root, t1))
            text = inFile.readline()
            if (len(text) > 0):
                print("contents: %s\n" % text)
                doc.add(Field("contents", text, t2))
            else:
                print "warning: no content in %s" % indexName
            indexName = inFile.readline()
            writer.addDocument(doc)
        inFile.close()
Example #7
def indexDictionary(d, writer):
    for k, v in d.items():
        doc = Document()
        doc.add(Field('filename', k, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', v, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    return writer.numDocs()
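For completeness, a sketch of the read side these indexers feed. The field name follows Example #7; the package paths are the modern PyLucene ones, and the analyzer and query text are placeholders:

    # Sketch: search the "content" field written above.
    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.search import IndexSearcher

    reader = DirectoryReader.open(directory)  # same directory the writer used
    searcher = IndexSearcher(reader)
    query = QueryParser("content", StandardAnalyzer()).parse("some query")
    for score_doc in searcher.search(query, 10).scoreDocs:
        hit = searcher.doc(score_doc.doc)
        print(hit.get("filename"), score_doc.score)
    reader.close()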
Example #8
def wikipedia_indexer(storage, wikipedia_file):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open(wikipedia_file, encoding='utf-8')

    for i, line in enumerate(f):
        text = line.strip().split('\t')
        title = text[0]
        if 'disambigu' in text[0] or len(text) < 2:
            continue
        text = text[1]
        doc = Document()
        doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
        if writer.numDocs() % 1000 == 0:
            print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #9
def addDoc(w, data):
    doc = Document()
    for field in data:
        value, type = data[field][0], data[field][1]
        '''
        if type!='INTEGER_STORED':
           #print ('field=%s  len=%d'%(field,len(value)))
           print ('field=%s  value=%s'%(field,value))
        else:
           print ('field=%s  value=%d'%(field,value))
        '''

        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    try:
        w.addDocument(doc)
    except Exception:
        #print ('error cat=%s'%(data['category'][0]))
        print('-----------------------------------')
        for field in data:
            value, type = data[field][0], data[field][1]
            print('field=%s\nvalue=%s' % (field, str(value)))
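CUSTOM_FIELD_TEXT and CUSTOM_FIELD_TEXT_NOT_STORED are module-level constants that this snippet does not include. Given how the surrounding examples build FieldType objects, they presumably look something like the following (the exact index options are an assumption):

    # Hypothetical definitions of the custom field types addDoc() expects.
    from org.apache.lucene.document import FieldType
    from org.apache.lucene.index import IndexOptions

    CUSTOM_FIELD_TEXT = FieldType()
    CUSTOM_FIELD_TEXT.setStored(True)
    CUSTOM_FIELD_TEXT.setTokenized(True)
    CUSTOM_FIELD_TEXT.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    CUSTOM_FIELD_TEXT_NOT_STORED = FieldType()
    CUSTOM_FIELD_TEXT_NOT_STORED.setStored(False)
    CUSTOM_FIELD_TEXT_NOT_STORED.setTokenized(True)
    CUSTOM_FIELD_TEXT_NOT_STORED.setIndexOptions(
        IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)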
Example #10
    def add_doc(self, doc_id, title, txt, add_terms):

        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            if prm.top_tfidf > 0:
                words_idx = []
                words, _ = utils.top_tfidf(txt.lower(), self.idf,
                                           prm.top_tfidf, prm.min_term_freq)

                if len(words) == 0:
                    words.append('unk')

                for w in words:
                    if w in self.vocab:
                        words_idx.append(self.vocab[w])
                    else:
                        words_idx.append(-1)  # unknown words.

            else:
                txt_ = txt.lower()
                words_idx, words = utils.text2idx2([txt_], self.vocab,
                                                   prm.max_terms_per_doc)
                words_idx = words_idx[0]
                words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)
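The field types self.t1, self.t2 and self.t3 are initialized elsewhere in this class. By analogy with the other examples, they are probably defined along these lines (an assumption, not the original code):

    # Hypothetical field types for Example #10: t1 for stored identifiers,
    # t2 for searchable text, t3 for stored-but-unindexed term metadata.
    self.t1 = FieldType()
    self.t1.setStored(True)
    self.t1.setTokenized(False)
    self.t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    self.t2 = FieldType()
    self.t2.setStored(False)
    self.t2.setTokenized(True)
    self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    self.t3 = FieldType()
    self.t3.setStored(True)
    self.t3.setTokenized(False)
    self.t3.setIndexOptions(IndexOptions.NONE)  # stored only, never searched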
Example #11
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setStoreTermVectors(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPositions(True)
        t2.setIndexOptions(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

        file_path = root + 'r52-train-all-terms.txt'
        fd = open(file_path)
        contents = fd.readlines()
        fd.close()
        contents_list = [x.strip() for x in contents]
        for i in range(len(contents_list)):
            try:
                [topic, content] = contents_list[i].split('\t')
                doc = Document()
                doc.add(Field("id", str(i), t1))
                doc.add(Field("topic", topic, t1))
                doc.add(Field("contents", content, t2))
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs:", e)
Example #12
    def indexsents(self, sentences, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for i, sent in enumerate(sentences):
            # print 'adding',i, sent
            try:
                root = os.getcwd()
                # contents = unicode(sent, 'iso-8859-1')
                doc = Document()
                doc.add(Field('name', str(i), t1))
                doc.add(Field('path', root, t1))

                if len(sent) > 0:
                    doc.add(Field('contents', sent.lower(), t2))
                else:
                    print('warning: no content in %s' % str(i))

                writer.addDocument(doc)
            except Exception as e:
                print('Failed in indexsents:', e)

        writer.commit()
        writer.close()
Example #13
    def publish_services(self, service_list):
        transformer = WSDLTransformer()
        current_document = 1
        indexDir = SimpleFSDirectory(File("index/"))
        writerConfig = IndexWriterConfig(
            Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT))
        writerConfig.setSimilarity(BM25Similarity())
        index_writer = IndexWriter(indexDir, writerConfig)
        for wsdl in service_list:
            if self._document_expansion:
                #bag_of_words = ' '.join(self._preprocessor(self._semantic_transformer.transform(transformer.transform(wsdl))))
                bag_of_words = ' '.join(
                    self._semantic_transformer.transform(
                        transformer.transform(wsdl)))
            else:
                #bag_of_words = ' '.join(self._preprocessor(transformer.transform(wsdl)))
                bag_of_words = ' '.join(transformer.transform(wsdl))
            doc = Document()
            doc.add(
                Field("content", bag_of_words, Field.Store.YES,
                      Field.Index.ANALYZED))
            doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
            index_writer.addDocument(doc)
            current_document += 1
        index_writer.close()
Example #14
    def index_image(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        with open(os.path.join(root, "index.txt"), mode="r",
                  encoding="utf8") as index:
            count = 0
            for line in index:
                try:
                    image_url, content = line.strip().split()[:2]
                except ValueError as e:
                    print(e)
                    continue
                doc = Document()
                doc.add(Field("raw_content", content, t1))
                content = " ".join(
                    word for word in jieba.cut_for_search(content)
                    if word.strip() and word not in self.stop_words)

                doc.add(Field("url", image_url, t1))
                doc.add(Field("content", content, t2))
                writer.addDocument(doc)
                count += 1
                print("\r", count, end="", sep="")
            print("\n{count} image(s) added.".format(count=count))
Example #15
    def indexDocs(self, sourceDir, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for sourceDir, dirnames, filenames in os.walk(sourceDir):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print(filename)
                try:
                    path = os.path.join(sourceDir, filename)
                    file = open(path, 'r', encoding="utf-8")
                    contents = file.read()
                    #contents = str(filecontent, 'utf-8')
                    #contents = filecontent.encode('utf-8')
                    #print('path', path, len(contents))
                    doc = Document()
                    doc.add(Field("name", filename, t1))  # filename (title)
                    #doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field(queryField, contents, t2))  # content
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
                    file.close()
                except NameError:
                    print("Failed in indexDocs:")
Example #16
    def addDocument(self, writer, new_doc, metadata, fields_to_process,
                    bow_info):
        """
            Add a document to the index. Does this using direct Lucene access.

            :param new_doc: dict of fields with values
            :type new_doc:dict
            :param metadata: ditto
            :type metadata:dict
            :param fields_to_process: only add these fields from the doc dict
            :type fields_to_process:list
        """
        doc = Document()
        total_numTerms = bow_info["total_numterms"]
        # each BOW now comes with its field
        for field in fields_to_process:
            field_object = Field(field, new_doc[field], Field.Store.NO,
                                 Field.Index.ANALYZED, Field.TermVector.YES)
            ##            boost=math.sqrt(numTerms[field]) / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
            boost = 1 / float(
                math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
            field_object.setBoost(float(boost))
            doc.add(field_object)

        json_metadata = json.dumps(metadata)
        doc.add(Field("guid", guid, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(
            Field("bow_info", json.dumps(bow_info), Field.Store.YES,
                  Field.Index.NO))
        doc.add(
            Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
        doc.add(
            Field("year_from", metadata["year"], Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)
Example #17
def build_index(document_path, dir_path):
    lucene.initVM()
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(index_dir, config)

    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(False)

    with open(document_path) as input_file:
        for line in input_file:
            segs = line.strip().split(" ")
            music_path, music_tags = segs[0], segs[1].split(",")

            document = Document()
            document.add(Field("content", " ".join(music_tags), t1))
            document.add(Field("url", music_path, t2))
            index_writer.addDocument(document)

    index_writer.close()
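Note that t2 never calls setIndexOptions, so it keeps the FieldType default of IndexOptions.NONE: the url field is stored for retrieval alongside hits on content, but is not itself searchable. Reading it back from a hit would look like this (searcher setup as in the search sketch after Example #7):

    # Sketch: fetch the stored url and content for each hit.
    for score_doc in searcher.search(query, 5).scoreDocs:
        stored = searcher.doc(score_doc.doc)
        print(stored.get("url"), stored.get("content"))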
Example #18
def index(indexdir):
  lucene.initVM()
  indexDir = SimpleFSDirectory(File(indexdir))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)

  f = open('data/docid.documento-xml.txt')
  st = PorterStemmer()
  for i, line in enumerate(f.readlines()):
    id, xmltext = line.split('\t')
    xmltext = xmltext.rstrip('\n')
    xmldoc = minidom.parseString(xmltext)
    title = xmldoc.getElementsByTagName("TITLE")
    title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
    authors = xmldoc.getElementsByTagName("AUTHORS")
    authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
    abstract = xmldoc.getElementsByTagName("ABSTRACT")
    abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
    doc = Document()
    doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)
    print "indexed %s docs" % (i+1)

  writer.close()
Example #19
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(False)
        t1.setStored(True)
        t1.setTokenized(False)

        t3 = FieldType()
        t3.setIndexed(True)
        t3.setStored(False)
        t3.setTokenized(True)  # tokenize with the analyzer configured up front; here it splits on whitespace
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        total=0
        file = open(root,"r")
        for line in file.readlines():
            try:
                imgurl, itemurl, content = line.split('\t')
                total += 1
                print(total)
                print("adding", content)
                contents = ' '.join(jieba.cut(content))
                doc = Document()
                doc.add(Field("imgurl", imgurl, t1))
                doc.add(Field("itemurl", itemurl, t1))
                doc.add(Field("title", content, t1))
                doc.add(Field("contents",contents,t3))
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs:", e)
Example #20
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        wikiFile = ZipFile(root, 'r')
        files = wikiFile.namelist()

        i = 0
        for file in files[1:]:
            i += 1
            wiki = wikiFile.open(file, 'r')
            for line in codecs.iterdecode(wiki, 'utf8'):
                normalized = unicodedata.normalize('NFD', line).split(' ', 2)
                if not normalized[1].isdigit():
                    continue
                docname = normalized[0] + ' ' + normalized[1]
                name = re.sub(r'[^a-zA-Z0-9]', ' ', normalized[0])
                contents = normalized[2]
                doc = Document()
                doc.add(Field('docname', docname, t1))
                doc.add(Field('name', name, t1))
                doc.add(Field('contents', contents, t1))
                writer.addDocument(doc)
            print('File %d done indexing' % i, file)
Example #21
def addDoc(w, data):
    doc = Document()
    #print ('----------------------------')
    for field in data:
        value, type = data[field][0], data[field][1]
        '''
        print ('field:%s  type:%s'%(field,type))
        print (value+'\n')
        '''
        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'CUSTOM_FIELD_TEXT_DF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_DF))
        elif type == 'CUSTOM_FIELD_TEXT_BF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_BF))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    w.addDocument(doc)
Example #22
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        index_file = open("index.txt", 'r')
        for line in index_file.readlines():

            try:
                fields = line.strip().split('\t')
                src, filename, tag = fields[0], fields[1], fields[2]
                path = os.path.join(root, filename)

                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                doc.add(Field("src", src, t1))

                if len(tag) > 0:
                    doc.add(Field("tag", tag, t2))
                else:
                    print "warning: no tag in %s" % filename
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs:", e)
Example #23
def index_docs(root, writer):
    # metadata: name and path
    metadata = FieldType()
    metadata.setStored(True)  # as is value
    metadata.setTokenized(False)
    metadata.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    # content: abstract and body
    content_type = FieldType()
    content_type.setStored(True)  # to highlight on search results
    content_type.setTokenized(True)  # tokenize words
    content_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for directory, _, file_names in walk(root):
        for file_name in file_names:
            name, extension = splitext(file_name)
            if extension not in DOC_FORMATS:
                continue  # skip unsupported formats

            file_path = join(directory, file_name)
            print(' ', file_path)

            # Build indexed document
            doc = Document()
            doc.add(Field('name', file_name, metadata))
            doc.add(Field('path', directory, metadata))

            # Read file contents
            content = process(file_path, 'utf-8', method='pdfminer')
            abstract = extract_abstract(content)
            doc.add(Field('content', content, content_type))
            doc.add(Field('abstract', abstract, content_type))

            writer.addDocument(doc)
Example #24
def create_index():

    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print('indexing article', n)
    print "Indexed %d docs from wikipedia (%d docs in index)" % (
        n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()