Python DateTools Examples, lucene.DateTools Python Examples

Example #1

0

Show file

File: TestDataDocumentHandler.py Project: bpgriner01/pylucene

    def indexFile(cls, writer, path, baseDir):
        """Index one ``name=value`` data file as a single document.

        The file at ``path`` is read line by line up to the first blank
        line (or end of file).  Every line is split on its first ``=`` and
        the value is decoded with the ``unicode-escape`` codec.  The
        properties ``isbn``, ``title``, ``author``, ``url``, ``subject``
        and ``pubmonth`` must all be present.

        Args:
            cls: not used in the body.
            writer: object whose ``addDocument(doc)`` receives the document.
            path: path of the data file to read; also stored in the
                "path" field.
            baseDir: prefix of ``path``; ``len(baseDir)`` characters are cut
                from the front of ``os.path.dirname(path)`` to obtain the
                category.

        Raises:
            ValueError: if a non-blank line contains no ``=``.
            KeyError: if one of the required properties is missing.
        """
        props = {}
        # The with-block closes the file even when a malformed line raises.
        with open(path) as propsFile:
            for rawLine in propsFile:
                line = rawLine.strip()
                if not line:
                    # the first blank line ends the property section
                    break
                name, value = line.split("=", 1)
                # str.decode: this code is written for Python 2 byte strings
                props[name] = value.decode("unicode-escape")

        doc = Document()

        # category comes from relative path below the base directory
        category = os.path.dirname(path)[len(baseDir) :]
        if os.path.sep != "/":
            category = category.replace(os.path.sep, "/")

        isbn = props["isbn"]
        title = props["title"]
        author = props["author"]
        url = props["url"]
        subject = props["subject"]
        pubmonth = props["pubmonth"]

        # progress output, one record per indexed file
        print(title.encode("utf8"))
        print(author.encode("utf-8"))
        print(subject.encode("utf-8"))
        print(category.encode("utf-8"))
        print("---------")

        doc.add(Field("isbn", isbn, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("category", category, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        # lower-cased, un-analyzed copy of the title
        doc.add(
            Field(
                "title2",
                title.lower(),
                Field.Store.YES,
                Field.Index.NOT_ANALYZED_NO_NORMS,
                Field.TermVector.WITH_POSITIONS_OFFSETS,
            )
        )

        # split multiple authors into unique field instances
        for authorName in author.split(","):
            doc.add(
                Field(
                    "author",
                    authorName,
                    Field.Store.YES,
                    Field.Index.NOT_ANALYZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS,
                )
            )

        doc.add(Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS))
        doc.add(
            Field("subject", subject, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)
        )
        doc.add(NumericField("pubmonth", Field.Store.YES, True).setIntValue(int(pubmonth)))

        # getTime() is divided by the number of milliseconds in a day and
        # truncated to an int, giving a whole-day count for the same date
        pubDate = DateTools.stringToDate(pubmonth)
        pubDay = int(pubDate.getTime() / (1000 * 3600 * 24.0))
        doc.add(NumericField("pubmonthAsDay").setIntValue(pubDay))

        # catch-all field: title, subject, author and category joined by spaces
        doc.add(
            Field(
                "contents",
                " ".join([title, subject, author, category]),
                Field.Store.NO,
                Field.Index.ANALYZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS,
            )
        )

        doc.add(Field("path", path, Field.Store.YES, Field.Index.NOT_ANALYZED))
        # samplesModified is not defined in this method; it is looked up in
        # the enclosing scope
        doc.add(Field("modified", DateField.dateToString(samplesModified), Field.Store.YES, Field.Index.NOT_ANALYZED))

        writer.addDocument(doc)

Example #2

0

Show file

File: TestDataDocumentHandler.py Project: lauromoraes/pylucene

    def indexFile(cls, writer, path, baseDir):
        """Build a document from a ``name=value`` file and add it to writer.

        Lines of the file at ``path`` are consumed until the first blank
        line or end of file.  Each line is split once on ``=``; the value
        part is decoded with the ``unicode-escape`` codec.  The keys
        ``isbn``, ``title``, ``author``, ``url``, ``subject`` and
        ``pubmonth`` are then looked up and turned into document fields.

        Args:
            cls: not used in the body.
            writer: object providing ``addDocument(doc)``.
            path: data file to read; its value is also stored in the
                "path" field.
            baseDir: prefix of ``path``; its length is the number of
                characters dropped from ``os.path.dirname(path)`` to form
                the category.

        Raises:
            ValueError: if a non-blank line has no ``=`` separator.
            KeyError: if any of the six expected keys is absent.
        """
        props = {}
        # Context manager guarantees the handle is released on any exit,
        # including a ValueError raised by a line without '='.
        with open(path) as source:
            for raw in source:
                line = raw.strip()
                if not line:
                    break  # stop at the first blank line
                name, value = line.split('=', 1)
                # str.decode: this code is written for Python 2 byte strings
                props[name] = value.decode('unicode-escape')

        doc = Document()

        # category comes from relative path below the base directory
        category = os.path.dirname(path)[len(baseDir):]
        if os.path.sep != '/':
            category = category.replace(os.path.sep, '/')

        isbn = props['isbn']
        title = props['title']
        author = props['author']
        url = props['url']
        subject = props['subject']
        pubmonth = props['pubmonth']

        # progress output, one record per indexed file
        print(title.encode('utf8'))
        print(author.encode('utf-8'))
        print(subject.encode('utf-8'))
        print(category.encode('utf-8'))
        print("---------")

        doc.add(Field("isbn", isbn, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(
            Field("category", category, Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        doc.add(
            Field("title", title, Field.Store.YES, Field.Index.ANALYZED,
                  Field.TermVector.WITH_POSITIONS_OFFSETS))
        # lower-cased, un-analyzed copy of the title
        doc.add(
            Field("title2", title.lower(), Field.Store.YES,
                  Field.Index.NOT_ANALYZED_NO_NORMS,
                  Field.TermVector.WITH_POSITIONS_OFFSETS))

        # split multiple authors into unique field instances
        for singleAuthor in author.split(','):
            doc.add(
                Field("author", singleAuthor, Field.Store.YES,
                      Field.Index.NOT_ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))

        doc.add(
            Field("url", url, Field.Store.YES,
                  Field.Index.NOT_ANALYZED_NO_NORMS))
        doc.add(
            Field("subject", subject, Field.Store.NO, Field.Index.ANALYZED,
                  Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(
            NumericField("pubmonth", Field.Store.YES,
                         True).setIntValue(int(pubmonth)))

        # milliseconds from getTime() -> whole days (float division, then
        # truncated by int())
        millisPerDay = 1000 * 3600 * 24.0
        dayNumber = int(DateTools.stringToDate(pubmonth).getTime() / millisPerDay)
        doc.add(NumericField("pubmonthAsDay").setIntValue(dayNumber))

        # catch-all field: title, subject, author and category joined by spaces
        doc.add(
            Field("contents", ' '.join([title, subject, author, category]),
                  Field.Store.NO, Field.Index.ANALYZED,
                  Field.TermVector.WITH_POSITIONS_OFFSETS))

        doc.add(Field("path", path, Field.Store.YES, Field.Index.NOT_ANALYZED))
        # samplesModified is not defined in this method; it is looked up in
        # the enclosing scope
        doc.add(
            Field("modified", DateField.dateToString(samplesModified),
                  Field.Store.YES, Field.Index.NOT_ANALYZED))

        writer.addDocument(doc)

Example #3

0

Show file

File: TestDataDocumentHandler.py Project: qiugen/pylucene-trunk

    def indexFile(cls, writer, path, baseDir):
        """Index one ``name=value`` data file using explicit FieldType objects.

        The file at ``path`` is read up to its first blank line (or end of
        file).  Each line is split on the first ``=`` and the value decoded
        with the ``unicode-escape`` codec.  The properties ``isbn``,
        ``title``, ``author``, ``url``, ``subject`` and ``pubmonth`` are
        required and are turned into fields of one ``Document``, which is
        passed to ``writer.addDocument``.

        Args:
            cls: not used in the body.
            writer: object providing ``addDocument(doc)``.
            path: data file to read; also stored in the "path" field.
            baseDir: prefix of ``path``; ``len(baseDir)`` characters are cut
                from the front of ``os.path.dirname(path)`` to obtain the
                category.

        Raises:
            ValueError: if a non-blank line contains no ``=``.
            KeyError: if one of the required properties is missing.
        """
        props = {}
        # The with-block closes the file even when a malformed line raises.
        with open(path) as propsFile:
            for rawLine in propsFile:
                line = rawLine.strip()
                if not line:
                    # the first blank line ends the property section
                    break
                name, value = line.split('=', 1)
                # str.decode: this code is written for Python 2 byte strings
                props[name] = value.decode('unicode-escape')

        doc = Document()

        # category comes from relative path below the base directory
        category = os.path.dirname(path)[len(baseDir):]
        if os.path.sep != '/':
            category = category.replace(os.path.sep, '/')

        isbn = props['isbn']
        title = props['title']
        author = props['author']
        url = props['url']
        subject = props['subject']
        pubmonth = props['pubmonth']

        # progress output, one record per indexed file
        print(title.encode('utf8'))
        print(author.encode('utf-8'))
        print(subject.encode('utf-8'))
        print(category.encode('utf-8'))
        print("---------")

        doc.add(Field("isbn", isbn, StringField.TYPE_STORED))
        doc.add(Field("category", category, StringField.TYPE_STORED))

        # NOTE: the FieldType objects below are rebuilt on every call; they
        # could be initialized once and re-used.
        # NOTE(review): none of them calls setStoreTermVectors(); confirm
        # that the positions/offsets settings take effect without it.

        # title: indexed, tokenized, stored, term-vector positions + offsets
        titleType = FieldType()
        titleType.setIndexed(True)
        titleType.setTokenized(True)
        titleType.setStored(True)
        titleType.setStoreTermVectorPositions(True)
        titleType.setStoreTermVectorOffsets(True)
        titleType.freeze()
        doc.add(Field("title", title, titleType))

        # title2: lower-cased copy, starts from StringField.TYPE_STORED,
        # not tokenized, norms omitted
        title2Type = FieldType(StringField.TYPE_STORED)
        title2Type.setIndexed(True)
        title2Type.setTokenized(False)
        title2Type.setOmitNorms(True)
        title2Type.setStoreTermVectorPositions(True)
        title2Type.setStoreTermVectorOffsets(True)
        doc.add(Field("title2", title.lower(), title2Type))

        # split multiple authors into unique field instances
        authorType = FieldType()
        authorType.setIndexed(True)
        authorType.setTokenized(False)
        authorType.setStored(True)
        authorType.setStoreTermVectorPositions(True)
        authorType.setStoreTermVectorOffsets(True)
        for authorName in author.split(','):
            doc.add(Field("author", authorName, authorType))

        # url: indexed as a single token, stored, norms omitted
        urlType = FieldType()
        urlType.setIndexed(True)
        urlType.setTokenized(False)
        urlType.setStored(True)
        urlType.setOmitNorms(True)
        doc.add(Field("url", url, urlType))

        # "subject" and "contents" use the same settings (tokenized, not
        # stored, positions + offsets), so one FieldType serves both fields.
        textType = FieldType()
        textType.setIndexed(True)
        textType.setTokenized(True)
        textType.setStored(False)
        textType.setStoreTermVectorPositions(True)
        textType.setStoreTermVectorOffsets(True)
        doc.add(Field("subject", subject, textType))

        doc.add(IntField("pubmonth", int(pubmonth), Field.Store.YES))

        # getTime() is divided by the number of milliseconds in a day and
        # truncated to an int, giving a whole-day count for the same date
        pubDate = DateTools.stringToDate(pubmonth)
        pubDay = int(pubDate.getTime() / (1000 * 3600 * 24.0))
        doc.add(IntField("pubmonthAsDay", pubDay, IntField.TYPE_NOT_STORED))

        # catch-all field: title, subject, author and category joined by spaces
        doc.add(Field("contents", ' '.join([title, subject, author, category]),
                      textType))

        doc.add(Field("path", path, StringField.TYPE_STORED))

        # samplesModified is not defined in this method; it is looked up in
        # the enclosing scope
        modified = DateTools.dateToString(samplesModified,
                                          DateTools.Resolution.MILLISECOND)
        doc.add(Field("modified", modified, StringField.TYPE_STORED))

        writer.addDocument(doc)