def _createNoTermsFrequencyFieldType():
    """Build a frozen FieldType that indexes documents only (no term frequencies or positions)."""
    f = FieldType()
    f.setIndexed(True)
    f.setTokenized(True)
    f.setOmitNorms(True)
    f.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
    f.freeze()
    return f
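# Hypothetical usage sketch for the DOCS_ONLY field type above: it suits fields
# that are only matched on, never scored by term frequency. The 'writer', field
# name, and value below are illustrative assumptions, and the snippet keeps the
# older Lucene 4.x field API that the function itself relies on.
NO_TF_TYPE = _createNoTermsFrequencyFieldType()

doc = Document()
doc.add(Field("category", "french_pamphlet", NO_TF_TYPE))  # assumed field name/value
writer.addDocument(doc)  # 'writer' is an already-open IndexWriter (assumed)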
def main():
    INDEX_DIR = "full_index1"
    DOCUMENTS_DIR = "/media/joseph/Windows8_OS/Users/Joseph/AppData/Local/lxss/home/jwymbs23/data_science_projects/french_pamphlets/frc-data-master/OCR_text/"

    # Initialize lucene and the JVM
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print("lucene version is:", lucene.VERSION, '\n')

    store = getStore(INDEX_DIR)
    analyzer = getAnalyzer()
    writer = getWriter(store=store, analyzer=analyzer, create=True)

    # Get the list of documents
    doc_list = getDoclist(DOCUMENTS_DIR)

    # Field type for the term-vector field
    ftype = FieldType()
    ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    ftype.setTokenized(True)
    ftype.setStoreTermVectors(True)
    ftype.freeze()

    for cd, doc_name in enumerate(doc_list):
        if not cd % 1000:
            print(cd, '--', len(doc_list))
        with open(doc_name, 'r') as d:
            doc_lines = d.readlines()
        full_text = ''.join([i.strip() for i in doc_lines]).lower()
        try:
            # Create a document that will be added to the index
            doc = Document()
            # Add fields to this document
            # (could process fname here instead of in the dataframe later)
            doc.add(Field("identifier", doc_name.split('/')[-1], TextField.TYPE_STORED))
            doc.add(Field("vectext", full_text, ftype))
            doc.add(Field("text", full_text, TextField.TYPE_STORED))
            # Add the document to the index
            writer.addDocument(doc)
        except Exception as e:
            print("Failed in indexDocs: ", doc_name, e)

    #writer.optimize()
    writer.commit()
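# The getStore/getAnalyzer/getWriter/getDoclist helpers are not shown above.
# The sketch below is a plausible, assumed implementation for PyLucene with
# Lucene 6+ module paths; it is not the author's original code.
import glob
import os

from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory


def getStore(index_dir):
    # Open (or create) an on-disk index directory.
    return SimpleFSDirectory(Paths.get(index_dir))


def getAnalyzer():
    # StandardAnalyzer lowercases and tokenizes on word boundaries.
    return StandardAnalyzer()


def getWriter(store, analyzer, create=False):
    # CREATE rebuilds the index from scratch; the default appends to it.
    config = IndexWriterConfig(analyzer)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(store, config)


def getDoclist(documents_dir):
    # One OCR'd pamphlet per .txt file (assumed layout of DOCUMENTS_DIR).
    return sorted(glob.glob(os.path.join(documents_dir, '*.txt')))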
def setUp(self):
    super(Test_Bug1842, self).setUp()
    self.analyzer = StandardAnalyzer()

    w1 = self.getWriter(analyzer=self.analyzer)
    doc1 = Document()

    ftype = FieldType()
    ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    ftype.setTokenized(True)
    ftype.setStoreTermVectors(True)
    ftype.freeze()

    doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
    doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))
    w1.addDocument(doc1)
    w1.close()
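# A possible companion check for the setup above (a sketch, not part of the
# original test): it assumes the test base class exposes a getReader() helper
# analogous to the getWriter() call used in setUp.
def testTermVectorStored(self):
    reader = self.getReader()  # assumed helper on the PyLucene test base class
    # The "all" field was indexed with term vectors, so document 0 should
    # expose one; getTermVector returns None when no vector was stored.
    termVector = reader.getTermVector(0, "all")
    self.assertIsNotNone(termVector)
    reader.close()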
class Indexer:
    """
    Indexer Class
    """
    (NAME, CONTENT, DATE, URL, TAGS, TIMESTAMP) = \
        ("name", "content", "date", "url", "tags", "timestamp")

    def __init__(self, indexDir="", debug=False, verbose=False):
        """
        :Parameters:
        - `indexDir`: Path where the Index will be saved. (Str)
        - `debug`: Create the Index in RAM Memory (indexDir will be ignored). (Boolean)
        - `verbose`: Provide additional information about the initialization process. (Boolean)
        """
        self.__verbose = verbose
        if indexDir != "":
            INDEX_DIR = indexDir
        else:
            INDEX_DIR = os.path.dirname(
                os.path.realpath(__file__)) + "/luceneIndex"

        if not os.path.exists(INDEX_DIR):
            os.makedirs(INDEX_DIR)
            self.__boAppend = False
        else:
            self.__boAppend = True

        # Initialize lucene and JVM
        lucene.initVM()

        # Get index storage
        if debug:
            # Store the index in memory
            self.__indexDir = RAMDirectory()
            self.__boAppend = False
            INDEX_DIR = "RAM Memory"
        else:
            # Store the index on disk
            self.__indexDir = SimpleFSDirectory(Paths.get(INDEX_DIR))

        # Create Content FieldType
        self.__contentType = FieldType()
        self.__contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.__contentType.setTokenized(True)
        self.__contentType.setStored(True)
        self.__contentType.setStoreTermVectors(True)
        self.__contentType.setStoreTermVectorPositions(True)
        self.__contentType.freeze()

        # Get the Analyzer
        self.__analyzer = StandardAnalyzer(
            StandardAnalyzer.ENGLISH_STOP_WORDS_SET)

        # Print Indexer Information
        print("Lucene version is: ", lucene.VERSION)
        print("Index Directory: ", INDEX_DIR)

    def __del__(self):
        self.__indexDir.close()

    ##################################################
    # Private Methods
    ##################################################
    @staticmethod
    def __getTimestamp(dateTime):
        """
        Converts the document's date to an integer timestamp

        :Parameters:
        - `dateTime`: Document's date (Str)

        :Returns:
        - Date timestamp (Int)
        """
        tm = time.strptime(dateTime, '%Y-%m-%dT%H:%M:%SZ')
        sTime = "{0:0>4}{1:0>2}{2:0>2}{3:0>2}{4:0>2}{5:0>2}".format(
            tm.tm_year, tm.tm_mon, tm.tm_mday,
            tm.tm_hour, tm.tm_min, tm.tm_sec)
        return int(sTime)

    @staticmethod
    def __getDateTime(timeStamp):
        """
        Converts the document's timestamp to date

        :Parameters:
        - `timeStamp`: Document's timestamp

        :Returns:
        - Date (Str)
        """
        date = datetime.datetime(year=int(timeStamp[0:4]),
                                 month=int(timeStamp[4:6]),
                                 day=int(timeStamp[6:8]),
                                 hour=int(timeStamp[8:10]),
                                 minute=int(timeStamp[10:12]),
                                 second=int(timeStamp[12:14]))
        return date.strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def __qualifyTags(tags):
        """
        Creates the qualify string for tags

        :Parameters:
        - `tags`: List of document's tags

        :Return:
        - Qualify Tags (Str)
        """
        sTags = ""
        for tag in tags:
            sTags += tag + '|'
        return sTags[:-1]

    @staticmethod
    def __scatterMatrix(numDocs, freqMtx):
        print("Scattering Frequency Matrix...")
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        matrix = []
        innerMatrix = ['Term']

        # Generate Document Columns
        for docIdx in range(numDocs):
            innerMatrix.append("D{0:0>4}".format(docIdx))
        matrix.append(innerMatrix)

        # Generate Word Rows and Columns
        for word in sorted(freqMtx):
            innerMatrix = []
            innerMatrix.append(word)
            for docIdx in range(numDocs):
                try:
                    termCount = round(freqMtx[word][str(docIdx)], 3)
                    innerMatrix.append(termCount)
                except KeyError:
                    innerMatrix.append(0)
            matrix.append(innerMatrix)
            pB.updateProgress()
        return matrix

    @staticmethod
    def __saveMatrix(numDocs, freqMtx):
        pathMatrix = os.path.dirname(
            os.path.realpath(__file__)) + "/freqMtx.txt"
        fMatrix = open(pathMatrix, 'w')
        print("Saving Frequency Matrix File: ", pathMatrix)
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        # File Generation Start
        print("+========= Frequency Matrix =========+", file=fMatrix)
        print("%20s" % (' '), end=' ', file=fMatrix)
        for docIdx in range(numDocs):
            print("D{0:0>4}".format(docIdx), end=' ', file=fMatrix)
        print(file=fMatrix)

        for word in sorted(freqMtx):
            print("%20s" % (word), end=' ', file=fMatrix)
            for docIdx in range(numDocs):
                try:
                    termCount = freqMtx[word][str(docIdx)]
                    print("%02.03f" % (termCount), end=' ', file=fMatrix)
                except KeyError:
                    print("  0  ", end=' ', file=fMatrix)
            print(file=fMatrix)
            pB.updateProgress()

        # Close File
        fMatrix.close()

    def __stemString(self, stringToStem):
        stemmedTerms = []
        tknStream = self.__analyzer.tokenStream('STEM', stringToStem)
        stemmed = SnowballFilter(tknStream, "English")
        stemmed.reset()
        while stemmed.incrementToken():
            stemmedTerms.append(
                stemmed.getAttribute(CharTermAttribute.class_).toString())
        tknStream.close()
        return stemmedTerms

    @staticmethod
    def __normalize(qVector, freqMtx):
        for term in qVector:
            for docId in freqMtx:
                if (term in freqMtx[docId]) and (freqMtx[docId][term] > qVector[term]):
                    qVector[term] = freqMtx[docId][term]

    @staticmethod
    def __dotProduct(aVector, bVector):
        """
        Calculate Dot Product

        :Parameters:
        - `aVector`: A Vector. (Dict)
        - `bVector`: B Vector. (Dict)

        :Returns:
        - Dot Product. (Int)
        """
        dotProduct = 0
        for term in aVector:
            if term in bVector:
                product = aVector[term] * bVector[term]
                dotProduct += product
        return dotProduct

    @staticmethod
    def __magnitude(vector):
        """
        Calculate Vector Magnitude

        :Parameters:
        - `vector`: Query Vector. (Dict)

        :Returns:
        - Vector Magnitude. (Float)
        """
        # The magnitude of the vector is the square root of the dot product
        # of the vector with itself.
        vectorMagnitude = Indexer.__dotProduct(vector, vector)
        vectorMagnitude = math.sqrt(vectorMagnitude)
        return vectorMagnitude

    ##################################################
    # Public Methods
    ##################################################
    def IndexDocs(self, documents):
        """
        Index documents under the directory

        :Parameters:
        - `documents`: Documents to be indexed (List)
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        for document in documents:
            # Create a document that will be added to the index
            doc = Document()
            # Add fields to this document
            doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES))
            doc.add(
                Field(Indexer.CONTENT, document['content'], self.__contentType))
            doc.add(
                StringField(Indexer.DATE, document['date'], Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES))
            doc.add(
                TextField(Indexer.TAGS, self.__qualifyTags(document['tags']),
                          Field.Store.YES))
            doc.add(
                LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(document['date'])))

            # Add or update the document in the index
            if not self.__boAppend:
                # New index, so we just add the document (no old document can be there):
                if self.__verbose:
                    print("Adding " + document['name'])
                writer.addDocument(doc)
            else:
                # Existing index (an old copy of this document may have been indexed), so
                # we use updateDocument instead to replace the old one matching the exact
                # path, if present:
                if self.__verbose:
                    print("Updating " + document['name'])
                writer.updateDocument(Term(Indexer.NAME, document['name']), doc)

        # Print index information and close writer
        print("Indexed %d documents (%d docs in index)" %
              (len(documents), writer.numDocs()))
        writer.close()

    def Search(self, query, field=NAME, maxResult=1000):
        """
        Search for a document in the Lucene Index

        :Parameters:
        - `query`: Request to be made to the Index (Str).
        - `field`: Field to be consulted by the query (NAME, CONTENT, DATE, URL, TAGS).
        - `maxResult`: Maximum number of results.
        """
        # Get the Index Directory
        reader = DirectoryReader.open(self.__indexDir)
        searcher = IndexSearcher(reader)

        # Create a query
        queryParser = QueryParser(field, self.__analyzer).parse(query)

        # Do a search
        hits = searcher.search(queryParser, maxResult)
        print("Found %d document(s) that matched query '%s':" %
              (hits.totalHits, queryParser))
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            print("Document Nº: %d - Score: %.5f" % (hit.doc, hit.score))
            print("Name: " + doc.get('name'))
            print("Tags: " + doc.get('tags') + "\n")
        reader.close()

    def StemDocument(self, docIdx):
        """
        Return an array of the document's stemmed terms

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx).get(Indexer.CONTENT)
        reader.close()
        return self.__stemString(doc)

    def FreqMatrix(self, scattered=False, byTerms=True, saveMtx=False):
        """
        Generates a Frequency Matrix of the current Index

        :Parameters:
        - `scattered`: Return the matrix as a list of rows (one per term). (Boolean)
        - `byTerms`: Key the matrix by term (True) or by document (False). (Boolean)
        - `saveMtx`: Save the Frequency Matrix to a .txt file. (Boolean)
        """
        freqMtx = {}  # Terms - DocumentID Matrix
        reader = DirectoryReader.open(self.__indexDir)
        numDocs = reader.numDocs()

        print("Generating Frequency Matrix...")
        pB = ProgressBar(numDocs - 1, prefix='Progress:')
        for docIdx in range(numDocs):
            termItr = self.StemDocument(docIdx)
            termSize = len(termItr)
            docStr = '{0}'.format(docIdx)
            termDict = {}
            for termText in termItr:
                if byTerms:
                    # Check if the term exists
                    if termText in freqMtx:
                        # Check if the document exists
                        if docStr in freqMtx[termText]:
                            termCount = int(
                                math.ceil(
                                    ((freqMtx[termText][docStr] * termSize) / 100)))
                            freqMtx[termText].update(
                                {docStr: ((termCount + 1) / termSize) * 100})
                        else:
                            freqMtx[termText].update(
                                {docStr: (1 / termSize) * 100})
                    else:
                        termIdx = {termText: {docStr: (1 / termSize) * 100}}
                        freqMtx.update(termIdx)
                else:
                    # Check if the term exists
                    termText = termText.replace('.', '_')
                    if termText in termDict:
                        termCount = int(
                            math.ceil((termDict[termText] * termSize) / 100))
                        termDict[termText] = ((termCount + 1) / termSize) * 100
                    else:
                        termIdx = {termText: (1 / termSize) * 100}
                        termDict.update(termIdx)
            if not byTerms:
                freqMtx.update({docStr: termDict})
            pB.updateProgress()

        if saveMtx and byTerms:
            self.__saveMatrix(numDocs, freqMtx)

        if scattered and byTerms:
            freqMtx = self.__scatterMatrix(numDocs, freqMtx)

        # Close IndexReader
        reader.close()

        return freqMtx

    def GetSimilarity(self, query, freqMtx):
        """
        Cosine Similarity
        """
        qVector = {}
        qList = self.__stemString(query)
        for stem in qList:
            qVector.update({stem: 0})
        self.__normalize(qVector, freqMtx)

        qList = []
        # Get similarity between query and doc[n]
        for docIdx, dVector in freqMtx.items():
            dP = self.__dotProduct(qVector, dVector)
            qM = self.__magnitude(qVector)
            dM = self.__magnitude(dVector)
            cosSimilarity = dP / (qM * dM)
            qList.append((docIdx, cosSimilarity))

        return sorted(qList, key=lambda similarity: similarity[1], reverse=True)

    def AnalyzeDocument(self, docIdx):
        """
        Runs named entity recognition over the document's content and returns
        a feature collection of the Geo-Political Entities found.

        :Parameters:
        - `docIdx`: Document's index ID (Int).
""" gpeList = {} geolocator = Geocode() reader = DirectoryReader.open(self.__indexDir) doc = reader.document(docIdx) # Load NLTK Data nltkPath = os.path.dirname( os.path.realpath(__file__)) + '/../tools/nltk_data' nltk.data.path.append(nltkPath) # Named Entity Recognition content = doc.get(Indexer.CONTENT) sentences = nltk.sent_tokenize(content) #ProgressBar print("Analazing Document {0}".format(docIdx)) pB = ProgressBar(len(sentences), prefix='Progress:') # Loop over each sentence and tokenize it separately for sentence in sentences: ner = nltk.word_tokenize(sentence) ner = nltk.pos_tag(ner) ner = nltk.ne_chunk(ner) # Get all the Geo-Political Entities for subtrees in list( ner.subtrees( filter=lambda subtree: subtree.label() == 'GPE')): entityName = ' '.join([child[0] for child in subtrees]) if entityName not in gpeList: location = geolocator.GetGPE(entityName) if location: gpeList.update(location) pB.updateProgress() gpeList = geolocator.GetFeatureCollection(gpeList) return gpeList def GetDocField(self, docIdx, field=CONTENT): """ Get the document's field :Parameters: - `docIdx`: Document's index ID (Int). - `field`: Field to retrieve (Str). :Returns: - Document's field. (Str) """ reader = DirectoryReader.open(self.__indexDir) doc = reader.document(docIdx) content = doc.get(field) reader.close() return content