Example #1
0
    def preprocess_one(self, raw_file):
        '''
        Incorporate a single raw document into the existing document set.

        Parses the raw text, registers the document and any previously
        unseen tokens, resizes the tf/idf matrices to the new dimensions,
        fills in the new document's term frequencies, updates document
        frequencies, and recomputes the tf-idf matrix.

        Args:
            raw_file: text (string) of the document to add.
        '''
        logger.info("Preprocessing one...")

        # create document (module-level helper that shares this method's name)
        document = preprocess_one(raw_file)
        if document.identifier in self.documents:
            logger.info("Document already exists.")
            return

        # update documents
        self.documents[document.identifier] = document
        self.docs_no = len(self.documents)
        self.iterative_docs.append(document.identifier)

        # update tokens: assign the next free index to each unseen token
        # (iterate keys only; occurrence counts are not needed here)
        for token in document.bag:
            if token not in self.tokens:
                self.tokens[token] = self.tokens_no
                self.tokens_no += 1

        new_tf_shape = (self.docs_no, self.tokens_no)
        new_idf_shape = (1, self.tokens_no)

        # resize matrices
        tf_lil = resize_tf(self.tf, new_tf_shape)
        idf_lil = resize_idf(self.idf, new_idf_shape)

        # update tf: the new document occupies the last row
        max_freq = max(document.bag.values())
        doc_index = self.docs_no - 1  # loop-invariant; hoisted out of the loop
        for token, freq in document.bag.items():
            tf_lil[doc_index, self.tokens[token]] = tf(freq, max_freq)

        # update idf: one increment per token present in this document.
        # NOTE(review): this accumulates raw document frequency; presumably
        # the inverse/log transform happens elsewhere — confirm.
        for token in document.bag:
            idf_lil[0, self.tokens[token]] += 1

        # convert back to csr (faster multiplication)
        self.tf = tf_lil.tocsr()
        self.idf = idf_lil.tocsr()

        # TODO: multiply only last row
        self.tf_idf = self.tf.multiply(self.idf)
Example #2
0
    def preprocess_one(self, raw_file):
        '''
        Incorporate a single raw document into the existing document set.

        Parses the raw text, registers the document and any previously
        unseen tokens, resizes the tf/idf matrices to the new dimensions,
        fills in the new document's term frequencies, updates document
        frequencies, and recomputes the tf-idf matrix.

        Args:
            raw_file: text (string) of the document to add.
        '''
        logger.info("Preprocessing one...")

        # create document (module-level helper that shares this method's name)
        document = preprocess_one(raw_file)
        if document.identifier in self.documents:
            logger.info("Document already exists.")
            return

        # update documents
        self.documents[document.identifier] = document
        self.docs_no = len(self.documents)
        self.iterative_docs.append(document.identifier)

        # update tokens: assign the next free index to each unseen token
        # (iterate keys only; occurrence counts are not needed here)
        for token in document.bag:
            if token not in self.tokens:
                self.tokens[token] = self.tokens_no
                self.tokens_no += 1

        new_tf_shape = (self.docs_no, self.tokens_no)
        new_idf_shape = (1, self.tokens_no)

        # resize matrices
        tf_lil = resize_tf(self.tf, new_tf_shape)
        idf_lil = resize_idf(self.idf, new_idf_shape)

        # update tf: the new document occupies the last row
        max_freq = max(document.bag.values())
        doc_index = self.docs_no - 1  # loop-invariant; hoisted out of the loop
        for token, freq in document.bag.items():
            tf_lil[doc_index, self.tokens[token]] = tf(freq, max_freq)

        # update idf: one increment per token present in this document.
        # NOTE(review): this accumulates raw document frequency; presumably
        # the inverse/log transform happens elsewhere — confirm.
        for token in document.bag:
            idf_lil[0, self.tokens[token]] += 1

        # convert back to csr (faster multiplication)
        self.tf = tf_lil.tocsr()
        self.idf = idf_lil.tocsr()

        # TODO: multiply only last row
        self.tf_idf = self.tf.multiply(self.idf)
    def preprocess_one(self, raw_file):
        '''
        Incorporate a single raw document into the existing set of documents.

        Parses the raw text, records per-document token occurrence counts
        in the inverted index (docs_bag), and stores the document.

        Args:
            raw_file: text (string) of the document to add.
        '''
        logger.info("Preprocessing one...")

        # create document (module-level helper that shares this method's name)
        document = preprocess_one(raw_file)

        # update docs_bag: token -> {document identifier -> occurrence count};
        # setdefault replaces the get/reassign round-trip with a single lookup
        for token, occurrences in document.bag.items():
            self.docs_bag.setdefault(token, {})[document.identifier] = occurrences

        # add the document to the documents set.
        # NOTE(review): unlike the matrix-based variant, there is no
        # duplicate-identifier guard — re-adding a document overwrites its
        # entry and its docs_bag counts; confirm this is intended.
        self.documents[document.identifier] = document
    def preprocess_one(self, raw_file):
        '''
        Incorporate a single raw document into the existing set of documents.

        Parses the raw text, records per-document token occurrence counts
        in the inverted index (docs_bag), and stores the document.

        Args:
            raw_file: text (string) of the document to add.
        '''
        logger.info("Preprocessing one...")

        # create document (module-level helper that shares this method's name)
        document = preprocess_one(raw_file)

        # update docs_bag: token -> {document identifier -> occurrence count};
        # setdefault replaces the get/reassign round-trip with a single lookup
        for token, occurrences in document.bag.items():
            self.docs_bag.setdefault(token, {})[document.identifier] = occurrences

        # add the document to the documents set.
        # NOTE(review): unlike the matrix-based variant, there is no
        # duplicate-identifier guard — re-adding a document overwrites its
        # entry and its docs_bag counts; confirm this is intended.
        self.documents[document.identifier] = document