def preprocess_all(self, raw_files):
    """Turn raw files into preprocessed documents plus a bag-of-words index.

    Delegates to the module-level ``preprocess_all`` helper to tokenize each
    raw file into a document, then builds the per-term document bag from the
    resulting collection.

    Args:
        raw_files: dict mapping a document key to its raw text content.
    """
    logger.info("Preprocessing...")
    # Tokenize every raw file; result is stored keyed the same way.
    self.documents = preprocess_all(raw_files)
    # Bag of words derived from the freshly built documents.
    self.docs_bag = bag_of_documents(self.documents)
def preprocess_all(self, raw_files):
    """Preprocess all raw files and record corpus-level statistics.

    Steps:
      1. tokenize every raw file into a document,
      2. build the bag mapping each term to the documents containing it
         (with occurrence counts),
      3. remember how many documents the corpus holds.

    Args:
        raw_files: dict whose keys are document keys and whose values are
            the document texts.
    """
    logger.info("Preprocessing...")
    self.documents = preprocess_all(raw_files)
    # docs_bag and docs_no are both derived from self.documents and are
    # independent of each other.
    self.docs_bag = bag_of_documents(self.documents)
    self.docs_no = len(self.documents)
def preprocess_all(self, raw_files):
    """Preprocess the corpus and compute tf, idf, and tf-idf weights.

    Converts every raw file into a document via the module-level
    ``preprocess_all`` helper, counts the documents, then runs the idf and
    tf passes before combining them element-wise into ``self.tf_idf``.

    Args:
        raw_files: dict mapping an identifier to the document text.
    """
    logger.info("Preprocessing...")
    self.documents = preprocess_all(raw_files)
    self.docs_no = len(self.documents)
    # Order preserved from the original: idf first, then tf, then their
    # element-wise product (``multiply`` suggests a sparse-matrix type —
    # TODO confirm).
    self.determine_idf()
    self.determine_tf()
    self.tf_idf = self.tf.multiply(self.idf)