def __init__(self, conf):
    """Wire up the classifier facade.

    Keeps a reference to ``conf``, creates the file helper and builds one
    wrapper per model family (Naive Bayes, neural network, voting), each
    constructed from the same ``conf`` object.
    """
    self.conf = conf
    self.from_file = ClassFile()

    # One wrapper per supported model family; construction order is kept
    # as-is in case the wrappers have construction-time side effects.
    self._naive_bayes = NaiveBayes(conf)
    self._cnn_network = NNetwork(conf)
    self._voting = Voting(conf)
class Process:
    """Turns source documents into per-category example images.

    Relies on three helpers created in the constructor: ``ClassFile`` for
    file-system work, ``Pdf`` for PDF conversion and ``ClassImage`` for image
    loading, resizing, cropping and example generation.
    """

    def __init__(self, conf):
        """Store ``conf`` and create the file, PDF and image helpers."""
        self.conf = conf
        self.from_file = ClassFile()
        self.from_pdf = Pdf()
        self.from_image = ClassImage()

    def create_examples(self, process_pdf=False):
        """Generate example images for every ``.jpg`` found under ``conf.path``.

        Each image is loaded, resized and cropped using the sizes in ``conf``;
        ``conf.examples_per_case`` is passed to ``generate_examples`` and every
        resulting example is saved under ``conf.examples_dir/<category>``.
        The category is the image file name without its first four and last
        three characters. Finally the set of categories is written to
        ``conf.cat_file`` inside ``conf.examples_dir``.

        :param process_pdf: when true, the ``.pdf`` files under ``conf.path``
            are first handed to ``Pdf.to_image``.
        """
        source_dir = self.conf.path
        if process_pdf:
            pdf_paths = self.from_file.list_files_ext(source_dir, '.pdf')
            self.from_pdf.to_image(pdf_paths)

        image_paths = self.from_file.list_files_ext(source_dir, '.jpg')
        print(image_paths)

        docs = []
        categories = set()
        for image_path in image_paths:
            # load -> resize -> crop, one step per line
            loaded = self.from_image.load_image(image_path)
            resized = self.from_image.resize_image_loaded(
                loaded, self.conf.resize_width, self.conf.resize_height)
            cropped = self.from_image.crop_image_loaded(
                resized, self.conf.crop_width, self.conf.crop_height)

            name = self.from_file.get_file_name(image_path)
            category = name[4:-3]
            categories.add(category)
            ext = self.from_file.get_file_ext(image_path)

            # Target for a copy of the cropped image; the save itself is
            # disabled, so this value is currently not used.
            _crop_path = (self.from_file.get_dir_name(image_path)
                          + self.conf.sep + name + '_crop' + ext)
            # Image.fromarray(cropped).save(_crop_path)

            examples = self.from_image.generate_examples(
                cropped, self.conf.examples_per_case)

            directory = self.conf.examples_dir + self.conf.sep + category
            self.from_file.create_dir(directory)
            for index, example in enumerate(examples):
                # <name without 4-char prefix>_<3-digit index><ext>
                target = (directory + self.conf.sep + name[4:] + '_'
                          + str(index).zfill(3) + ext)
                Image.fromarray(example).save(target)
                docs.append(Document(example, category))

        categories_path = (self.conf.examples_dir + self.conf.sep
                           + self.conf.cat_file)
        self.from_file.list_to_file(categories, categories_path)

    def create_svh_data(self, process_pdf=False):
        """Optionally convert the PDFs under ``conf.working_path`` to text.

        :param process_pdf: when true, every ``.pdf`` under the working path
            is passed to ``Pdf.to_text``; otherwise nothing is done.
        """
        working_dir = self.conf.working_path
        if not process_pdf:
            return
        pdf_paths = self.from_file.list_files_ext(working_dir, '.pdf')
        self.from_pdf.to_text(pdf_paths)
def __init__(self, conf, nlp):
    """Set up the pre-processor.

    Stores ``conf`` and the ``nlp`` callable, creates the logger, asks the
    configuration to load its dictionary and initialises the vector models
    (``tf``, ``tf_idf``, ``vectorizer``) to ``None``.

    :param conf: configuration object; must provide ``load_dict()``.
    :param nlp: language pipeline stored on the instance for later use.
    """
    self.conf = conf
    self.nlp = nlp
    # The logger has to exist before the first log call below; creating it
    # at the end of the constructor made that call fail with AttributeError.
    self.logger = loggerElk(__name__, True)
    self.logger.Information('GbcMlDocumentClassifierPrediction::POST - loading dictionary...')
    self.conf.load_dict()
    self.tf = None
    self.tf_idf = None
    self.vectorizer = None
    self.from_file = ClassFile()
def check(domain, model, data):
    """Report whether text data is available for a text-based model.

    Only ``Classify.NAIVE_BAYES_MULTI`` and ``Classify.VOTING`` are handled
    here; any other model yields ``False``.

    :param domain: path component placed under ``conf.base_dir``.
    :param model: model identifier, also used as a path component.
    :param data: last path component.
    :return: the result of ``ClassFile.has_text_file`` for
        ``base_dir/domain/model/data``, or ``False`` for other models.
    """
    conf = Configuration()
    is_text_model = model == Classify.NAIVE_BAYES_MULTI or model == Classify.VOTING
    if not is_text_model:
        return False
    data_path = os.path.join(conf.base_dir, domain, model, data)
    return ClassFile.has_text_file(data_path)
def check(domain, model, data):
    """Report whether media data is available for the CNN model.

    Only ``Classify.CNN_NETWORK`` is handled here; any other model yields
    ``False``.

    :param domain: path component placed under ``conf.base_dir``.
    :param model: model identifier, also used as a path component.
    :param data: last path component.
    :return: the result of ``ClassFile.has_media_file`` for
        ``base_dir/domain/model/data``, or ``False`` for other models.
    """
    conf = Configuration()
    if not model == Classify.CNN_NETWORK:
        return False
    data_path = os.path.join(conf.base_dir, domain, model, data)
    return ClassFile.has_media_file(data_path)
def __init__(self, conf):
    """Virtually private constructor.

    Refuses to run when an instance is already registered. Otherwise loads
    every ``vectorizer.tfidf`` file found under ``conf.base_dir`` into
    ``self.vectorizers``, keyed by the name of the directory containing the
    file, and registers this object as the singleton instance.

    :raises Exception: if the singleton instance already exists.
    """
    if Singleton.__instance is not None:
        raise Exception("This class is a singleton!")

    self.vectorizers = {}
    vectorizer_paths = ClassFile.list_files_ext(conf.base_dir, 'vectorizer.tfidf')
    logger.Information('GbcMlDocumentClassifierPrediction::POST - loading vectorizers...')
    for model_path in vectorizer_paths:
        # The containing directory name identifies the vectorizer.
        key = ClassFile.get_containing_dir_name(model_path)
        logger.Information(f'GbcMlDocumentClassifierPrediction::POST - loading model: {key}...')
        self.vectorizers[key] = ClassFile.load_model(model_path)
        logger.Information(f'GbcMlDocumentClassifierPrediction::POST - loaded model: ...{key}')

    Singleton.__instance = self
def load_dict(self):
    """Load the word list referenced by ``self.dictionary``.

    The file text is split on newlines, empty lines are dropped and every
    remaining entry is lower-cased. The result is stored in ``self.spa_dict``
    and also returned.

    :return: list of lower-cased, non-empty dictionary lines.
    """
    print(os.getcwd())
    lines = ClassFile.get_text(self.dictionary).split('\n')
    # Filter by truthiness rather than ``is not ''``: identity comparison
    # with a literal is implementation-dependent and raises SyntaxWarning
    # on Python 3.8+.
    spa_dict = [line.lower() for line in lines if line]
    self.spa_dict = spa_dict
    return spa_dict
def predict(self, x):
    """Run the loaded classifier over ``x`` and map outputs to class names.

    The class names are read from the file ``conf.base_dir/conf.classes``;
    the index of the highest score in each prediction row selects the name.

    :param x: batch generator handed to ``predict_generator``; its ``n``
        attribute is used to compute the number of steps.
    :return: ``[predicted, class_list, predictions]`` on success, or an
        empty list when any step fails (the failure is logged).
    """
    # Function-scope import keeps the block self-contained.
    import logging

    self.initialize()
    self.load_model()
    response = []
    try:
        class_path = os.path.join(self.conf.base_dir, self.conf.classes)
        class_list = ClassFile.file_to_list(class_path, binary=False)
        # NOTE(review): steps is a float and assumes 32 items per batch -
        # confirm against the generator's actual batch size.
        predicted = self.clf.classifier.predict_generator(x, verbose=1, steps=(x.n / 32))
        predicted_class_indices = np.argmax(predicted, axis=1)
        predictions = [class_list[k] for k in predicted_class_indices]
    except Exception:
        # Best-effort contract preserved: callers still get an empty list,
        # but the cause is no longer silently discarded.
        logging.getLogger(__name__).exception('Prediction failed; returning an empty response')
        return response

    response.append(predicted)
    response.append(class_list)
    response.append(predictions)
    return response
def save_model(self, model_name):
    """Persist ``self.clf`` under ``conf.working_path/model_name``.

    Failures are logged through ``self.logger`` and not re-raised.

    :param model_name: file name of the model inside the working path.
    """
    try:
        path = os.path.join(self.conf.working_path, model_name)
        ClassFile.save_model(path, self.clf)
    except Exception as exc:
        # Include the cause; the bare message gave no hint why saving failed.
        self.logger.Error(f"Model file not saved: {exc!r}")
def load_model(self, model_name):
    """Load ``conf.working_path/model_name`` into ``self.clf``.

    Failures are logged through ``self.logger`` and not re-raised; in that
    case ``self.clf`` is left untouched.

    :param model_name: file name of the model inside the working path.
    """
    try:
        path = os.path.join(self.conf.working_path, model_name)
        self.clf = ClassFile.load_model(path)
    except Exception as exc:
        # Any failure lands here, not only a missing file, so report the
        # actual cause alongside the original message.
        self.logger.Error(f"Model file not found: {exc!r}")
def __init__(self, conf):
    """Store ``conf`` and create the helpers used by this object.

    Three helpers are created: ``ClassFile`` (file access), ``Pdf`` (PDF
    handling) and ``ClassImage`` (image handling).
    """
    self.conf = conf

    # Helper objects, created in the original order.
    self.from_file = ClassFile()
    self.from_pdf = Pdf()
    self.from_image = ClassImage()
class Classify:
    """Facade over the available classifiers.

    Holds one wrapper per model family and exposes a ``launch_*`` method for
    each; every launcher either trains or predicts depending on ``train``
    and on which data sets were supplied.
    """

    # Model identifiers.
    NAIVE_BAYES_MULTI = "NAIVE_BAYES_MULTI"
    VOTING = "VOTING"
    CNN_NETWORK = "CNN_NETWORK"

    def __init__(self, conf):
        """Store ``conf`` and build the file helper and model wrappers."""
        self.conf = conf
        self.from_file = ClassFile()
        self._naive_bayes = NaiveBayes(conf)
        self._cnn_network = NNetwork(conf)
        self._voting = Voting(conf)

    def get_category(self, gram_path):
        """Return the name of the directory containing ``gram_path``."""
        return self.from_file.get_containing_dir_name(gram_path)

    @staticmethod
    def encode_categories(y):
        """One-hot encode the labels in ``y``.

        Labels are first mapped to integers with ``LabelEncoder`` and then
        expanded with ``to_categorical``; the result is cast to ``int``.
        """
        encoder = LabelEncoder()
        encoded_y = encoder.fit_transform(y)
        return to_categorical(encoded_y).astype(int)

    @staticmethod
    def show_metrics(y_test, y_predicted, stats=None, show=False):
        """Print accuracy and the classification report.

        :param y_test: expected labels.
        :param y_predicted: predicted labels.
        :param stats: optional object; when given, the report text is stored
            in its ``info`` attribute.
        :param show: accepted for compatibility; not used.
        :return: ``stats`` (``None`` when no object was passed).
        """
        print('Accuracy:', accuracy_score(y_test, y_predicted))
        # Build the report once and reuse it for printing and for stats.
        report = metrics.classification_report(y_test, y_predicted)
        print(report)
        # stats defaults to None; assigning an attribute on it used to raise
        # AttributeError whenever the caller relied on the default.
        if stats is not None:
            stats.info = report
        return stats

    def _launch_naive_bayes(self, subtype, X_train, y_train, X_test, train):
        """Train or predict with the Naive Bayes wrapper for ``subtype``."""
        result = ''
        if train and X_train is not None and y_train is not None:
            self._naive_bayes.train(X_train, y_train, subtype)
        elif not train and X_test is not None:
            result = self._naive_bayes.predict(X_test, subtype)
        return result

    def launch_naive_bayes_complement(self, X_train=None, y_train=None, X_test=None, train=False):
        """Train or predict with the 'complement' Naive Bayes variant.

        :return: the prediction result, or ``''`` when training or when the
            required data was not supplied.
        """
        print('---< Naive-Bayes Complement >---')
        return self._launch_naive_bayes('complement', X_train, y_train, X_test, train)

    def launch_naive_bayes_multinomial(self, X_train=None, y_train=None, X_test=None, train=False):
        """Train or predict with the 'multinomial' Naive Bayes variant.

        :return: the prediction result, or ``''`` when training or when the
            required data was not supplied.
        """
        print('---< Naive-Bayes Multinomial >---')
        return self._launch_naive_bayes('multinomial', X_train, y_train, X_test, train)

    def launch_cnn_network(self, training_set=None, validation_set=None, prediction_set=None, train=False):
        """Train or predict with the neural-network wrapper.

        :return: the prediction result, or ``''`` when training or when the
            required data was not supplied.
        """
        print('---< Nn Network >---')
        result = ''
        if train and training_set is not None and validation_set is not None:
            self._cnn_network.train(training_set, validation_set)
        elif not train and prediction_set is not None:
            result = self._cnn_network.predict(prediction_set)
        return result

    def launch_voting_classifier(self, X_train=None, y_train=None, X_test=None, train=False):
        """Train or predict with the voting wrapper.

        A 'complement' Naive Bayes classifier is initialised on every call
        and passed to the voting wrapper as its only member.

        :return: the prediction result, or ``''`` when training or when the
            required data was not supplied.
        """
        print('---< Voting Classifier >---')
        result = ''
        clf_nb = self._naive_bayes.initialize(subtype='complement')
        class_list = [clf_nb]
        if train and X_train is not None and y_train is not None:
            self._voting.train(X_train, y_train, class_list)
        elif not train and X_test is not None:
            result = self._voting.predict(X_test, class_list)
        return result
class PreProcess:
    """Text pre-processing: tokenising, gram extraction and vectorising.

    Texts are run through the ``nlp`` callable, cleaned with the project's
    stop-word helper and reduced to lower-cased lemmas ("grams"), which are
    written to ``.gram`` files and later turned into tf / tf-idf models.
    """

    def __init__(self, conf, nlp):
        """Store collaborators, create the logger and load the dictionary.

        :param conf: configuration object; must provide ``load_dict()``.
        :param nlp: language pipeline, called with a text to obtain a doc.
        """
        self.conf = conf
        self.nlp = nlp
        # The logger has to exist before the first log call below; creating
        # it last made the constructor fail with AttributeError.
        self.logger = loggerElk(__name__, True)
        self.logger.Information('GbcMlDocumentClassifierPrediction::POST - loading dictionary...')
        self.conf.load_dict()
        self.tf = None
        self.tf_idf = None
        self.vectorizer = None
        self.from_file = ClassFile()

    def process(self, text, kind='none', path=''):
        """Process one svh text into a ``Document`` and write its grams.

        Texts longer than ``conf.max_string_size`` are split with
        ``utils.split_by_size`` and each piece is analysed separately. Every
        token is passed through ``stopwords.clean_token``; the lower-cased
        lemma of each token not flagged as ``stop`` becomes a gram. The grams
        are written to ``<base name of path>.gram``.

        :param text: text to analyse.
        :param kind: stored on the document as ``kind``.
        :param path: stored on the document as ``path``; also determines the
            name of the ``.gram`` file.
        :return: the populated ``Document``.
        """
        xdoc = Document()
        xdoc.kind = kind
        xdoc.path = path

        sentences = []
        if len(text) > self.conf.max_string_size:
            print(len(text))
            for chunk in utils.split_by_size(text, self.conf.max_string_size):
                sentences.extend(self.nlp(chunk).sents)
        else:
            sentences = self.nlp(text).sents

        # mark stopwords
        # Iterate over the collected sentences: looping over the last doc
        # only silently dropped every chunk but the final one for long texts.
        for sentence in sentences:
            s = Sentence()
            for token in sentence:
                s.add_token(stopwords.clean_token(self.conf, token))
            xdoc.add_sentence(s)

        # build 1-grams (just lemmas)
        for s in xdoc.sentences:
            for t in s.tokens:
                if not t.stop:
                    xdoc.add_gram(t.lemma.lower())

        self.from_file.list_to_file(list(xdoc.grams),
                                    self.from_file.file_base_name(xdoc.path) + '.gram')
        return xdoc

    def load_vector_models(self):
        """Load the tf and tf-idf models from ``conf.working_path``."""
        self.tf = self.from_file.load_model(os.path.join(self.conf.working_path, self.conf.tf))
        self.tf_idf = self.from_file.load_model(os.path.join(self.conf.working_path, self.conf.tfidf))

    def load_vectorizer_model(self, domain):
        """Fetch the vectorizer registered for ``domain`` from the singleton."""
        self.logger.Information('GbcMlDocumentClassifierPrediction::POST - transform...')
        self.vectorizer = Singleton.getInstance(self.conf).vectorizers[domain]

    def get_tfidf(self, gram):
        """Return the tf-idf vector of ``gram`` using the tf + tf-idf models."""
        count = self.tf.transform([' '.join(gram)])
        vector = self.tf_idf.transform(count)
        return vector

    def get_tfidf_from_vectorizer(self, gram):
        """Return the vector of ``gram`` produced by ``self.vectorizer``."""
        vector = self.vectorizer.transform([' '.join(gram)])
        return vector

    def get_count(self, gram):
        """Return the term-count vector of ``gram`` using the tf model."""
        count = self.tf.transform([' '.join(gram)])
        return count

    def transform(self, domain, file):
        """Vectorise the text stored in ``file``.

        The vectorizer for ``domain`` is loaded only when none is set yet.
        """
        if self.vectorizer is None:
            self.load_vectorizer_model(domain)
        text = self.from_file.get_text(file)
        return self.transform_text(text)

    def transform_text(self, text):
        """Clean and process ``text`` and return the vector of its grams."""
        doc = self.process(utils.clean_text(text), 'none')
        vector = self.get_tfidf_from_vectorizer(doc.grams)
        return vector

    def _do_pre_process(self, q, result):
        """Worker body: launch svh text processing for queued items.

        :param q: queue of ``(index, text, kind, path)`` items.
        :param result: list receiving each processed document at ``index``;
            an empty ``Document`` is stored when processing fails.
        :return: ``True`` once the queue is seen empty.
        """
        while not q.empty():
            work = q.get()  # fetch new work from the Queue
            try:
                print("Requested..." + str(work[0]))
                data = self.process(work[1], work[2], work[3])
                result[work[0]] = data  # Store data back at correct index
                print(".............................. Done " + str(work[0]))
            except Exception as exc:
                result[work[0]] = Document()
                self.logger.Error(exc)
            # signal to the queue that task has been processed
            q.task_done()
        return True

    def _start_workers(self, q, results, count):
        """Start ``count`` daemon threads running ``_do_pre_process``."""
        for _ in range(count):
            worker = Thread(target=self._do_pre_process, args=(q, results))
            # Daemon threads let the main program exit even if a worker
            # never finishes. The attribute replaces the deprecated
            # Thread.setDaemon().
            worker.daemon = True
            worker.start()

    @staticmethod
    def _grams_to_text(uni_grams):
        """Join each non-empty gram collection into one space-separated string."""
        text = []
        for doc_grams in uni_grams:
            if len(doc_grams) == 0:
                print('.< size 0 vector >.')
            else:
                text.append(' '.join(list(doc_grams)))
        return text

    def _create_dataset(self, docs):
        """Build the tf and tf-idf matrices for the whole svh text.

        Besides fitting both models on all documents, the tf and tf-idf
        vector of every document is saved next to it as ``.tf`` / ``.tfidf``.

        :return: tuple ``(x_tf, x_tfidf)`` of the fitted matrices.
        """
        text = [doc.get_grams_as_text() for doc in docs]

        # tokenize and build vocab
        count_vectorizer = CountVectorizer()
        x_tf = count_vectorizer.fit_transform(text)
        print(x_tf.shape)

        # idf
        tfidf_transformer = TfidfTransformer()
        x_tfidf = tfidf_transformer.fit_transform(x_tf)
        print(x_tfidf.shape)

        # encode documents
        for doc in docs:
            base_name = self.from_file.file_base_name
            vector_tf = count_vectorizer.transform([doc.get_grams_as_text()])
            print(vector_tf.shape)
            print(type(vector_tf))
            print(vector_tf.toarray())
            self.from_file.save_sparse_csr(base_name(doc.path) + '.tf', vector_tf)

            vector_tfidf = tfidf_transformer.transform(vector_tf)
            print(vector_tfidf.shape)
            print(type(vector_tfidf))
            print(vector_tfidf.toarray())
            self.from_file.save_sparse_csr(base_name(doc.path) + '.tfidf', vector_tfidf)
        return x_tf, x_tfidf

    def create_dataset_from_unigrams_direct(self, uni_grams):
        """Fit a ``TfidfVectorizer`` on the grams and save it.

        The fitted vectorizer is stored as ``vectorizer.tfidf`` in
        ``conf.working_path``. Empty gram collections are skipped.

        :return: the fitted vectorizer.
        """
        text = self._grams_to_text(uni_grams)
        vectorizer = TfidfVectorizer(min_df=1, max_df=0.99)
        x_tfidf = vectorizer.fit_transform(text)
        print('tfidf shape:', x_tfidf.shape)
        self.from_file.save_model(os.path.join(self.conf.working_path, 'vectorizer.tfidf'), vectorizer)
        return vectorizer

    def _create_dataset_from_uni_grams(self, uni_grams):
        """Build the tf and tf-idf matrices from already extracted grams.

        Both fitted models are saved in ``conf.working_path`` under the
        names ``conf.tf`` and ``conf.tfidf``. Empty gram collections are
        skipped.

        :return: tuple ``(x_tf, x_tfidf)`` of the fitted matrices.
        """
        text = self._grams_to_text(uni_grams)

        # tokenize and build vocab
        count_vectorizer = CountVectorizer()
        x_tf = count_vectorizer.fit_transform(text)
        print(x_tf.shape)

        # idf
        tfidf_transformer = TfidfTransformer()
        x_tfidf = tfidf_transformer.fit_transform(x_tf)
        print(x_tfidf.shape)

        self.from_file.save_model(os.path.join(self.conf.working_path, self.conf.tf), count_vectorizer)
        self.from_file.save_model(os.path.join(self.conf.working_path, self.conf.tfidf), tfidf_transformer)
        return x_tf, x_tfidf

    def pre_process_batches(self):
        """Process all svh txt files in batches to get the .grams.

        Files are queued ``conf.pre_process_batch_size`` at a time; one
        worker thread is started per queued file and the batch is awaited
        before the next one is queued.

        :raises ValueError: if there are files to process and the batch size
            is smaller than 1 (this previously looped forever).
        """
        categories = set()
        d_list = self.from_file.list_files_ext(self.conf.working_path, ".txt")
        all_docs = [None for d in d_list]
        q = Queue(maxsize=0)
        counter = 0
        total = len(d_list)
        batch_size = self.conf.pre_process_batch_size
        if total and batch_size < 1:
            raise ValueError('pre_process_batch_size must be at least 1')

        start = 0
        while start < total:
            for f in d_list[start:start + batch_size]:
                category = self.from_file.get_containing_dir_name(f)
                categories.add(category)
                text = utils.clean_text(self.from_file.get_text(f))
                print('doc %s to q' % (counter + 1))
                q.put((counter, text, category, f))
                counter += 1
            self._start_workers(q, all_docs, q.qsize())
            # now we wait until the queue has been processed
            q.join()
            start += batch_size
        print(len(categories), categories)

    def create_full_dataset_vectorizer(self):
        """Load all .gram files and call create_dataset_from_unigrams_direct."""
        v_list = self.from_file.list_files_ext(self.conf.working_path, ".gram")
        print(v_list)
        unigrams = [self.from_file.file_to_list(f) for f in v_list]
        self.create_dataset_from_unigrams_direct(unigrams)

    def _pre_process(self):
        """Process all svh txt files at once to get the .grams.

        Every file is queued first; then one worker thread per file is
        started and the queue is awaited.
        """
        categories = set()
        # ".txt" (with the dot) matches the extension used by
        # pre_process_batches for the same file listing.
        d_list = self.from_file.list_files_ext(self.conf.working_path, ".txt")
        all_docs = [None for d in d_list]
        q = Queue(maxsize=0)
        total = len(d_list)
        for counter, f in enumerate(d_list):
            category = self.from_file.get_containing_dir_name(f)
            categories.add(category)
            text = utils.clean_text(self.from_file.get_text(f))
            print('doc %s to q' % (counter + 1))
            q.put((counter, text, category, f))
        self._start_workers(q, all_docs, total)
        # now we wait until the queue has been processed
        q.join()
        print(len(categories), categories)

    @staticmethod
    def test_pre_process():
        """Manual driver: pre-process in batches, then build the vectorizer."""
        # Sample sentences kept for manual experiments; not used below.
        text = ["Los niños juegan en los Estados Unidos. El Tío Sam observa a los niños.",
                " Yo bajo con el hombre bajo a tocar el bajo bajo la escalera.",
                " Yo bajo el volumen de los niños."]
        conf = Configuration()
        nlp = SpacyModel.getInstance().model
        conf.load_dict()
        process = PreProcess(conf, nlp)
        # process._pre_process()
        process.pre_process_batches()
        process.create_full_dataset_vectorizer()