Example #1
    def __init__(self, conf):
        self.conf = conf
        self.from_file = ClassFile()

        self._naive_bayes = NaiveBayes(conf)
        self._cnn_network = NNetwork(conf)
        self._voting = Voting(conf)
Example #2
class Process:
    def __init__(self, conf):
        self.conf = conf
        self.from_file = ClassFile()
        self.from_pdf = Pdf()
        self.from_image = ClassImage()

    def create_examples(self, process_pdf=False):
        path = self.conf.path

        if process_pdf:
            pdf_list = self.from_file.list_files_ext(path, '.pdf')
            self.from_pdf.to_image(pdf_list)

        i_list = self.from_file.list_files_ext(path, '.jpg')
        print(i_list)

        docs = []
        categories = set()

        for image in i_list:
            loaded = self.from_image.load_image(image)
            resized = self.from_image.resize_image_loaded(
                loaded, self.conf.resize_width, self.conf.resize_height)
            cropped = self.from_image.crop_image_loaded(
                resized, self.conf.crop_width, self.conf.crop_height)
            name = self.from_file.get_file_name(image)
            categories.add(name[4:-3])
            ext = self.from_file.get_file_ext(image)
            file = self.from_file.get_dir_name(
                image) + self.conf.sep + name + '_crop' + ext
            # Image.fromarray(cropped).save(file)
            examples = self.from_image.generate_examples(
                cropped, self.conf.examples_per_case)
            directory = self.conf.examples_dir + self.conf.sep + name[4:-3]
            self.from_file.create_dir(directory)
            for i, example in enumerate(examples):
                Image.fromarray(example).save(directory + self.conf.sep +
                                              name[4:] + '_' +
                                              str(i).zfill(3) + ext)
                docs.append(Document(example, name[4:-3]))

        self.from_file.list_to_file(
            categories,
            self.conf.examples_dir + self.conf.sep + self.conf.cat_file)

    def create_svh_data(self, process_pdf=False):
        path = self.conf.working_path

        if process_pdf:
            pdf_list = self.from_file.list_files_ext(path, '.pdf')
            self.from_pdf.to_text(pdf_list)
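
The resize-then-crop step in Example #2 can be reproduced with plain Pillow and NumPy. The sketch below only illustrates that flow; the helper name, sizes, and file path are hypothetical stand-ins, and ClassImage's actual methods may behave differently.

import numpy as np
from PIL import Image

def resize_and_center_crop(path, resize_w, resize_h, crop_w, crop_h):
    # load, resize, then take a centered crop of the requested size
    img = Image.open(path).convert('RGB').resize((resize_w, resize_h))
    left = (resize_w - crop_w) // 2
    top = (resize_h - crop_h) // 2
    cropped = img.crop((left, top, left + crop_w, top + crop_h))
    return np.asarray(cropped)

# hypothetical sizes and file name, mirroring conf.resize_* and conf.crop_*
cropped = resize_and_center_crop('sample.jpg', 640, 480, 300, 300)
Image.fromarray(cropped).save('sample_crop.jpg')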
Example #3
    def __init__(self, conf, nlp):
        self.conf = conf
        self.nlp = nlp
        # create the logger before it is first used
        self.logger = loggerElk(__name__, True)
        self.logger.Information('GbcMlDocumentClassifierPrediction::POST - loading dictionary...')
        self.conf.load_dict()
        self.tf = None
        self.tf_idf = None
        self.vectorizer = None
        self.from_file = ClassFile()
Example #4
 def check(domain, model, data):
     conf = Configuration()
     if model == Classify.NAIVE_BAYES_MULTI or model == Classify.VOTING:
         return ClassFile.has_text_file(
             os.path.join(conf.base_dir, domain, model, data))
     else:
         return False
Example #5
 def check(domain, model, data):
     conf = Configuration()
     if model == Classify.CNN_NETWORK:
         return ClassFile.has_media_file(
             os.path.join(conf.base_dir, domain, model, data))
     else:
         return False
Example #6
    def __init__(self, conf):
        """ Virtually private constructor. """
        if Singleton.__instance is not None:
            raise Exception("This class is a singleton!")
        else:
            self.vectorizers = {}

            files = ClassFile.list_files_ext(conf.base_dir, 'vectorizer.tfidf')
            logger.Information('GbcMlDocumentClassifierPrediction::POST - loading vectorizers...')

            for f in files:
                key = ClassFile.get_containing_dir_name(f)
                logger.Information(f'GbcMlDocumentClassifierPrediction::POST - loading model: {key}...')
                self.vectorizers[key] = ClassFile.load_model(f)
                logger.Information(f'GbcMlDocumentClassifierPrediction::POST - loaded model: ...{key}')

            Singleton.__instance = self
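
Example #13 retrieves this object through Singleton.getInstance(conf), which is not shown above. A minimal sketch of what such an accessor typically looks like for this private-constructor pattern (vectorizer loading is omitted, so the details here are assumptions, not the project's actual code):

class Singleton:
    __instance = None

    @staticmethod
    def getInstance(conf):
        # build the shared instance on first use, then keep returning it
        if Singleton.__instance is None:
            Singleton(conf)
        return Singleton.__instance

    def __init__(self, conf):
        """ Virtually private constructor. """
        if Singleton.__instance is not None:
            raise Exception("This class is a singleton!")
        self.vectorizers = {}  # vectorizer loading omitted in this sketch
        Singleton.__instance = self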
Example #7
    def load_dict(self):
        print(os.getcwd())
        dict_file = ClassFile.get_text(self.dictionary).split('\n')
        # use equality, not identity, when filtering out empty strings
        dict_file = filter(lambda x: x != '', dict_file)
        spa_dict = list(map(lambda x: x.lower(), dict_file))
        self.spa_dict = spa_dict

        return spa_dict
Example #8
    def predict(self, x):
        self.initialize()
        self.load_model()

        response = list()
        try:
            class_path = os.path.join(self.conf.base_dir, self.conf.classes)
            class_list = ClassFile.file_to_list(class_path, binary=False)
            # steps must be an integer number of batches to cover all samples
            predicted = self.clf.classifier.predict_generator(
                x, verbose=1, steps=int(np.ceil(x.n / 32)))
            predicted_class_indices = np.argmax(predicted, axis=1)
            predictions = [class_list[k] for k in predicted_class_indices]

            response.append(predicted)
            response.append(class_list)
            response.append(predictions)
        except Exception as exc:
            # swallow the error and return whatever was collected so far
            pass

        return response
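
The core of predict is the argmax-to-label mapping; the standalone sketch below illustrates it with made-up probabilities and class names (no Keras model involved).

import numpy as np

class_list = ['invoice', 'contract', 'receipt']      # hypothetical classes
predicted = np.array([[0.1, 0.7, 0.2],               # one row per document
                      [0.8, 0.1, 0.1]])
predicted_class_indices = np.argmax(predicted, axis=1)
predictions = [class_list[k] for k in predicted_class_indices]
print(predictions)  # ['contract', 'invoice']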
Example #9
 def save_model(self, model_name):
     try:
         path = os.path.join(self.conf.working_path, model_name)
         ClassFile.save_model(path, self.clf)
     except Exception as exc:
         self.logger.Error("Model file not saved")
Example #10
 def load_model(self, model_name):
     try:
         path = os.path.join(self.conf.working_path, model_name)
         self.clf = ClassFile.load_model(path)
     except Exception as exc:
         self.logger.Error("Model file not found")
Example #11
 def __init__(self, conf):
     self.conf = conf
     self.from_file = ClassFile()
     self.from_pdf = Pdf()
     self.from_image = ClassImage()
Example #12
class Classify:
    NAIVE_BAYES_MULTI = "NAIVE_BAYES_MULTI"
    VOTING = "VOTING"
    CNN_NETWORK = "CNN_NETWORK"

    def __init__(self, conf):
        self.conf = conf
        self.from_file = ClassFile()

        self._naive_bayes = NaiveBayes(conf)
        self._cnn_network = NNetwork(conf)
        self._voting = Voting(conf)

    def get_category(self, gram_path):
        return self.from_file.get_containing_dir_name(gram_path)

    @staticmethod
    def encode_categories(y):
        # encode class values as integers
        encoder = LabelEncoder()
        encoded_Y = encoder.fit_transform(y)
        # convert integers to dummy variables (i.e. one hot encoded)
        dummy_y = to_categorical(encoded_Y).astype(int)
        # print(dummy_y)
        return dummy_y

    @staticmethod
    def show_metrics(y_test, y_predicted, stats=None, show=False):
        report = metrics.classification_report(y_test, y_predicted)
        print('Accuracy:', accuracy_score(y_test, y_predicted))
        print(report)
        # only attach the report when a stats object was supplied
        if stats is not None:
            stats.info = report
        return stats

    def launch_naive_bayes_complement(self,
                                      X_train=None,
                                      y_train=None,
                                      X_test=None,
                                      train=False):
        print('---< Naive-Bayes Complement >---')
        result = ''
        if train and X_train is not None and y_train is not None:
            self._naive_bayes.train(X_train, y_train, 'complement')
        elif not train and X_test is not None:
            result = self._naive_bayes.predict(X_test, 'complement')

        return result

    def launch_naive_bayes_multinomial(self,
                                       X_train=None,
                                       y_train=None,
                                       X_test=None,
                                       train=False):
        print('---< Naive-Bayes Multinomial >---')
        result = ''
        if train and X_train is not None and y_train is not None:
            self._naive_bayes.train(X_train, y_train, 'multinomial')
        elif not train and X_test is not None:
            result = self._naive_bayes.predict(X_test, 'multinomial')

        return result

    def launch_cnn_network(self,
                           training_set=None,
                           validation_set=None,
                           prediction_set=None,
                           train=False):
        print('---< Nn Network >---')
        result = ''
        if train and training_set is not None and validation_set is not None:
            self._cnn_network.train(training_set, validation_set)
        elif not train and prediction_set is not None:
            result = self._cnn_network.predict(prediction_set)

        return result

    def launch_voting_classifier(self,
                                 X_train=None,
                                 y_train=None,
                                 X_test=None,
                                 train=False):
        print('---< Voting Classifier >---')
        result = ''
        clf_nb = self._naive_bayes.initialize(subtype='complement')
        class_list = [clf_nb]
        if train and X_train is not None and y_train is not None:
            self._voting.train(X_train, y_train, class_list)
        elif not train and X_test is not None:
            result = self._voting.predict(X_test, class_list)

        return result
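
All the launch_* methods share the same routing: train when train=True and training data is supplied, predict when train=False and test data is supplied, otherwise return an empty string. A self-contained sketch of that pattern (Model is a stand-in, not one of the project's classifiers):

class Model:
    def train(self, X, y):
        print('training on %d samples' % len(X))

    def predict(self, X):
        return ['label'] * len(X)

def launch(model, X_train=None, y_train=None, X_test=None, train=False):
    result = ''
    if train and X_train is not None and y_train is not None:
        model.train(X_train, y_train)
    elif not train and X_test is not None:
        result = model.predict(X_test)
    return result

launch(Model(), X_train=[[1], [2]], y_train=[0, 1], train=True)
print(launch(Model(), X_test=[[3]], train=False))  # ['label']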
Example #13
class PreProcess:

    def __init__(self, conf, nlp):
        self.conf = conf
        self.nlp = nlp
        # create the logger before it is first used
        self.logger = loggerElk(__name__, True)
        self.logger.Information('GbcMlDocumentClassifierPrediction::POST - loading dictionary...')
        self.conf.load_dict()
        self.tf = None
        self.tf_idf = None
        self.vectorizer = None
        self.from_file = ClassFile()

    def process(self, text, kind='none', path=''):
        """
        process svh texts

        """
        xdoc = Document()
        xdoc.kind = kind
        xdoc.path = path

        doc = None
        sentences = []
        if len(text) > self.conf.max_string_size:
            print(len(text))
            split = utils.split_by_size(text, self.conf.max_string_size)
            for t in split:
                doc = self.nlp(t)
                for s in doc.sents:
                    sentences.append(s)
        else:
            doc = self.nlp(text)
            sentences = doc.sents

        # mark stopwords (iterate over all collected sentences, not just the last chunk)
        for sentence in sentences:
            # print(sentence)
            s = Sentence()
            for token in sentence:
                t = stopwords.clean_token(self.conf, token)
                s.add_token(t)

            xdoc.add_sentence(s)

        # build 1-grams (just lemmas)
        for s in xdoc.sentences:
            for t in s.tokens:
                if not t.stop:
                    xdoc.add_gram(t.lemma.lower())

        # print(xdoc.path, ':\n', ' '.join(sorted(list(xdoc.grams))))
        self.from_file.list_to_file(list(xdoc.grams), self.from_file.file_base_name(xdoc.path) + '.gram')
        # print('', ' '.join(files.file_to_list(files.file_base_name(xdoc.path) + '.gram')))
        return xdoc

    def load_vector_models(self):
        self.tf = self.from_file.load_model(os.path.join(self.conf.working_path, self.conf.tf))
        self.tf_idf = self.from_file.load_model(os.path.join(self.conf.working_path, self.conf.tfidf))
        # print(self.tf.vocabulary_)

    def load_vectorizer_model(self, domain):
        self.logger.Information('GbcMlDocumentClassifierPrediction::POST - transform...')
        self.vectorizer = Singleton.getInstance(self.conf).vectorizers[domain]

    def get_tfidf(self, gram):
        count = self.tf.transform([' '.join(gram)])
        vector = self.tf_idf.transform(count)
        # print(vector.toarray()[0].tolist())
        return vector

    def get_tfidf_from_vectorizer(self, gram):
        vector = self.vectorizer.transform([' '.join(gram)])
        # print(vector.toarray()[0].tolist())
        return vector

    def get_count(self, gram):
        count = self.tf.transform([' '.join(gram)])
        # print(count.shape)
        return count

    def transform(self, domain, file):
        if self.vectorizer is None:
            self.load_vectorizer_model(domain)
        text = self.from_file.get_text(file)
        return self.transform_text(text)

    def transform_text(self, text):
        doc = self.process(utils.clean_text(text), 'none')
        # print(doc.grams)
        vector = self.get_tfidf_from_vectorizer(doc.grams)
        # X = [vector.toarray()[0]]
        # X = np.array(X).reshape((1, len(vector.toarray()[0])))
        return vector

    def _do_pre_process(self, q, result):  # q:[[index, text, kind, path], ...]
        """
        launch svh text processing in threads

        """
        while not q.empty():
            work = q.get()  # fetch new work from the Queue
            try:
                print("Requested..." + str(work[0]))
                data = self.process(work[1], work[2], work[3])
                result[work[0]] = data  # Store data back at correct index
                print(".............................. Done " + str(work[0]))
            except Exception as exc:
                result[work[0]] = Document()
                self.logger.Error(exc)

            # signal to the queue that task has been processed
            q.task_done()
        return True

    def _create_dataset(self, docs):
        """
        build the tf and tfidf matrices for the whole svh text

        """
        text = []
        for doc in docs:
            text.append(doc.get_grams_as_text())

        # create the transform
        # tokenize and build vocab
        count_vectorizer = CountVectorizer()
        x_tf = count_vectorizer.fit_transform(text)
        print(x_tf.shape)

        # idf
        tfidf_transformer = TfidfTransformer()
        x_tfidf = tfidf_transformer.fit_transform(x_tf)
        print(x_tfidf.shape)

        # encode documents
        for doc in docs:
            vector_tf = count_vectorizer.transform([doc.get_grams_as_text()])
            print(vector_tf.shape)
            print(type(vector_tf))
            print(vector_tf.toarray())
            self.from_file.save_sparse_csr(self.from_file.file_base_name(doc.path) + '.tf', vector_tf)

            vector_tfidf = tfidf_transformer.transform(vector_tf)
            print(vector_tfidf.shape)
            print(type(vector_tfidf))
            print(vector_tfidf.toarray())
            self.from_file.save_sparse_csr(self.from_file.file_base_name(doc.path) + '.tfidf', vector_tfidf)

        return x_tf, x_tfidf

    def create_dataset_from_unigrams_direct(self, uni_grams):
        text = []
        for doc_grams in uni_grams:
            if len(doc_grams) == 0:
                print('.< size 0 vector >.')
            else:
                text.append(' '.join(list(doc_grams)))

        # create the transform
        vectorizer = TfidfVectorizer(min_df=1, max_df=0.99)
        x_tfidf = vectorizer.fit_transform(text)
        print('tfidf shape:', x_tfidf.shape)

        self.from_file.save_model(os.path.join(self.conf.working_path, 'vectorizer.tfidf'), vectorizer)

        return vectorizer

    def _create_dataset_from_uni_grams(self, uni_grams):
        """
        Build the tf and tfidf matrices for the whole svh text by loading all .gram files

        """
        text = []
        for doc_grams in uni_grams:
            if len(doc_grams) == 0:
                print('.< size 0 vector >.')
            else:
                text.append(' '.join(list(doc_grams)))

        # create the transform
        # tokenize and build vocab
        count_vectorizer = CountVectorizer()
        x_tf = count_vectorizer.fit_transform(text)
        print(x_tf.shape)

        # idf
        tfidf_transformer = TfidfTransformer()
        x_tfidf = tfidf_transformer.fit_transform(x_tf)
        print(x_tfidf.shape)

        self.from_file.save_model(os.path.join(self.conf.working_path, self.conf.tf), count_vectorizer)
        self.from_file.save_model(os.path.join(self.conf.working_path, self.conf.tfidf), tfidf_transformer)

        return x_tf, x_tfidf

    def pre_process_batches(self):
        """
        Process all svh txt files in batches to get the .grams

        """
        categories = set()
        all_categories = []

        d_list = self.from_file.list_files_ext(self.conf.working_path, ".txt")
        all_docs = [None for d in d_list]
        q = Queue(maxsize=0)

        counter = 0
        total = len(d_list)
        i = 0

        while i < total:
            h = i
            for j in range(self.conf.pre_process_batch_size):
                if h < total:
                    f = d_list[h]
                    category = self.from_file.get_containing_dir_name(f)
                    categories.add(category)

                    all_categories.append(category)
                    text = utils.clean_text(self.from_file.get_text(f))

                    print('doc %s to q' % (counter + 1))
                    q.put((counter, text, category, f))

                    counter += 1
                h += 1

            for j in range(q.qsize()):
                worker = Thread(target=self._do_pre_process, args=(q, all_docs))
                # daemon threads let the main program exit even if a worker
                # does not finish cleanly
                worker.daemon = True
                worker.start()

            # now we wait until the queue has been processed
            q.join()

            i = h

        print(len(categories), categories)
        # create_dataset(conf, all_docs)

    def create_full_dataset_vectorizer(self):
        """
        Load all .gram files and call create_dataset_from_unigrams

        """
        v_list = self.from_file.list_files_ext(self.conf.working_path, ".gram")
        unigrams = []
        print(v_list)

        for f in v_list:
            unigrams.append(self.from_file.file_to_list(f))

        self.create_dataset_from_unigrams_direct(unigrams)

    def _pre_process(self):
        """
        Process all svh txt files to get the .grams

        """
        categories = set()
        all_categories = []

        d_list = self.from_file.list_files_ext(self.conf.working_path, "txt")
        all_docs = [None for d in d_list]
        q = Queue(maxsize=0)

        counter = 0
        total = len(d_list)
        cumul = 0
        for f in d_list:
            category = self.from_file.get_containing_dir_name(f)
            categories.add(category)

            all_categories.append(category)
            text = utils.clean_text(self.from_file.get_text(f))

            print('doc %s to q' % (counter + 1))
            q.put((counter, text, category, f))

            counter += 1

        for i in range(total):
            worker = Thread(target=self._do_pre_process, args=(q, all_docs))
            # daemon threads let the main program exit even if a worker
            # does not finish cleanly
            worker.daemon = True
            worker.start()
        # now we wait until the queue has been processed
        q.join()

        print(len(categories), categories)
        # create_dataset(conf, all_docs)

    @staticmethod
    def test_pre_process():
        # text = "Las niñas juegan en los Estados Unidos. El Tío Sam observa a los niños. " \
        #        "Yo bajo con el hombre bajo a tocar el bajo bajo la escalera. " \
        #        "Yo bajo el volumen de los niños."

        text = ["Los niños juegan en los Estados Unidos. El Tío Sam observa a los niños.",
                " Yo bajo con el hombre bajo a tocar el bajo bajo la escalera.",
                " Yo bajo el volumen de los niños."]

        conf = Configuration()
        nlp = SpacyModel.getInstance().model
        conf.load_dict()
        process = PreProcess(conf, nlp)

        # process._pre_process()
        process.pre_process_batches()
        process.create_full_dataset_vectorizer()
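
_do_pre_process and pre_process_batches implement a standard queue-plus-worker-threads pattern: items go into a Queue together with their target index, daemon threads drain the queue, and q.join() blocks until every item has been marked done. A simplified, self-contained sketch of that pattern (the work function is a stand-in for self.process):

from queue import Queue
from threading import Thread

def worker(q, result):
    while not q.empty():
        index, text = q.get()
        try:
            result[index] = text.upper()   # stand-in for self.process(...)
        except Exception:
            result[index] = None
        q.task_done()                      # mark the item as processed

texts = ['uno', 'dos', 'tres']
results = [None] * len(texts)
q = Queue()
for i, t in enumerate(texts):
    q.put((i, t))

for _ in range(len(texts)):
    Thread(target=worker, args=(q, results), daemon=True).start()

q.join()          # blocks until every queued item is task_done()
print(results)    # ['UNO', 'DOS', 'TRES']

The daemon flag matters here: a worker that loses the race between q.empty() and q.get() simply blocks, and daemon status is what still lets the process exit once q.join() has returned.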