Example #1
    def predict(self, x):
        self.initialize()
        self.load_model()

        response = []
        try:
            class_path = os.path.join(self.conf.base_dir, self.conf.classes)
            class_list = ClassFile.file_to_list(class_path, binary=False)
            predicted = self.clf.classifier.predict_generator(x,
                                                              verbose=1,
                                                              # round up so the last partial batch is included
                                                              steps=int(np.ceil(x.n / 32)))
            predicted_class_indices = np.argmax(predicted, axis=1)
            predictions = [class_list[k] for k in predicted_class_indices]

            response.append(predicted)
            response.append(class_list)
            response.append(predictions)
        except Exception as exc:
            # avoid silently swallowing the error; surface it for debugging
            print('prediction failed:', exc)

        return response
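
A minimal usage sketch for this method, assuming the surrounding class wraps a Keras image classifier and that `x` is a Keras `DirectoryIterator`; the generator setup, directory, and the `classifier` instance below are illustrative assumptions, not part of the original code:

from keras.preprocessing.image import ImageDataGenerator

# hypothetical test generator; directory, target size and batch size are assumptions
test_gen = ImageDataGenerator(rescale=1.0 / 255).flow_from_directory(
    'data/test', target_size=(224, 224), batch_size=32, shuffle=False)

# unpack the [raw scores, class list, predicted labels] response
predicted, class_list, predictions = classifier.predict(test_gen)
print(predictions[:5])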
Example #2
class PreProcess:

    def __init__(self, conf, nlp):
        self.conf = conf
        self.nlp = nlp
        # create the logger before its first use
        self.logger = loggerElk(__name__, True)
        self.logger.Information('GbcMlDocumentClassifierPrediction::POST - loading dictionary...')
        self.conf.load_dict()
        self.tf = None
        self.tf_idf = None
        self.vectorizer = None
        self.from_file = ClassFile()

    def process(self, text, kind='none', path=''):
        """
        process svh texts

        """
        xdoc = Document()
        xdoc.kind = kind
        xdoc.path = path

        doc = None
        sentences = []
        if len(text) > self.conf.max_string_size:
            print(len(text))
            split = utils.split_by_size(text, self.conf.max_string_size)
            for t in split:
                doc = self.nlp(t)
                for s in doc.sents:
                    sentences.append(s)
        else:
            doc = self.nlp(text)
            sentences = doc.sents

        # mark stopwords
        for sentence in sentences:
            # print(sentence)
            s = Sentence()
            for token in sentence:
                t = stopwords.clean_token(self.conf, token)
                s.add_token(t)

            xdoc.add_sentence(s)

        # build 1-grams (just lemmas)
        for s in xdoc.sentences:
            for t in s.tokens:
                if not t.stop:
                    xdoc.add_gram(t.lemma.lower())

        # print(xdoc.path, ':\n', ' '.join(sorted(list(xdoc.grams))))
        self.from_file.list_to_file(list(xdoc.grams), self.from_file.file_base_name(xdoc.path) + '.gram')
        # print('', ' '.join(files.file_to_list(files.file_base_name(xdoc.path) + '.gram')))
        return xdoc

    def load_vector_models(self):
        self.tf = self.from_file.load_model(os.path.join(self.conf.working_path, self.conf.tf))
        self.tf_idf = self.from_file.load_model(os.path.join(self.conf.working_path, self.conf.tfidf))
        # print(self.tf.vocabulary_)

    def load_vectorizer_model(self, domain):
        self.logger.Information('GbcMlDocumentClassifierPrediction::POST - transform...')
        self.vectorizer = Singleton.getInstance(self.conf).vectorizers[domain]

    def get_tfidf(self, gram):
        count = self.tf.transform([' '.join(gram)])
        vector = self.tf_idf.transform(count)
        # print(vector.toarray()[0].tolist())
        return vector

    def get_tfidf_from_vectorizer(self, gram):
        vector = self.vectorizer.transform([' '.join(gram)])
        # print(vector.toarray()[0].tolist())
        return vector

    def get_count(self, gram):
        count = self.tf.transform([' '.join(gram)])
        # print(count.shape)
        return count

    def transform(self, domain, file):
        if self.vectorizer is None:
            self.load_vectorizer_model(domain)
        text = self.from_file.get_text(file)
        return self.transform_text(text)

    def transform_text(self, text):
        doc = self.process(utils.clean_text(text), 'none')
        # print(doc.grams)
        vector = self.get_tfidf_from_vectorizer(doc.grams)
        # X = [vector.toarray()[0]]
        # X = np.array(X).reshape((1, len(vector.toarray()[0])))
        return vector

    def _do_pre_process(self, q, result):  # q:[[index, text, kind, path], ...]
        """
        launch svh text processing in threads

        """
        while not q.empty():
            work = q.get()  # fetch new work from the Queue
            try:
                print("Requested..." + str(work[0]))
                data = self.process(work[1], work[2], work[3])
                result[work[0]] = data  # Store data back at correct index
                print(".............................. Done " + str(work[0]))
            except Exception as exc:
                result[work[0]] = Document()
                self.logger.Error(exc)

            # signal to the queue that task has been processed
            q.task_done()
        return True

    def _create_dataset(self, docs):
        """
        build the tf and tfidf matrixes for the whole svh text

        """
        text = []
        for doc in docs:
            text.append(doc.get_grams_as_text())

        # create the transform
        # tokenize and build vocab
        count_vectorizer = CountVectorizer()
        x_tf = count_vectorizer.fit_transform(text)
        print(x_tf.shape)

        # idf
        tfidf_transformer = TfidfTransformer()
        x_tfidf = tfidf_transformer.fit_transform(x_tf)
        print(x_tfidf.shape)

        # encode documents
        for doc in docs:
            vector_tf = count_vectorizer.transform([doc.get_grams_as_text()])
            print(vector_tf.shape)
            print(type(vector_tf))
            print(vector_tf.toarray())
            self.from_file.save_sparse_csr(self.from_file.file_base_name(doc.path) + '.tf', vector_tf)

            vector_tfidf = tfidf_transformer.transform(vector_tf)
            print(vector_tfidf.shape)
            print(type(vector_tfidf))
            print(vector_tfidf.toarray())
            self.from_file.save_sparse_csr(self.from_file.file_base_name(doc.path) + '.tfidf', vector_tfidf)

        return x_tf, x_tfidf

    def create_dataset_from_unigrams_direct(self, uni_grams):
        text = []
        for doc_grams in uni_grams:
            if len(doc_grams) == 0:
                print('.< size 0 vector >.')
            else:
                text.append(' '.join(list(doc_grams)))

        # create the transform
        vectorizer = TfidfVectorizer(min_df=1, max_df=0.99)
        x_tfidf = vectorizer.fit_transform(text)
        print('tfidf shape:', x_tfidf.shape)

        self.from_file.save_model(os.path.join(self.conf.working_path, 'vectorizer.tfidf'), vectorizer)

        return vectorizer

    def _create_dataset_from_uni_grams(self, uni_grams):
        """
        Build the tf and tfidf matrixes for the whole svh text loading all .gram files

        """
        text = []
        for doc_grams in uni_grams:
            if len(doc_grams) == 0:
                print('.< size 0 vector >.')
            else:
                text.append(' '.join(list(doc_grams)))

        # create the transform
        # tokenize and build vocab
        count_vectorizer = CountVectorizer()
        x_tf = count_vectorizer.fit_transform(text)
        print(x_tf.shape)

        # idf
        tfidf_transformer = TfidfTransformer()
        x_tfidf = tfidf_transformer.fit_transform(x_tf)
        print(x_tfidf.shape)

        self.from_file.save_model(os.path.join(self.conf.working_path, self.conf.tf), count_vectorizer)
        self.from_file.save_model(os.path.join(self.conf.working_path, self.conf.tfidf), tfidf_transformer)

        return x_tf, x_tfidf

    def pre_process_batches(self):
        """
        Process all svh txt files in batches to get the .grams

        """
        categories = set()
        all_categories = []

        d_list = self.from_file.list_files_ext(self.conf.working_path, ".txt")
        all_docs = [None for d in d_list]
        q = Queue(maxsize=0)

        counter = 0
        total = len(d_list)
        i = 0

        while i < total:
            h = i
            for j in range(self.conf.pre_process_batch_size):
                if h < total:
                    f = d_list[h]
                    category = self.from_file.get_containing_dir_name(f)
                    categories.add(category)

                    all_categories.append(category)
                    text = utils.clean_text(self.from_file.get_text(f))

                    print('doc %s to q' % (counter + 1))
                    q.put((counter, text, category, f))

                    counter += 1
                h += 1

            for j in range(q.qsize()):
                worker = Thread(target=self._do_pre_process, args=(q, all_docs))
                # daemon threads let the main program exit even if workers
                # do not finish cleanly
                worker.daemon = True
                worker.start()

            # now we wait until the queue has been processed
            q.join()

            i = h

        print(len(categories), categories)
        # create_dataset(conf, all_docs)

    def create_full_dataset_vectorizer(self):
        """
        Load all .gram files and call create_dataset_from_unigrams

        """
        v_list = self.from_file.list_files_ext(self.conf.working_path, ".gram")
        unigrams = []
        print(v_list)

        for f in v_list:
            unigrams.append(self.from_file.file_to_list(f))

        self.create_dataset_from_unigrams_direct(unigrams)

    def _pre_process(self):
        """
        Process all svh txt files to get the .grams

        """
        categories = set()
        all_categories = []

        d_list = self.from_file.list_files_ext(self.conf.working_path, "txt")
        all_docs = [None for d in d_list]
        q = Queue(maxsize=0)

        counter = 0
        total = len(d_list)
        cumul = 0
        for f in d_list:
            category = self.from_file.get_containing_dir_name(f)
            categories.add(category)

            all_categories.append(category)
            text = utils.clean_text(self.from_file.get_text(f))

            print('doc %s to q' % (counter + 1))
            q.put((counter, text, category, f))

            counter += 1

        for i in range(total):
            worker = Thread(target=self._do_pre_process, args=(q, all_docs))
            # daemon threads let the main program exit even if workers
            # do not finish cleanly
            worker.daemon = True
            worker.start()
        # now we wait until the queue has been processed
        q.join()

        print(len(categories), categories)
        # create_dataset(conf, all_docs)

    @staticmethod
    def test_pre_process():
        # text = "Las niñas juegan en los Estados Unidos. El Tío Sam observa a los niños. " \
        #        "Yo bajo con el hombre bajo a tocar el bajo bajo la escalera. " \
        #        "Yo bajo el volumen de los niños."

        text = ["Los niños juegan en los Estados Unidos. El Tío Sam observa a los niños.",
                " Yo bajo con el hombre bajo a tocar el bajo bajo la escalera.",
                " Yo bajo el volumen de los niños."]

        conf = Configuration()
        nlp = SpacyModel.getInstance().model
        conf.load_dict()
        process = PreProcess(conf, nlp)

        # process._pre_process()
        process.pre_process_batches()
        process.create_full_dataset_vectorizer()
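
A minimal end-to-end sketch of using PreProcess to vectorize a single text, following the setup in test_pre_process above. It assumes the tf / tf-idf models referenced by conf.tf and conf.tfidf were already persisted (for example by _create_dataset_from_uni_grams); the sample sentence and the 'sample.txt' path are illustrative only.

conf = Configuration()
nlp = SpacyModel.getInstance().model
conf.load_dict()

pre = PreProcess(conf, nlp)
pre.load_vector_models()                          # load the persisted tf / tf-idf models
doc = pre.process(utils.clean_text('Yo bajo el volumen de los niños.'), 'none', 'sample.txt')
vector = pre.get_tfidf(doc.grams)                 # sparse tf-idf row for the document unigrams
print(vector.shape)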