Example #1
    def extraction(self, filename, ftype, delete_file=True):
        pprocess = False
        output_file = None

        test_file = self.create_test_tagging(filename, ftype)
        results = self.dtn_doc._results

        model_name = crf_utils.get_crf_model_name(self.categorie)
        if not model_name and self.ftype:
            model_name = crf_utils.get_crf_model_name(self.ftype)

        if model_name:
            out_path = crf_utils.get_crf_training_directory()
            model = os.path.join(out_path,
                                 model_name + crf_utils.CRF_MODEL_EXT)

            if pprocess:
                command_line = crf_utils.CRF_TEST + ' -m "%s" "%s"' % (
                    model, test_file)
            else:
                output_file = test_file + ".out"
                command_line = crf_utils.CRF_TEST + ' -m "%s" -o "%s" "%s"' % (
                    model, output_file, test_file)

            command_line = command_line.replace("\\", "/")
            dtn_logger.logger_info("CRF", command_line)

            # run CRF++
            if pprocess:
                args = shlex.split(command_line)
                # args is already tokenized by shlex.split, so no shell is needed
                p = Popen(args,
                          bufsize=4096,
                          stdin=PIPE,
                          stdout=PIPE,
                          stderr=PIPE)
                output = p.communicate()
                if p.returncode in (0, 255):
                    #self._parse_output(output, results)
                    self._parse_buffer(output, results)

                    dtn_logger.logger_info(
                        "CRF", "crf test exit code : %d" % (p.returncode))
                else:
                    dtn_logger.logger_error(
                        "CRF", "crf test exit code : %d" % (p.returncode))
            else:
                os.system(command_line)
                self._parse_file(output_file, results)

        else:
            dtn_logger.logger_error(
                "CRF", "crf model file is not found (%s)" % (ftype))
        # remove test file

        if delete_file:
            os.remove(test_file)
            if output_file:
                os.remove(output_file)
        return results
Example #2
    def create_embedding_words(self, path, created=True):

        fname = md.get_model_embedded_file(self.MODEL_NAME)
        tw = Text4Words()
        if created:
            dtn_logger.logger_info("TRAINING",
                                   "Create Embedded Words " + fname)
            tw.load_directory(path)
            tw.train_word_vector_embedding(vectfname=fname)
        else:
            tw.load_word_vector_embedding(vectfname=fname)
Example #3
    def load_word_vector_embedding(self, vectfname=None):
        # word to vector file name
        if vectfname is None:
            vectfname = util.get_default_wordvect_file()

        self._model = KeyedVectors.load_word2vec_format(vectfname,
                                                        binary=False)
        # summarize vocabulary size in model
        words = list(self._model.wv.vocab)

        logging.logger_info(
            "Embedded", 'load vocabulary : %d (%s)' % (len(words), vectfname))
Example #4
    def load_embedding_words(self):

        embeddings_index = {}
        fname = md.get_model_embedded_file(self.MODEL_NAME)
        dtn_logger.logger_info("TRAINING", "Load Embedded Words " + fname)
        f = codecs.open(fname, 'r', 'utf-8')
        for line in f:
            values = line.split()
            if len(values) > 2:
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
        f.close()

        return embeddings_index
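
The returned embeddings_index maps each token to its float32 vector. As a rough, hypothetical sketch of how such an index is usually consumed downstream (the build_embedding_matrix name and the word_index argument, e.g. from a Keras Tokenizer, are illustrative and not part of the original class):

import numpy as np

def build_embedding_matrix(embeddings_index, word_index, embedding_dim):
    # row 0 is reserved for padding; one row per tokenizer index
    matrix = np.zeros((len(word_index) + 1, embedding_dim), dtype='float32')
    for word, idx in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            # words without a pre-trained vector keep the all-zero row
            matrix[idx] = vector
    return matrix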
Example #5
    def add_clause(self, clausetype, name, value):

        filename = self._get_clause_file_name(clausetype)
        # append mode creates the file if it does not already exist
        f = codecs.open(filename, 'a', 'utf-8')

        dtn_logger.logger_info("Clause",
                               "add clause %s : %s" % (clausetype, name))

        f.write("[[" + name + "\n")
        f.write(value + "\n]]\n\n")

        f.close()
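
The clause file written above stores each entry as a `[[name` header, the clause text, and a closing `]]`. A minimal read-back sketch assuming exactly that layout (the read_clauses helper is hypothetical, not part of the original class):

import codecs

def read_clauses(filename):
    # parse the "[[name ... ]]" blocks written by add_clause into {name: value}
    clauses = {}
    with codecs.open(filename, 'r', 'utf-8') as f:
        for block in f.read().split(']]'):
            block = block.strip()
            if block.startswith('[['):
                name, _, value = block[2:].partition('\n')
                clauses[name.strip()] = value.strip()
    return clauses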
Example #6
    def _init_terms_table(self, filename, termtype):
        self.categorie = termtype
        # get file name

        self.fullname = filename
        self.filename = os.path.basename(filename).split('.')[0]
        # get file created date
        self._filetime = util.get_creation_file_date(filename)

        # init verifying table
        self.verified_terms = {}
        self.keywords = dtn_sentence.get_document_categorie(termtype)
        for key in self.keywords:
            self.verified_terms[key] = ExtractData(key, termtype)

        dtn_logger.logger_info("VERIFY", "%s (%s)" % (filename, termtype))
Example #7
    def _train(self, trainf, model, threads=8, cost=16.0):
        '''
        -a CRF-L2 or CRF-L1 : the default is CRF-L2. In general L2 performs slightly
            better than L1, although the number of non-zero features under L1 is far
            smaller than under L2.
        -c float : sets the hyper-parameter of the CRF. The larger c is, the more
            closely the CRF fits the training data.
        -f NUM : sets the feature cut-off threshold. CRF++ only uses features that
            occur at least NUM times in the training data. The default value is 1.
        -p NUM : with multiple CPUs, training can be sped up with multiple threads.
            NUM is the number of threads.
        '''
        # run CRF++
        #os.system(self.crf_learn + " -t -p %d -c %f %s %s %s" % (threads, cost, self.templatef, trainf, model))
        templatef = crf_utils.get_crf_template_file()

        commd = crf_utils.CRF_LEARN + " -p %d -c %f %s %s %s" % (
            threads, cost, templatef, trainf, model)
        dtn_logger.logger_info("CRF Training", commd)

        #os.system(crf_utils.CRF_LEARN + " -p %d -c %f %s %s %s" % (threads, cost, templatef, trainf, model))
        os.system(commd)
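
For reference, the same crf_learn invocation can be issued without building a shell string. A minimal sketch, assuming crf_utils.CRF_LEARN holds the path to the crf_learn executable (the run_crf_learn wrapper is illustrative only):

import subprocess

def run_crf_learn(templatef, trainf, model, threads=8, cost=16.0):
    # crf_learn -p <threads> -c <cost> <template> <train file> <model file>
    # crf_utils is the same helper module used throughout these examples
    args = [crf_utils.CRF_LEARN, '-p', str(threads), '-c', str(cost),
            templatef, trainf, model]
    return subprocess.run(args).returncode == 0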
Example #8
    def _create_training_data(self):

        labels = to_categorical(np.asarray(self.labels))

        # create data
        data = pad_sequences(self.sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
        dtn_logger.logger_info("TRAINING", "loading data " + str(data.shape))
        indices = np.arange(data.shape[0])
        np.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]
        nb_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0])

        # training size and values
        x_train = data[:-nb_validation_samples]
        y_train = labels[:-nb_validation_samples]
        x_val = data[-nb_validation_samples:]
        y_val = labels[-nb_validation_samples:]

        return x_train, y_train, x_val, y_val
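
A rough usage sketch for the split returned above, reusing the md helpers that appear in the predict() example further down; it assumes it runs inside the same class, and the epoch and batch-size values are placeholders, not taken from the original code:

# hypothetical training call inside the same class
x_train, y_train, x_val, y_val = self._create_training_data()

model = md.load_json_model(self.MODEL_NAME)
model.compile(loss='binary_crossentropy',
              optimizer=md.OPTIMIZER_ADAM,
              metrics=['accuracy'])
model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          epochs=10,          # illustrative value
          batch_size=128)     # illustrative value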
Example #9
    def create_categorie_tagging(self, fpath, categorie):

        dataname = crf_utils.add_crf_model_name(categorie)

        dtn_logger.logger_info("CRF Training",
                               "%s => %s" % (categorie, dataname))

        out_path = crf_utils.get_crf_training_directory()

        output_file = os.path.join(out_path,
                                   dataname + crf_utils.CRF_FILE_TAG_EXT)

        output_data = codecs.open(output_file, 'w', 'utf-8')

        self.folder_tagging(fpath, categorie, output_data)

        output_data.close()

        model = os.path.join(out_path, dataname + crf_utils.CRF_MODEL_EXT)

        self._train(output_file, model)
Example #10
    def train_word_vector_embedding(self, min_count=2, sg=0, vectfname=None):
        # word to vector file name
        if vectfname is None:
            vectfname = util.get_default_wordvect_file()

        # word to vector
        self._model = word2vec.Word2Vec(self.sentences,
                                        min_count=min_count,
                                        sg=sg,
                                        workers=4,
                                        size=self.VECTOR_LENGTH,
                                        window=5)

        # save word2vec model
        self._model.wv.save_word2vec_format(vectfname, binary=False)

        # summarize vocabulary size in model
        words = list(self._model.wv.vocab)

        logging.logger_info(
            "Embedded",
            'create vocabulary : %d (%s)' % (len(words), vectfname))
Example #11
    def close_output(self):

        if self._verbose > 0:
            dtn_logger.logger_info(
                "CONVERT", "Total Files : %d, Error Files : %d \n\n" %
                (self._total, self._error))
            dtn_logger.logger_info(
                "CONVERT",
                "DOC(%d), DOCX(%d),  PDF(%d), IMG(%d),  XLSX(%d), TXT(%d), MSG(%d)"
                % (self._ftypes['DOC'], self._ftypes['DOCX'],
                   self._ftypes['PDF'], self._ftypes['IMG'],
                   self._ftypes['XLSX'], self._ftypes['TXT'],
                   self._ftypes['MSG']))
            self.outout_result.close()

        if self._verbose > 1:
            print("\n\nTotal Files : %d, Error Files : %d \n\n" %
                  (self._total, self._error))
            print(
                "\n\nDOC(%d), DOCX(%d), PDF(%d), IMG(%d), XLSX(%d), TXT(%d), MSG(%d) \n\n"
                % (self._ftypes['DOC'], self._ftypes['DOCX'],
                   self._ftypes['PDF'], self._ftypes['IMG'],
                   self._ftypes['XLSX'], self._ftypes['TXT'],
                   self._ftypes['MSG']))
Example #12
    def write_text_file(self, path, outfile, document, f):

        #isok = f.test_document(document)
        isok = False
        if document and len(document) > 100:
            isok = True
        if isok:
            # write text file
            fp = codecs.open(outfile, 'w', self._encoding)
            fp.write(document)
            fp.close()
            dtn_logger.logger_info("CONVERT", "%s => %s" % (path, outfile))

            if self._verbose > 0:
                self.outout_result.write(path + " => OK \n")
                if self._verbose > 1:
                    print("%d write text to %s " % (self._total, outfile))
        else:
            dtn_logger.logger_error("CONVERT", "%s => %s" % (path, outfile))
            self._error += 1
            if self._verbose > 0:
                self.outout_result.write(path + " => ERROR\n")
                if self._verbose > 1:
                    print("%d. convert error : %s " % (self._error, path))
Example #13
    def predict(self, filename):

        texts = self.load_predict_document(filename)

        tokenizer = Tokenizer(num_words=self.clause_model.MAX_NB_WORDS)
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)

        # create data
        data = pad_sequences(sequences,
                             maxlen=self.clause_model.MAX_SEQUENCE_LENGTH)

        dtn_logger.logger_info("PREDICT",
                               "Verification document : " + filename)
        dtn_logger.logger_info("PREDICT", "Predict Data : " + str(data.shape))

        model = md.load_json_model(self.clause_model.MODEL_NAME)
        #model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        model.compile(loss='binary_crossentropy',
                      optimizer=md.OPTIMIZER_ADAM,
                      metrics=['accuracy'])

        for i, s in enumerate(data):
            # index with a one-element list to keep the batch dimension: shape (1, MAX_SEQUENCE_LENGTH)
            s = data[np.array([i])]
            preds = model.predict(s)

            n = self.sample(preds[0])
            print("*** " + self.clause_model.label_name[n] + "***")
            n = self.sample(preds[0], 0.8)
            print("*** " + self.clause_model.label_name[n] + "***")
            n = self.sample(preds[0], 0.2)
            print("*** " + self.clause_model.label_name[n] + "***")

            print(texts[i])
            if i > 5:
                break
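
The sample() helper called above is not shown in these examples. A common implementation with the same signature is temperature-weighted sampling over the predicted class probabilities; the following is only a guess at such a helper, not the original code:

import numpy as np

def sample(preds, temperature=1.0):
    # re-weight the probability vector by temperature and draw one class index;
    # low temperature -> close to argmax, high temperature -> more random
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return int(np.argmax(np.random.multinomial(1, preds, 1)))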
Example #14
    def create_dataset(self,
                       text_path,
                       min_count=2,
                       sg=0,
                       workers=1,
                       size=256,
                       window=5):
        """
        min_count : ignore all words with total frequency lower than this.
        sg : sg=0 CBOW, sg=1 skip-gram.
        workers : number of worker threads.
        size : dimensionality of the feature vectors.
        window : maximum distance between the current and predicted word within a sentence.
        """
        dtn_logger.logger_info("DATASET", "create dataset " + text_path)

        lists, doclists = self.load_documents(text_path)

        dictionary = corpora.Dictionary(lists)
        corpus = [dictionary.doc2bow(text) for text in lists]

        # save corpus
        corpusfname = self.get_mm_file_name()
        corpora.MmCorpus.serialize(corpusfname, corpus)

        # save dictionary
        dictfname = self.get_dict_file_name()
        dictionary.save(dictfname)

        dictfname = self.get_model_list_name()
        # initialize a model
        tfidf = models.TfidfModel(corpus, normalize=True)

        # use the model to transform vectors
        corpus_tfidf = tfidf[corpus]

        # initialize an LSI transformation, LSI 2-D space
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
        lsi.save(dictfname)  # same for tfidf, lda, ...

        #training doc2vec
        model = self.TrainingDoc2Vec(doclists,
                                     size=size,
                                     window=window,
                                     nb_loop=32)
        # save doc vector
        vectfname = self.get_model_file_name()
        model.save(vectfname)

        # word to vector
        model = word2vec.Word2Vec(lists,
                                  min_count=min_count,
                                  sg=sg,
                                  workers=workers,
                                  size=size,
                                  window=window)
        # save words vector
        vectfname = self.get_word_model_name()
        model.wv.save_word2vec_format(vectfname, binary=False)
        #model.sort_vocab()
        #model.build_vocab(sentences, update=False)

        # save file label
        self.save_filelabel()

        # save doc label
        self.save_doclabel()

        # save vector labels
        self.save_labelset()

        # save classifier labels
        self.save_classifierlabel()
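
create_dataset persists a bag-of-words corpus, a dictionary, an LSI model, a doc2vec model and the word vectors. A minimal sketch of reading them back with the matching gensim loaders, assuming it runs inside the same class so the get_* file-name helpers used above are available:

from gensim import corpora, models
from gensim.models import KeyedVectors
from gensim.models.doc2vec import Doc2Vec

# reload the artifacts written by create_dataset()
corpus = corpora.MmCorpus(self.get_mm_file_name())
dictionary = corpora.Dictionary.load(self.get_dict_file_name())
lsi = models.LsiModel.load(self.get_model_list_name())
doc_model = Doc2Vec.load(self.get_model_file_name())
word_vectors = KeyedVectors.load_word2vec_format(self.get_word_model_name(),
                                                 binary=False)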
Example #15
    o_file = options.output_dir

    # default to the corpus directory when no input directory is given
    infile = options.input_dir
    if infile is None:
        infile = variables.CORPUS_DIR
        if options.action != 'convert':
            infile = os.path.join(variables.CORPUS_DIR, 'TEXT')

    commd = "-a " + options.action
    if infile:
        commd += " -i " + infile
    if o_file:
        commd += " -o " + o_file
    if options.doc_type:
        commd += " -t " + options.doc_type

    dtn_logger.logger_info("MAIN", commd)

    if options.action == 'convert':
        '''
        conv = Convert(verbose=verbose, restart=options.restart)   
        o_file = conv.open_output(infile, o_file)
        conv.files_to_text(infile, o_file)    
        conv.close_output()  
        '''
        ofile = docutonelocate.convert_file(infile, True)

    elif options.action == 'testfile':

        conv = Convert(verbose=verbose, restart=options.restart)

        conv.test_files_in_directory(infile, o_file)