Example #1
class TextRNN:
    def __init__(self, params):
        #printlogInit()

        self.model_name = 'RNN'
        self.framework_name = 'keras'

        #w2v parameters
        self.default_w2v_fp = params.get('default_w2v_fp', None)
        self.w2v_fp = params.get('w2v_fp', 'default')

        if self.w2v_fp == 'default':
            self.w2v_fp = self.default_w2v_fp
        # use an external embedding only when a w2v file is actually available
        self.use_external_embedding = self.w2v_fp is not None

        # layer parameters
        self.embedding_dim = int(params.get('embedding_dim', 300))
        # an explicit flag overrides the w2v-based inference above
        if 'use_external_embedding' in params:
            self.use_external_embedding = bool(params['use_external_embedding'])
        self.embedding_trainable = bool(params.get('embedding_trainable',
                                                   True))

        self.dropout_rate = float(params.get('dropout_rate', 0.5))

        self.rnn_cell_type = params.get('rnn_cell_type', 'LSTM')
        self.rnn_cell_size = int(params.get('rnn_cell_size', 256))

        self.dense_size = int(params.get('dense_size', 256))
        self.dense_activation = params.get('dense_activation', 'tanh')

        # model parameters
        self.model_loss = params.get('model_loss', 'categorical_crossentropy')
        self.model_optimizer = params.get('model_optimizer', 'Adadelta')
        self.model_metrics = params.get('model_metrics', ['accuracy'])
        self.model_epoch = int(params.get('model_epoch', 5))
        self.model_train_batchsize = int(
            params.get('model_train_batchsize', 128))
        self.model_test_batchsize = int(
            params.get('model_test_batchsize', 1024))
        self.rnn_mode = params.get('rnn_mode', 'last_op_mode')

        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "initialization finished.")

    def construct_graph(self, embedding_matrix=None):

        # set input layer
        input_layer = Input(shape=(self.max_sequence_length, ))

        # set embedding layer, with or without pretrained embeddings
        if self.use_external_embedding:
            assert embedding_matrix is not None
            # note: mask_zero=True is incompatible with the CuDNN cell types
            embedding_layer = Embedding(
                self.vocab_size,
                self.embedding_dim,
                mask_zero=True,
                weights=[embedding_matrix],
                input_length=self.max_sequence_length,
                trainable=self.embedding_trainable)(input_layer)
        else:
            embedding_layer = Embedding(
                self.vocab_size,
                self.embedding_dim,
                input_length=self.max_sequence_length,
                trainable=self.embedding_trainable)(input_layer)

        # pick the RNN cell type; default to LSTM
        if self.rnn_cell_type == 'GRU':
            rnn_cell = GRU(self.rnn_cell_size, return_sequences=True)
        elif self.rnn_cell_type == 'CuDNNLSTM':
            rnn_cell = CuDNNLSTM(self.rnn_cell_size, return_sequences=True)
        elif self.rnn_cell_type == 'CuDNNGRU':
            rnn_cell = CuDNNGRU(self.rnn_cell_size, return_sequences=True)
        else:
            rnn_cell = LSTM(self.rnn_cell_size, return_sequences=True)
        bi_rnn_layer = Bidirectional(rnn_cell)(embedding_layer)

        # reduce the bi-RNN output sequence according to rnn_mode
        if self.rnn_mode == 'last_op_mode':
            # keep only the final timestep
            MyLastOp = Lambda(lambda x: x[:, -1, :])
            bi_rnn_layer = MyLastOp(bi_rnn_layer)
        elif self.rnn_mode == 'average_op_mode':
            # mean-pool over timesteps
            MyAverageOp = Lambda(lambda x: mean(x, axis=1))
            bi_rnn_layer = MyAverageOp(bi_rnn_layer)

        #add dropout layer
        dropout_layer = Dropout(self.dropout_rate)(bi_rnn_layer)

        #add dense layer and output layer
        dense_layer = Dense(self.dense_size,
                            activation=self.dense_activation)(dropout_layer)
        output_layer = Dense(self.label_num, activation='softmax')(dense_layer)

        self.model = Model(inputs=input_layer, outputs=output_layer)

        #printlog(self.model.summary())

    def train(self, x, y, *, val_x=None, val_y=None):
        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "train process started.")

        train_start_time = time.time()

        df_x = x
        df_y = y
        if val_x is not None and val_y is not None:
            df_x = x + val_x
            df_y = y + val_y

        # max_sequence_length depends on the length of most samples
        self.max_sequence_length = text_length_stat(df_x, 0.98)
        self.tokenizer = NNTokenPadding(
            params={'max_sequence_length': self.max_sequence_length},
            text_set=df_x)

        #transform text dataset into sequence
        df_x, word_index = self.tokenizer.extract(df_x)
        #transform label set into one-hot sequence
        self.labelCategorizer = LabelCategorizer()
        self.labelCategorizer.fit_on_labels(df_y)
        df_y = self.labelCategorizer.to_category(df_y)

        df_train = df_x[:len(x)]
        df_train_label = df_y[:len(y)]
        df_val = df_x[len(x):]
        df_val_label = df_y[len(y):]

        # label_num depends on the samples
        self.label_num = df_y.shape[1]

        # vocab_size depends on word_index, which is returned by NNTokenPadding().extract()
        self.vocab_size = len(word_index) + 1

        # build embedding_matrix from the given w2v file
        embedding_matrix = None
        if self.use_external_embedding:
            assert self.w2v_fp is not None
            params = {'w2v_fp': self.w2v_fp}
            embedding_matrix = WordEmbedding(params).extract(word_index)
            # word_index is iterated over here to build the matrix
            self.embedding_dim = embedding_matrix.shape[1]

        #construct computation graph
        self.construct_graph(embedding_matrix)

        self.model.compile(loss=self.model_loss,
                           optimizer=self.model_optimizer,
                           metrics=self.model_metrics)

        # add callback function for progress exposure and early-stopping
        #es = EarlyStopping(monitor='val_loss', patience=0, verbose=1, mode='min')
        history = self.model.fit(df_train,
                                 df_train_label,
                                 validation_data=(df_val, df_val_label),
                                 epochs=self.model_epoch,
                                 verbose=1,
                                 batch_size=self.model_train_batchsize)
        #callbacks=[cb, es])

        self.train_report = history.history

        train_end_time = time.time()
        self.train_cost_time = train_end_time - train_start_time

        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "train process finished.")

    def predict(self, x):
        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "predict process started.")

        df_text, _ = self.tokenizer.extract(x)
        preds = self.model.predict(df_text,
                                   batch_size=self.model_test_batchsize,
                                   verbose=1)
        preds = np.argmax(preds, axis=1)
        preds = self.labelCategorizer.label_re_transform(preds)

        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "predict process finished.")
        return preds

    def score(self, x, y):
        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "score process started.")

        df_x, _ = self.tokenizer.extract(x)
        df_label = self.labelCategorizer.to_category(y)

        # scores[0] is loss, scores[1] is acc
        scores = self.model.evaluate(df_x, df_label, verbose=1)

        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "score process finished.")
        return scores[1]

    def save(self, mp):
        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "save process started.")

        self.model.save(mp + '.h5')
        with open(mp + '_tokenizer', 'wb') as f:
            pickle.dump(self.tokenizer, f)
        self.labelCategorizer.save('./' + self.get_model_name() +
                                   '_label_map_relation.txt')

        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "save process finished.")

    def load(self, mp):
        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "load process started.")

        self.model = load_model(mp + '.h5')
        with open(mp + '_tokenizer', 'rb') as f:
            self.tokenizer = pickle.load(f)
        self.labelCategorizer = LabelCategorizer()
        self.labelCategorizer.load('./' + self.get_model_name() +
                                   '_label_map_relation.txt')

        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "load process finished.")

    def get_default_args(self):
        params = {
            'use_external_embedding': False,
            'embedding_dim': 300,
            'embedding_trainable': True,
            'dropout_rate': 0.5,
            'rnn_cell_type': 'LSTM',
            'rnn_cell_size': 256,
            'dense_size': 256,
            'dense_activation': 'tanh',
            'model_loss': 'categorical_crossentropy',
            'model_optimizer': 'Adadelta',
            'model_metrics': ['accuracy'],
            'model_epoch': 5,
            'model_train_batchsize': 128,
            'model_test_batchsize': 1024,
            'rnn_mode': 'last_op_mode'
        }
        return params

    def get_framework_name(self):
        return self.framework_name

    def get_model_name(self):
        return self.model_name

    def get_train_report(self):
        return self.train_cost_time, self.train_report
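
A minimal, hypothetical smoke test for TextRNN, assuming the project's helper modules (text_length_stat, NNTokenPadding, LabelCategorizer, WordEmbedding) are importable and behave as used above, and that x/y are plain Python lists of raw texts and labels; the toy corpus and parameter values are illustrative only:

# hypothetical smoke test for TextRNN; corpus and params are illustrative
texts = ['the movie was surprisingly good and fun',
         'the plot was dull and far too long'] * 100
labels = ['pos', 'neg'] * 100

rnn = TextRNN(params={'rnn_cell_type': 'GRU', 'model_epoch': 2})
rnn.train(texts, labels, val_x=texts[:40], val_y=labels[:40])
print(rnn.predict(['a fun and good movie']))  # e.g. ['pos']
print(rnn.score(texts[:40], labels[:40]))     # accuracy on the held-out slice
train_time, history = rnn.get_train_report()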
Example #2
class TextCNN:
    def __init__(self, params):
        #printlogInit()
        self.model_name = 'CNN'
        self.framework_name = 'keras'

        #w2v parameters
        self.default_w2v_fp = params.get('default_w2v_fp', None)
        self.w2v_fp = params.get('w2v_fp', 'default')

        if self.w2v_fp == 'default':
            self.w2v_fp = self.default_w2v_fp
        # use an external embedding only when a w2v file is actually available
        self.use_external_embedding = self.w2v_fp is not None

        # layer parameters
        self.embedding_dim = int(params.get('embedding_dim', 300))
        # an explicit flag overrides the w2v-based inference above
        if 'use_external_embedding' in params:
            self.use_external_embedding = bool(params['use_external_embedding'])
        self.embedding_trainable = bool(params.get('embedding_trainable',
                                                   True))
        self.dropout_rate = float(params.get('dropout_rate', 0.5))
        self.filter_num = int(params.get('filter_num', 128))

        self.filter_sizes = [
            int(i) for i in params.get('filter_size', '3,4,5').split(',')
        ]
        self.conv_activation = params.get('conv_activation', 'tanh')
        self.conv_strides = int(params.get('conv_strides', 1))
        self.conv_padding = params.get('conv_padding', 'valid')

        self.pooling_sizes = [
            int(i) for i in params.get('pooling_size', '3,4,5').split(',')
        ]
        self.pooling_strides = [
            int(i) for i in params.get('pooling_strides', '3,4,5').split(',')
        ]

        self.pooling_padding = params.get('pooling_padding', 'valid')
        self.dense_size = int(params.get('dense_size', 256))
        self.dense_activation = params.get('dense_activation', 'tanh')

        self.model_loss = params.get('model_loss', 'categorical_crossentropy')
        self.model_optimizer = params.get('model_optimizer', 'Adadelta')

        self.model_metrics = params.get('model_metrics', ['accuracy'])
        self.model_epoch = int(params.get('model_epoch', 5))
        self.model_train_batchsize = int(
            params.get('model_train_batchsize', 128))
        self.model_test_batchsize = int(
            params.get('model_test_batchsize', 1024))

        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "initialization finished")

    def construct_graph(self, embedding_matrix=None):
        input_layer = Input(shape=(self.max_sequence_length, ))

        if self.use_external_embedding:
            assert embedding_matrix is not None
            embedding_layer = Embedding(
                self.vocab_size,
                self.embedding_dim,
                weights=[embedding_matrix],
                input_length=self.max_sequence_length,
                trainable=self.embedding_trainable)(input_layer)
        else:
            embedding_layer = Embedding(
                self.vocab_size,
                self.embedding_dim,
                input_length=self.max_sequence_length,
                trainable=self.embedding_trainable)(input_layer)

        # get conv-pool block for each filter size
        conv_blocks = []
        for i in range(len(self.filter_sizes)):
            conv = Conv1D(filters=self.filter_num,
                          kernel_size=self.filter_sizes[i],
                          strides=self.conv_strides,
                          padding=self.conv_padding,
                          activation=self.conv_activation)(embedding_layer)

            maxpool = MaxPooling1D(pool_size=self.pooling_sizes[i],
                                   strides=self.pooling_strides[i],
                                   padding=self.pooling_padding)(conv)

            flatten = Flatten()(maxpool)

            conv_blocks.append(flatten)

        # combine all flattened conv-pool features
        concatenate_layer = Concatenate()(conv_blocks)

        # add dropout
        dropout_layer = Dropout(self.dropout_rate)(concatenate_layer)

        # add dense layer and output layer
        dense_layer = Dense(units=self.dense_size,
                            activation=self.dense_activation)(dropout_layer)

        output_layer = Dense(units=self.label_num,
                             activation='softmax')(dense_layer)

        self.model = Model(inputs=input_layer, outputs=output_layer)

        #printlog(self.model.summary())

    def train(self, x, y, *, val_x=None, val_y=None):
        #printlog(self.get_framework_name() + " " + self.get_model_name())
        train_start_time = time.time()

        df_x = x
        df_y = y
        if val_x is not None and val_y is not None:
            df_x = x + val_x
            df_y = y + val_y

        # max_sequence_length depends on the length of most samples; see text_length_stat() for details
        self.max_sequence_length = text_length_stat(df_x, 0.98)
        self.tokenizer = NNTokenPadding(
            params={'max_sequence_length': self.max_sequence_length},
            text_set=df_x)

        # transform the text set into padded sequences
        df_x, word_index = self.tokenizer.extract(df_x)
        # transform label sets into one-hot sequence
        self.labelCategorizer = LabelCategorizer()
        self.labelCategorizer.fit_on_labels(df_y)
        df_y = self.labelCategorizer.to_category(df_y)

        df_train = df_x[:len(x)]
        df_train_label = df_y[:len(y)]
        df_val = df_x[len(x):]
        df_val_label = df_y[len(y):]

        # label_num depends on the samples
        self.label_num = df_y.shape[1]

        # vocab_size depends on word_index, which is returned by NNTokenPadding().extract()
        self.vocab_size = len(word_index) + 1

        # build embedding_matrix from the given w2v file
        embedding_matrix = None
        if self.use_external_embedding:
            assert self.w2v_fp is not None
            params = {'w2v_fp': self.w2v_fp}
            embedding_matrix = WordEmbedding(params).extract(word_index)
            self.embedding_dim = embedding_matrix.shape[1]

        # construct computation graph
        self.construct_graph(embedding_matrix)
        self.model.compile(loss=self.model_loss,
                           optimizer=self.model_optimizer,
                           metrics=self.model_metrics)
        # add callbacks for progress exposure and early stopping
        #cb = ProgressExposure(int(len(df_train)/self.model_train_batchsize +1) * self.model_epoch, self.progress_fp)
        es = EarlyStopping(monitor='val_loss',
                           patience=0,
                           verbose=1,
                           mode='min')

        #start training
        history = self.model.fit(
            df_train,
            df_train_label,
            validation_data=(df_val, df_val_label),
            epochs=self.model_epoch,
            batch_size=self.model_train_batchsize,
            #callbacks=[cb, es],verbose = 1)
            callbacks=[es],
            verbose=1)

        self.train_report = history.history
        train_end_time = time.time()
        self.train_cost_time = train_end_time - train_start_time

        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "train process finished.")

    def predict(self, x):
        #printlog(self.get_framework_name() + "" + self.get_model_name() + "" + "predict process started")
        df_text, _ = self.tokenizer.extract(x)
        preds = self.model.predict(df_text,
                                   batch_size=self.model_test_batchsize,
                                   verbose=1)
        preds = np.argmax(preds, axis=1)
        preds = self.labelCategorizer.label_re_transform(preds)

        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "predict process finished")
        return preds

    def score(self, x, y):
        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "score process started.")
        df_x, _ = self.tokenizer.extract(x)
        df_label = self.labelCategorizer.to_category(y)

        # scores[0] is loss, scores[1] is accuracy
        scores = self.model.evaluate(df_x, df_label, verbose=1)

        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "score process finished.")
        return scores[1]

    def save(self, mp):
        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "save process started.")
        self.model.save(mp + '.h5')
        with open(mp + '_tokenizer', 'wb') as f:
            pickle.dump(self.tokenizer, f)
        self.labelCategorizer.save("./" + self.get_model_name() +
                                   '_label_map_relation.txt')
        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "save process finished.")

    def load(self, mp):
        #printlog(self.get_framework_name() + " " + self.get_model_name() + " " + "load process started.")
        self.model = load_model(mp + '.h5')
        with open(mp + '_tokenizer', 'rb') as f:
            self.tokenizer = pickle.load(f)
        self.labelCategorizer = LabelCategorizer()
        self.labelCategorizer.load('./' + self.get_model_name() +
                                   '_label_map_relation.txt')
        #printlog(self.get_framework_name() + " " + self.get_model_name() + " "  + "load process finished.")

    def get_default_args(self):
        params = {
            'use_external_embedding': False,
            'embedding_trainable': True,
            'embedding_dim': 300,
            'dropout_rate': 0.5,
            'filter_num': 128,
            'filter_size': '3,4,5',
            'conv_activation': 'tanh',
            'conv_strides': 1,
            'conv_padding': 'valid',
            'pooling_size': '3,4,5',
            'pooling_strides': '3,4,5',
            'pooling_padding': 'valid',
            'dense_size': 256,
            'dense_activation': 'tanh',
            'model_loss': 'categorical_crossentropy',
            'model_optimizer': 'Adadelta',
            'model_metrics': ['accuracy'],
            'model_epoch': 5,
            'model_train_batchsize': 128,
            'model_test_batchsize': 1024
        }
        return params

    def get_framework_name(self):
        return self.framework_name

    def get_model_name(self):
        return self.model_name

    def get_train_report(self):
        return self.train_cost_time, self.train_report
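
The same kind of hypothetical smoke test for TextCNN, under the same assumptions about the helper modules. Note that filter_size, pooling_size, and pooling_strides are parsed from comma-separated strings of equal length, since the constructor builds one conv-pool block per filter size:

# hypothetical smoke test for TextCNN; corpus and params are illustrative
texts = ['the movie was surprisingly good and fun',
         'the plot was dull and far too long'] * 100
labels = ['pos', 'neg'] * 100

cnn = TextCNN(params={'filter_size': '2,3',
                      'pooling_size': '2,3',
                      'pooling_strides': '2,3',
                      'model_epoch': 2})
cnn.train(texts, labels, val_x=texts[:40], val_y=labels[:40])
print(cnn.predict(['a fun and good movie']))
cnn.save('./text_cnn_model')  # writes ./text_cnn_model.h5 plus tokenizer and label-map files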