Ejemplo n.º 1
0
    def train(self, train_dataset, remaining_time_budget=None):
        """model training on train_dataset.It can be seen as metecontroller
        
        :param train_dataset: tuple, (x_train, y_train)
            x_train: list of str, input training sentences.
            y_train: A `numpy.ndarray` matrix of shape (sample_count, class_num).
                     here `sample_count` is the number of examples in this dataset as train
                     set and `class_num` is the same as the class_num in metadata. The
                     values should be binary.
        :param remaining_time_budget:
:        """
        if self.done_training:
            return

        if self.call_num == 0:
           self.data_generator = DataGenerator(train_dataset, self.metadata)

        x_train,y_train = self.data_generator.sample_dataset_from_metadataset()
        x_train,feature_mode = self.data_generator.dataset_preporocess(x_train)
        


        if self.call_num == 0 :
            #self.data_generator.dataset_postprocess(x_train,y_train,'svm')
            self.model_manager = ModelGenerator(self.data_generator.feature_mode,
                                                load_pretrain_emb=self.load_pretrain_emb,
                                                fasttext_embeddings_index=self.fasttext_embeddings_index)


        self.model_name = self.model_manager.model_pre_select(self.call_num)
        #self.svm_token = self.data_generator.svm_token
        self.data_generator.dataset_postprocess(x_train,y_train,self.model_name)
        if self.call_num <= 1:
            self.model = self.model_manager.build_model(self.model_name,self.data_generator.data_feature)

        #self.data_generator.dataset_postprocess()

        if self.model_name == 'svm':
            self.model.fit(self.data_generator.x_train, ohe2cat(self.data_generator.y_train))
            self.svm_token = self.data_generator.svm_token
            valid_auc = self._valid_auc(self.data_generator.valid_x, self.data_generator.valid_y, svm=True)
            self.valid_auc_svm = valid_auc
            print ("valid_auc_svm",self.valid_auc_svm)
            # self.svm = False
        else:
            callbacks = None
            history = self.model.fit(
                self.data_generator.x_train,
                self.data_generator.y_train,
                epochs=NUM_EPOCH,
                callbacks=callbacks,
                validation_split=VALID_RATIO,
                validation_data=(self.data_generator.valid_x,self.data_generator.valid_y),
                verbose=2,
                batch_size=self.batch_size,
                shuffle=True)

            self.feedback_simulation(history)
Ejemplo n.º 2
0
class Model(object):
    """ 
        model of CNN baseline without pretraining.
        see `https://aclweb.org/anthology/D14-1181` for more information.
    """
    def __init__(self, metadata, train_output_path="./", test_input_path="./"):
        """ Initialization for model
        :param metadata: a dict formed like:
            {"class_num": 10,
             "language": ZH,
             "num_train_instances": 10000,
             "num_test_instances": 1000,
             "time_budget": 300}
        """
        self.done_training = False
        self.metadata = metadata
        self.train_output_path = train_output_path
        self.test_input_path = test_input_path
        self.model = None
        self.call_num = 0
        self.load_pretrain_emb = True
        self.emb_size = EMBEDDING_SIZE
        self.batch_size = INIT_BATCH_SIZE
        self.total_call_num = TOTAL_CALL_NUM
        self.valid_cost_list = []
        self.auc = 0
        self.svm = True
        self.svm_model = None
        self.svm_token = None
        self.tokenizer = None
        self.model_weights_list = []
        # 0: char based   1: word based   2: doc based
        self.feature_mode = 1

        # "text_cnn" "lstm" "sep_cnn_model"
        self.model_mode = 'text_cnn'
        self.fasttext_embeddings_index = None

        # 0: binary_crossentropy
        # 1: categorical_crossentropy
        # 2: sparse_categorical_crossentropy
        self.metric = 1

        self.num_features = MAX_VOCAB_SIZE
        # load pretrian embeding
        if self.load_pretrain_emb:
            self._load_emb()

    def train(self, train_dataset, remaining_time_budget=None):
        """model training on train_dataset.It can be seen as metecontroller
        
        :param train_dataset: tuple, (x_train, y_train)
            x_train: list of str, input training sentences.
            y_train: A `numpy.ndarray` matrix of shape (sample_count, class_num).
                     here `sample_count` is the number of examples in this dataset as train
                     set and `class_num` is the same as the class_num in metadata. The
                     values should be binary.
        :param remaining_time_budget:
:        """
        if self.done_training:
            return

        if self.call_num == 0:
            self.data_generator = DataGenerator(train_dataset, self.metadata)

        x_train, y_train = self.data_generator.sample_dataset_from_metadataset(
        )
        x_train, feature_mode = self.data_generator.dataset_preporocess(
            x_train)

        if self.call_num == 0:
            #self.data_generator.dataset_postprocess(x_train,y_train,'svm')
            self.model_manager = ModelGenerator(
                self.data_generator.feature_mode,
                load_pretrain_emb=self.load_pretrain_emb,
                fasttext_embeddings_index=self.fasttext_embeddings_index)

        self.model_name = self.model_manager.model_pre_select(self.call_num)
        #self.svm_token = self.data_generator.svm_token
        self.data_generator.dataset_postprocess(x_train, y_train,
                                                self.model_name)
        if self.call_num <= 1:
            self.model = self.model_manager.build_model(
                self.model_name, self.data_generator.data_feature)

        #self.data_generator.dataset_postprocess()

        if self.model_name == 'svm':
            self.model.fit(self.data_generator.x_train,
                           ohe2cat(self.data_generator.y_train))
            self.svm_token = self.data_generator.svm_token
            valid_auc = self._valid_auc(self.data_generator.valid_x,
                                        self.data_generator.valid_y,
                                        svm=True)
            self.valid_auc_svm = valid_auc
            print("valid_auc_svm", self.valid_auc_svm)
            # self.svm = False
        else:
            callbacks = None
            history = self.model.fit(
                self.data_generator.x_train,
                self.data_generator.y_train,
                epochs=NUM_EPOCH,
                callbacks=callbacks,
                validation_split=VALID_RATIO,
                validation_data=(self.data_generator.valid_x,
                                 self.data_generator.valid_y),
                verbose=2,
                batch_size=self.batch_size,
                shuffle=True)

            self.feedback_simulation(history)

    def _get_valid_columns(self, solution):
        """Get a list of column indices for which the column has more than one class.
        This is necessary when computing BAC or AUC which involves true positive and
        true negative in the denominator. When some class is missing, these scores
        don't make sense (or you have to add an epsilon to remedy the situation).

        Args:
          solution: array, a matrix of binary entries, of shape
            (num_examples, num_features)
        Returns:
          valid_columns: a list of indices for which the column has more than one
            class.
        """
        num_examples = solution.shape[0]
        col_sum = np.sum(solution, axis=0)
        valid_columns = np.where(1 - np.isclose(col_sum, 0) -
                                 np.isclose(col_sum, num_examples))[0]
        return valid_columns

    def _autodl_auc(self, solution, prediction, valid_columns_only=True):
        """Compute normarlized Area under ROC curve (AUC).
        Return Gini index = 2*AUC-1 for  binary classification problems.
        Should work for a vector of binary 0/1 (or -1/1)"solution" and any discriminant values
        for the predictions. If solution and prediction are not vectors, the AUC
        of the columns of the matrices are computed and averaged (with no weight).
        The same for all classification problems (in fact it treats well only the
        binary and multilabel classification problems). When `valid_columns` is not
        `None`, only use a subset of columns for computing the score.
        """
        if valid_columns_only:
            valid_columns = self._get_valid_columns(solution)
            if len(valid_columns) < solution.shape[-1]:
                logger.warning(
                    "Some columns in solution have only one class, " +
                    "ignoring these columns for evaluation.")
            solution = solution[:, valid_columns].copy()
            prediction = prediction[:, valid_columns].copy()
        label_num = solution.shape[1]
        auc = np.empty(label_num)
        for k in range(label_num):
            r_ = tiedrank(prediction[:, k])
            s_ = solution[:, k]
            if sum(s_) == 0:
                print(
                    "WARNING: no positive class example in class {}".format(k +
                                                                            1))
            npos = sum(s_ == 1)
            nneg = sum(s_ < 1)
            auc[k] = (sum(r_[s_ == 1]) - npos * (npos + 1) / 2) / (nneg * npos)
        return 2 * mvmean(auc) - 1

    def _valid_auc(self, x_valid, y_valid, svm=False):

        if svm:
            x_valid = self.svm_token.transform(x_valid)

            result = self.model.predict_proba(x_valid)
        else:
            result = self.model.predict(x_valid)

        return self._autodl_auc(y_valid, result)  # y_test

    def test(self, x_test, remaining_time_budget=None):
        """
        :param x_test: list of str, input test sentences.
        :param remaining_time_budget:
        :return: A `numpy.ndarray` matrix of shape (sample_count, class_num).
                 here `sample_count` is the number of examples in this dataset as test
                 set and `class_num` is the same as the class_num in metadata. The
                 values should be binary or in the interval [0,1].
        """
        # model = models.load_model(self.test_input_path + 'model.h5')

        train_num, self.test_num = self.metadata['train_num'], self.metadata[
            'test_num']
        self.class_num = self.metadata['class_num']
        print("num_samples_test:", self.test_num)
        print("num_class_test:", self.class_num)

        #if self.call_num == 0 or self.call_num == 1:
        if self.call_num == 0:
            # tokenizing Chinese words
            if self.metadata['language'] == 'ZH':
                x_test = clean_zh_text(x_test)
                if self.data_generator.feature_mode == 1:
                    x_test = list(map(_tokenize_chinese_words, x_test))
            else:
                x_test = clean_en_text(x_test)
            self.x_test_clean = x_test

            x_test = self.svm_token.transform(x_test)
            result = self.model.predict_proba(x_test)
            self.svm_result = result
            self.call_num = self.call_num + 1
            return result  # y_test
        if self.call_num == 1:
            self.tokenizer = self.data_generator.tokenizer
            x_test = self.tokenizer.texts_to_sequences(self.x_test_clean)
            self.x_test = sequence.pad_sequences(
                x_test, maxlen=self.data_generator.data_feature['max_length'])

        if self.selcet_svm:
            result = self.svm_result
            print("load svm again!!!")
        else:
            result = self.model.predict(self.x_test,
                                        batch_size=self.batch_size * 16)

        # Cumulative training times
        self.call_num = self.call_num + 1
        if self.call_num >= self.total_call_num:
            self.done_training = True

        return result  # y_test

    def _load_emb(self):
        # loading pretrained embedding

        FT_DIR = '/app/embedding'
        fasttext_embeddings_index = {}
        if self.metadata['language'] == 'ZH':
            f = gzip.open(os.path.join(FT_DIR, 'cc.zh.300.vec.gz'), 'rb')
        elif self.metadata['language'] == 'EN':
            f = gzip.open(os.path.join(FT_DIR, 'cc.en.300.vec.gz'), 'rb')
        else:
            raise ValueError('Unexpected embedding path:'
                             ' {unexpected_embedding}. '.format(
                                 unexpected_embedding=FT_DIR))

        for line in f.readlines():
            values = line.strip().split()
            if self.metadata['language'] == 'ZH':
                word = values[0].decode('utf8')
            else:
                word = values[0].decode('utf8')
            coefs = np.asarray(values[1:], dtype='float32')
            fasttext_embeddings_index[word] = coefs

        print('Found %s fastText word vectors.' %
              len(fasttext_embeddings_index))
        self.fasttext_embeddings_index = fasttext_embeddings_index

        # embedding lookup
        #EMBEDDING_DIM = self.emb_size
        #self.embedding_matrix = np.zeros((self.num_features, EMBEDDING_DIM))
        #return self.embedding_matrix

    def feedback_simulation(self, history):
        # Model Selection and Sample num from Feedback Dynamic Regulation of Simulator
        # Dynamic sampling ,if accuracy is lower than 0.65 ,Increase sample size
        self.sample_num_per_class = self.data_generator.sample_num_per_class
        if history.history['acc'][0] < 0.65:
            self.sample_num_per_class = min(
                4 * self.data_generator.sample_num_per_class,
                self.data_generator.max_sample_num_per_class)
        #TODO self.sample_num_per_class
        self.data_generator.set_sample_num_per_class(self.sample_num_per_class)

        # Early stop and restore weight automatic
        valid_auc = self._valid_auc(self.data_generator.valid_x,
                                    self.data_generator.valid_y)
        print("valid_auc: ", valid_auc)

        # select which model is activated
        self.selcet_svm = self.valid_auc_svm > valid_auc

        early_stop_conditon2 = self.call_num >= 3 and (
            self.valid_cost_list[self.call_num - 2] -
            valid_auc) > 0 and (self.valid_cost_list[self.call_num - 3] -
                                self.valid_cost_list[self.call_num - 2]) > 0
        pre_auc = self.auc
        self.auc = valid_auc
        self.valid_cost_list.append(valid_auc)
        early_stop_conditon1 = self.auc < pre_auc and self.auc > 0.8
        if early_stop_conditon1 or early_stop_conditon2:
            self.done_training = True
            if early_stop_conditon2:
                self.model.set_weights(self.model_weights_list[self.call_num -
                                                               3])
                print("load weight...")
            if self.call_num >= 1 and early_stop_conditon1:
                self.model.set_weights(self.model_weights_list[self.call_num -
                                                               2])
                print("load weight...")

        #print(str(type(self.x_train)) + " " + str(y_train.shape))

        model_weights = self.model.get_weights()
        self.model_weights_list.append(model_weights)