Example #1
 def data_generator(
     self,
     seq_type,
     x_train,
     x_valid,
     y_train,
     y_valid,
     x_len_train=None,
     x_len_valid=None,
 ):
     if seq_type == 'bucket':
         logger.info('using bucket sequences to speed up model training')
         train_batches = BucketIterator(self.task_type, self.transformer,
                                        x_len_train, x_train, y_train,
                                        self.nb_bucket, self.batch_size)
         valid_batches = BucketIterator(self.task_type, self.transformer,
                                        x_len_valid, x_valid, y_valid,
                                        self.nb_bucket, self.batch_size)
     elif seq_type == 'basic':
         train_batches = BasicIterator(self.task_type, self.transformer,
                                       x_train, y_train, self.batch_size)
         valid_batches = BasicIterator(self.task_type, self.transformer,
                                       x_valid, y_valid, self.batch_size)
     else:
         logger.warning(
             'invalid data iterator type, only supports "basic" or "bucket"'
         )
         sys.exit()
     return train_batches, valid_batches
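
A minimal usage sketch for data_generator (the trainer instance and the split variables are placeholders inferred from the signature above, not taken from the source):

    # Hypothetical usage -- assumes `trainer` is an instance of the class
    # defining data_generator and that the splits were produced upstream.
    train_batches, valid_batches = trainer.data_generator(
        'bucket',                    # or 'basic'
        x_train, x_valid, y_train, y_valid,
        x_len_train=x_len_train,     # per-sample lengths; needed for bucketing
        x_len_valid=x_len_valid)
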
Example #2
    def load_sl_data(self):
        """
        Reads a data file for text classification. The file should contain one document/text per line.
        The line should have the following formats:
        1. conll:
            word\ttag
            ...
            word\ttag

            word\ttag
            ...
        2. basic:
            word###tag\tword###tag\t...word###tag
        """
        data = (line.strip() for line in open(self.fname, 'r', encoding='utf8'))
        if self.data_format == 'basic':
            self.texts, self.labels = zip(
                *[zip(*[item.rsplit('###', 1) for item in line.split('\t')]) for line in data])
            self.texts = list(map(list, self.texts))
            self.labels = list(map(list, self.labels))
        elif self.data_format == 'conll':
            self.texts, self.labels = self.process_conll(data)
        else:
            logger.warning('invalid data format for sequence labeling task')
            sys.exit()
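
The nested zip/rsplit expression for the 'basic' format is dense, so here is a self-contained replay on two made-up lines showing what lands in texts and labels:

    # Each made-up line is token###tag pairs joined by tabs.
    data = ['I###O\tlove###O\tParis###B-LOC', 'hello###O\tworld###O']
    texts, labels = zip(
        *[zip(*[item.rsplit('###', 1) for item in line.split('\t')]) for line in data])
    texts = list(map(list, texts))    # [['I', 'love', 'Paris'], ['hello', 'world']]
    labels = list(map(list, labels))  # [['O', 'O', 'B-LOC'], ['O', 'O']]

rsplit('###', 1) splits only at the last '###', so a token that itself contains '#' still parses correctly.
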
Example #3
 def load(self, weight_fname, para_fname):
     if self.model_name == 'word_rnn':
         self.model = Word_RNN.load(weight_fname, para_fname)
     elif self.model_name == 'char_rnn':
         self.model = Char_RNN.load(weight_fname, para_fname)
     elif self.model_name == 'idcnn':
         self.model = IDCNN.load(weight_fname, para_fname)
     else:
         logger.warning('invalid model name')
         sys.exit()
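
A hedged call sketch (the receiver and both file names are placeholders):

    # Hypothetical file names; each model class is expected to expose a
    # load(weight_fname, para_fname) constructor-style method.
    tagger.load('word_rnn_weights.h5', 'word_rnn_params.json')
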
Example #4
 def get_model(self):
     if self.model_name == 'word_rnn':
         model = Word_RNN(
             nb_classes=self.config['nb_classes'],
             nb_tokens=self.config['nb_tokens'],
             nb_char_tokens=self.config['nb_char_tokens'],
             maxlen=self.config['maxlen'],
             embedding_dim=self.config['embedding_dim'],
             embeddings=self.config['token_embeddings'],
             inner_char=self.config['data']['inner_char'],
             use_crf=self.m_cfg['use_crf'],
             char_feature_method=self.m_cfg['char_feature_method'],
             integration_method=self.m_cfg['integration_method'],
             rnn_type=self.m_cfg['rnn_type'],
             nb_rnn_layers=self.m_cfg['nb_rnn_layers'],
             nb_filters=self.m_cfg['nb_filters'],
             conv_kernel_size=self.m_cfg['conv_kernel_size'],
             drop_rate=self.m_cfg['drop_rate'],
             re_drop_rate=self.m_cfg['re_drop_rate'],
             word_rnn_size=self.m_cfg['word_rnn_size'],
             embed_dropout_rate=self.m_cfg['embed_drop_rate'])
     elif self.model_name == 'char_rnn':
         model = Char_RNN(
             nb_classes=self.config['nb_classes'],
             nb_tokens=self.config['nb_tokens'],
             nb_seg_tokens=self.config['nb_seg_tokens'],
             nb_radical_tokens=self.config['nb_radical_tokens'],
             maxlen=self.config['maxlen'],
             embedding_dim=self.config['embedding_dim'],
             use_seg=self.config['use_seg'],
             use_radical=self.config['use_radical'],
             use_crf=self.m_cfg['use_crf'],
             rnn_type=self.m_cfg['rnn_type'],
             nb_rnn_layers=self.m_cfg['nb_rnn_layers'],
             drop_rate=self.m_cfg['drop_rate'],
             re_drop_rate=self.m_cfg['re_drop_rate'],
             char_rnn_size=self.m_cfg['char_rnn_size'],
             embed_dropout_rate=self.m_cfg['embed_drop_rate'])
     elif self.model_name == 'idcnn':
         model = IDCNN(nb_classes=self.config['nb_classes'],
                       nb_tokens=self.config['nb_tokens'],
                       maxlen=self.config['maxlen'],
                       embedding_dim=self.config['embedding_dim'],
                       embeddings=self.config['token_embeddings'],
                       use_crf=self.m_cfg['use_crf'],
                       nb_filters=self.m_cfg['nb_filters'],
                       conv_kernel_size=self.m_cfg['conv_kernel_size'],
                       drop_rate=self.m_cfg['drop_rate'],
                       repeat_times=self.m_cfg['repeat_times'],
                       dilation_rate=self.m_cfg['dilation_rate'],
                       embed_dropout_rate=self.m_cfg['embed_drop_rate'])
     else:
         logger.warning('The model name ' + self.model_name + ' is unknown')
         model = None
     return model
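
To make the expected key layout concrete, here is a sketch of the two config slices the 'word_rnn' branch reads. The key names are copied from the code above; every value is illustrative only:

    config = {
        'nb_classes': 10,                # all values here are made up
        'nb_tokens': 20000,
        'nb_char_tokens': 4000,
        'maxlen': 100,                   # Example #7 resets this to None for bucket iterators
        'embedding_dim': 128,
        'token_embeddings': None,        # or a pretrained embedding matrix
        'data': {'inner_char': True},
    }
    m_cfg = {                            # i.e. config['model']['word_rnn']
        'use_crf': True,
        'char_feature_method': 'rnn',
        'integration_method': 'concat',
        'rnn_type': 'lstm',
        'nb_rnn_layers': 2,
        'nb_filters': 64,
        'conv_kernel_size': 3,
        'drop_rate': 0.5,
        're_drop_rate': 0.15,
        'word_rnn_size': 128,
        'embed_drop_rate': 0.25,
    }
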
Example #5
 def load(self, weight_fname, para_fname):
     if self.model_name == 'bi_lstm_att':
         self.model = bi_lstm_attention.load(weight_fname, para_fname)
     elif self.model_name == 'transformer':
         self.model = Transformer.load(weight_fname, para_fname)
     elif self.model_name == 'text_cnn':
         self.model = textCNN.load(weight_fname, para_fname)
     elif self.model_name == 'dpcnn':
         self.model = DPCNN.load(weight_fname, para_fname)
     else:
         logger.warning('invalid model name')
         sys.exit()
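
Examples #3 and #5 repeat the same if/elif dispatch. A dictionary registry is a common way to collapse it; this is an alternative sketch, not the library's own code:

    # Alternative pattern: map model names to classes once, dispatch via dict.
    MODEL_REGISTRY = {
        'bi_lstm_att': bi_lstm_attention,
        'transformer': Transformer,
        'text_cnn': textCNN,
        'dpcnn': DPCNN,
    }

    def load(self, weight_fname, para_fname):
        try:
            model_cls = MODEL_REGISTRY[self.model_name]
        except KeyError:
            logger.warning('invalid model name')
            sys.exit()
        self.model = model_cls.load(weight_fname, para_fname)
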
Example #6
 def get_model(self):
     if self.model_name == 'bi_lstm_att':
         model = bi_lstm_attention(
             nb_classes=self.config['nb_classes'],
             nb_tokens=self.config['nb_tokens'],
             maxlen=self.config['maxlen'],
             embedding_dim=self.config['embedding_dim'],
             embeddings=self.config['token_embeddings'],
             rnn_size=self.m_cfg['rnn_size'],
             attention_dim=self.m_cfg['attention_dim'],
             final_dropout_rate=self.m_cfg['final_drop_rate'],
             embed_dropout_rate=self.m_cfg['embed_drop_rate'],
             return_attention=self.m_cfg['return_att'])
     elif self.model_name == 'transformer':
         model = Transformer(
             nb_classes=self.config['nb_classes'],
             nb_tokens=self.config['nb_tokens'],
             maxlen=self.config['maxlen'],
             embedding_dim=self.config['embedding_dim'],
             embeddings=self.config['token_embeddings'],
             pos_embed=self.m_cfg['pos_embed'],
             nb_transformer=self.m_cfg['nb_transformer'],
             final_dropout_rate=self.m_cfg['final_drop_rate'],
             embed_dropout_rate=self.m_cfg['embed_drop_rate'])
     elif self.model_name == 'text_cnn':
         model = textCNN(nb_classes=self.config['nb_classes'],
                         nb_tokens=self.config['nb_tokens'],
                         maxlen=self.config['maxlen'],
                         embedding_dim=self.config['embedding_dim'],
                         embeddings=self.config['token_embeddings'],
                         conv_kernel_size=self.m_cfg['conv_kernel_size'],
                         pool_size=self.m_cfg['pool_size'],
                         nb_filters=self.m_cfg['nb_filters'],
                         fc_size=self.m_cfg['fc_size'],
                         embed_dropout_rate=self.m_cfg['embed_drop_rate'])
     elif self.model_name == 'dpcnn':
         model = DPCNN(nb_classes=self.config['nb_classes'],
                       nb_tokens=self.config['nb_tokens'],
                       maxlen=self.config['maxlen'],
                       embedding_dim=self.config['embedding_dim'],
                       embeddings=self.config['token_embeddings'],
                       region_kernel_size=self.m_cfg['region_kernel_size'],
                       conv_kernel_size=self.m_cfg['conv_kernel_size'],
                       pool_size=self.m_cfg['pool_size'],
                       nb_filters=self.m_cfg['nb_filters'],
                       repeat_time=self.m_cfg['repeat_time'],
                       final_dropout_rate=self.m_cfg['final_drop_rate'],
                       embed_dropout_rate=self.m_cfg['embed_drop_rate'])
     else:
         logger.warning('The model name ' + self.model_name + ' is unknown')
         model = None
     return model
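
As with Example #4, a sketch of the model-specific slice for 'text_cnn'; the key names mirror the branch above, the values are illustrative:

    m_cfg = {                  # i.e. config['model']['text_cnn']
        'conv_kernel_size': [3, 4, 5],
        'pool_size': [2, 2, 2],
        'nb_filters': 128,
        'fc_size': 128,
        'embed_drop_rate': 0.25,
    }
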
Example #7
 def __init__(self, model_name, dataset: Dataset, seq_type='bucket'):
     self.model_name = model_name
     self.dataset = dataset
     self.transformer = dataset.transformer
     if dataset.mode == 'train':
         self.config = self.dataset.config
         self.m_cfg = self.config['model'][self.model_name]
         self.seq_type = seq_type
         if seq_type == 'bucket':
             self.config['maxlen'] = None
         self.model = self.get_model()
         self.model_trainer = self.get_trainer()
     elif dataset.mode in ('predict', 'eval'):
         pass
     else:
         logger.warning(
             'invalid mode name, only supports "train", "eval" and "predict"'
         )
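
A construction sketch; the wrapper's class name and the file paths are placeholders, and Dataset is assumed to be the class whose __init__ appears in Example #8:

    # Placeholder names throughout -- only the signature above is from the source.
    ds = Dataset(mode='train', fname='train.txt',
                 config=config, task_type='classification')
    wrapper = ModelWrapper('text_cnn', ds, seq_type='bucket')
    # With seq_type='bucket', config['maxlen'] is set to None, presumably so
    # each bucket pads to its own length rather than a global maximum.
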
Example #8
    def __init__(self, mode, fname='', tran_fname='',
                 config=None, task_type=None, data_format=''):
        self.mode = mode
        self.fname = fname
        self.inner_char = False
        self.use_seg = False
        self.use_radical = False
        self.radical_dict = None
        
        if data_format != '':
            self.data_format = data_format

        if config:
            self.basic_token = config['data']['basic_token']
        self.html_texts = re.compile(r'('+'|'.join(REGEX_STR)+')', re.UNICODE)

        if task_type:
            if mode == 'train' and config is None:
                logger.error('please specify the config file path')
                sys.exit()
            self.task_type = task_type
        else:
            try:
                self.task_type = re.findall(r'config_(\w+)\.yaml', config)[0]
            except (IndexError, TypeError):
                logger.error('please check your config filename')
                sys.exit()

        if mode == 'train':
            if 'data' in config:
                self.config = config
                self.data_config = config['data']
                self.embed_config = config['embed']
                if self.task_type == 'sequence_labeling':
                    self.data_format = self.data_config['format']
                if self.basic_token == 'word':
                    self.max_tokens = self.data_config['max_words']
                    self.inner_char = self.data_config['inner_char']
                elif self.basic_token == 'char':
                    self.max_tokens = self.data_config['max_chars']
                    if self.task_type == 'sequence_labeling':
                        self.use_seg = self.data_config['use_seg']
                        self.use_radical = self.data_config['use_radical']
                        if self.config['train']['metric'] not in ['f1_seq']:
                            self.config['train']['metric'] = 'f1_seq'
                            logger.warning('sequence labeling task currently only supports the f1_seq callback')
                    elif self.task_type == 'classification':
                        if self.config['train']['metric'] in ['f1_seq']:
                            self.config['train']['metric'] = 'f1'
                            logger.warning('text classification task does not support the f1_seq callback, changed to f1')
                else:
                    logger.error('invalid token type, only supports "word" and "char"')
                    sys.exit()
            else:
                logger.error("please pass in the correct config dict")
                sys.exit()

            if self.basic_token == 'char':
                self.use_seg = config['data']['use_seg']
                self.use_radical = config['data']['use_radical']

            if self.use_radical:
                radical_file = Path(os.path.dirname(
                    os.path.realpath(__file__))) / 'data' / 'radical.txt'
                self.radical_dict = {line.split()[0]: line.split()[1].strip()
                                     for line in open(radical_file, encoding='utf8')}

            self.transformer = IndexTransformer(
                task_type=self.task_type,
                max_tokens=self.max_tokens,
                max_inner_chars=self.data_config['max_inner_chars'],
                use_inner_char=self.inner_char,
                use_seg=self.use_seg,
                use_radical=self.use_radical,
                radical_dict=self.radical_dict,
                basic_token=self.basic_token)

        else:
            if len(tran_fname) > 0:
                self.transformer = IndexTransformer.load(tran_fname)
                logger.info('transformer loaded')
                self.basic_token = self.transformer.basic_token
                self.use_seg = self.transformer.use_seg
                self.use_radical = self.transformer.use_radical
                self.inner_char = self.transformer.use_inner_char
                self.max_tokens = self.transformer.max_tokens
            else:
                logger.error("please pass in the transformer's filepath")
                sys.exit()

        if fname:
            self.load_data()
            self.fit()
        else:
            self.texts = []
            self.labels = []
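
The radical_dict comprehension near the end expects a whitespace-separated two-column file. A self-contained replay on made-up lines (the character/radical pairs are only examples):

    # Two-column layout of radical.txt: <char> <radical> per line.
    lines = ['江 氵', '想 心']
    radical_dict = {line.split()[0]: line.split()[1].strip() for line in lines}
    # {'江': '氵', '想': '心'}
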