def data_generator(
        self,
        seq_type,
        x_train,
        x_valid,
        y_train,
        y_valid,
        x_len_train=None,
        x_len_valid=None,
):
    """Build training and validation batch iterators.

    Args:
        seq_type: 'bucket' to group samples of similar length into buckets
            (less padding, faster training), or 'basic' for plain batching.
        x_train, x_valid: input features for the two splits.
        y_train, y_valid: labels for the two splits.
        x_len_train, x_len_valid: per-sample lengths; only used by the
            bucket iterator.

    Returns:
        (train_batches, valid_batches) iterator pair.

    Raises:
        ValueError: if ``seq_type`` is neither 'basic' nor 'bucket'.
    """
    if seq_type == 'bucket':
        logger.info('use bucket sequence to speed up model training')
        train_batches = BucketIterator(
            self.task_type, self.transformer, x_len_train,
            x_train, y_train, self.nb_bucket, self.batch_size)
        valid_batches = BucketIterator(
            self.task_type, self.transformer, x_len_valid,
            x_valid, y_valid, self.nb_bucket, self.batch_size)
    elif seq_type == 'basic':
        train_batches = BasicIterator(
            self.task_type, self.transformer,
            x_train, y_train, self.batch_size)
        valid_batches = BasicIterator(
            self.task_type, self.transformer,
            x_valid, y_valid, self.batch_size)
    else:
        logger.warning(
            'invalid data iterator type, only supports "basic" or "bucket"'
        )
        # BUG FIX: previously execution fell through to the return statement
        # with train_batches/valid_batches unbound, raising a confusing
        # NameError. Fail fast with an explicit error instead.
        raise ValueError(
            'invalid seq_type: {!r}, only "basic" or "bucket" are supported'.format(seq_type))
    return train_batches, valid_batches
def load_sl_data(self):
    """Read a data file for a sequence-labeling task.

    The file should contain one document/text per line. The line should
    have one of the following formats:
        1. conll: word\ttag ... word\ttag word\ttag ...
        2. basic: word###tag\tword###tag\t...word###tag

    Populates ``self.texts`` and ``self.labels`` as parallel lists of
    token lists and tag lists.
    """
    # BUG FIX: the handle used to be opened inside a generator expression
    # and never closed; ``with`` guarantees it is released. (The original
    # docstring also wrongly said "text classification".)
    with open(self.fname, 'r', encoding='utf8') as fin:
        data = (line.strip() for line in fin)
        if self.data_format == 'basic':
            # rsplit('###', 1) splits only on the LAST separator, so tokens
            # that themselves contain '###' survive intact.
            self.texts, self.labels = zip(
                *[zip(*[item.rsplit('###', 1) for item in line.split('\t')])
                  for line in data])
            self.texts = list(map(list, self.texts))
            self.labels = list(map(list, self.labels))
        elif self.data_format == 'conll':
            # process_conll consumes the generator while the file is open
            self.texts, self.labels = self.process_conll(data)
        else:
            logger.warning('invalid data format for sequence labeling task')
            sys.exit()
def load(self, weight_fname, para_fname):
    """Restore a trained sequence-labeling model from saved weights/params."""
    loaders = {
        'word_rnn': Word_RNN,
        'char_rnn': Char_RNN,
        'idcnn': IDCNN,
    }
    model_cls = loaders.get(self.model_name)
    if model_cls is None:
        logger.warning('invalid model name')
        sys.exit()
    self.model = model_cls.load(weight_fname, para_fname)
def get_model(self):
    """Instantiate the sequence-labeling model named by ``self.model_name``.

    Returns:
        A freshly built Word_RNN, Char_RNN or IDCNN instance, or None when
        the model name is unknown (a warning is logged in that case).
    """
    # ``self.config`` holds dataset-level values (vocab sizes, maxlen,
    # embeddings); ``self.m_cfg`` is the hyper-parameter section for this
    # particular model from the config file.
    if self.model_name == 'word_rnn':
        # word-level RNN, optionally augmented with inner-character features
        model = Word_RNN(
            nb_classes=self.config['nb_classes'],
            nb_tokens=self.config['nb_tokens'],
            nb_char_tokens=self.config['nb_char_tokens'],
            maxlen=self.config['maxlen'],
            embedding_dim=self.config['embedding_dim'],
            embeddings=self.config['token_embeddings'],
            inner_char=self.config['data']['inner_char'],
            use_crf=self.m_cfg['use_crf'],
            char_feature_method=self.m_cfg['char_feature_method'],
            integration_method=self.m_cfg['integration_method'],
            rnn_type=self.m_cfg['rnn_type'],
            nb_rnn_layers=self.m_cfg['nb_rnn_layers'],
            nb_filters=self.m_cfg['nb_filters'],
            conv_kernel_size=self.m_cfg['conv_kernel_size'],
            drop_rate=self.m_cfg['drop_rate'],
            re_drop_rate=self.m_cfg['re_drop_rate'],
            word_rnn_size=self.m_cfg['word_rnn_size'],
            embed_dropout_rate=self.m_cfg['embed_drop_rate'])
    elif self.model_name == 'char_rnn':
        # char-level RNN with optional segmentation/radical side features
        model = Char_RNN(
            nb_classes=self.config['nb_classes'],
            nb_tokens=self.config['nb_tokens'],
            nb_seg_tokens=self.config['nb_seg_tokens'],
            nb_radical_tokens=self.config['nb_radical_tokens'],
            maxlen=self.config['maxlen'],
            embedding_dim=self.config['embedding_dim'],
            use_seg=self.config['use_seg'],
            use_radical=self.config['use_radical'],
            use_crf=self.m_cfg['use_crf'],
            rnn_type=self.m_cfg['rnn_type'],
            nb_rnn_layers=self.m_cfg['nb_rnn_layers'],
            drop_rate=self.m_cfg['drop_rate'],
            re_drop_rate=self.m_cfg['re_drop_rate'],
            char_rnn_size=self.m_cfg['char_rnn_size'],
            embed_dropout_rate=self.m_cfg['embed_drop_rate'])
    elif self.model_name == 'idcnn':
        # iterated dilated CNN tagger
        model = IDCNN(nb_classes=self.config['nb_classes'],
                      nb_tokens=self.config['nb_tokens'],
                      maxlen=self.config['maxlen'],
                      embedding_dim=self.config['embedding_dim'],
                      embeddings=self.config['token_embeddings'],
                      use_crf=self.m_cfg['use_crf'],
                      nb_filters=self.m_cfg['nb_filters'],
                      conv_kernel_size=self.m_cfg['conv_kernel_size'],
                      drop_rate=self.m_cfg['drop_rate'],
                      repeat_times=self.m_cfg['repeat_times'],
                      dilation_rate=self.m_cfg['dilation_rate'],
                      embed_dropout_rate=self.m_cfg['embed_drop_rate'])
    else:
        # unknown name: warn and return None rather than raising
        logger.warning('The model name ' + self.model_name + ' is unknown')
        model = None
    return model
def load(self, weight_fname, para_fname):
    """Restore a trained classification model from saved weights/params."""
    loaders = {
        'bi_lstm_att': bi_lstm_attention,
        'transformer': Transformer,
        'text_cnn': textCNN,
        'dpcnn': DPCNN,
    }
    model_cls = loaders.get(self.model_name)
    if model_cls is None:
        logger.warning('invalid model name')
        sys.exit()
    self.model = model_cls.load(weight_fname, para_fname)
def get_model(self):
    """Instantiate the text-classification model named by ``self.model_name``.

    Returns:
        A freshly built bi_lstm_attention, Transformer, textCNN or DPCNN
        instance, or None when the model name is unknown (a warning is
        logged in that case).
    """
    # ``self.config`` holds dataset-level values (class/vocab counts,
    # maxlen, embeddings); ``self.m_cfg`` is the hyper-parameter section
    # for this particular model from the config file.
    if self.model_name == 'bi_lstm_att':
        model = bi_lstm_attention(
            nb_classes=self.config['nb_classes'],
            nb_tokens=self.config['nb_tokens'],
            maxlen=self.config['maxlen'],
            embedding_dim=self.config['embedding_dim'],
            embeddings=self.config['token_embeddings'],
            rnn_size=self.m_cfg['rnn_size'],
            attention_dim=self.m_cfg['attention_dim'],
            final_dropout_rate=self.m_cfg['final_drop_rate'],
            embed_dropout_rate=self.m_cfg['embed_drop_rate'],
            return_attention=self.m_cfg['return_att'])
    elif self.model_name == 'transformer':
        model = Transformer(
            nb_classes=self.config['nb_classes'],
            nb_tokens=self.config['nb_tokens'],
            maxlen=self.config['maxlen'],
            embedding_dim=self.config['embedding_dim'],
            embeddings=self.config['token_embeddings'],
            pos_embed=self.m_cfg['pos_embed'],
            nb_transformer=self.m_cfg['nb_transformer'],
            final_dropout_rate=self.m_cfg['final_drop_rate'],
            embed_dropout_rate=self.m_cfg['embed_drop_rate'])
    elif self.model_name == 'text_cnn':
        model = textCNN(nb_classes=self.config['nb_classes'],
                        nb_tokens=self.config['nb_tokens'],
                        maxlen=self.config['maxlen'],
                        embedding_dim=self.config['embedding_dim'],
                        embeddings=self.config['token_embeddings'],
                        conv_kernel_size=self.m_cfg['conv_kernel_size'],
                        pool_size=self.m_cfg['pool_size'],
                        nb_filters=self.m_cfg['nb_filters'],
                        fc_size=self.m_cfg['fc_size'],
                        embed_dropout_rate=self.m_cfg['embed_drop_rate'])
    elif self.model_name == 'dpcnn':
        model = DPCNN(nb_classes=self.config['nb_classes'],
                      nb_tokens=self.config['nb_tokens'],
                      maxlen=self.config['maxlen'],
                      embedding_dim=self.config['embedding_dim'],
                      embeddings=self.config['token_embeddings'],
                      region_kernel_size=self.m_cfg['region_kernel_size'],
                      conv_kernel_size=self.m_cfg['conv_kernel_size'],
                      pool_size=self.m_cfg['pool_size'],
                      nb_filters=self.m_cfg['nb_filters'],
                      repeat_time=self.m_cfg['repeat_time'],
                      final_dropout_rate=self.m_cfg['final_drop_rate'],
                      embed_dropout_rate=self.m_cfg['embed_drop_rate'])
    else:
        # unknown name: warn and return None rather than raising
        logger.warning('The model name ' + self.model_name + ' is unknown')
        model = None
    return model
def __init__(self, model_name, dataset: Dataset, seq_type='bucket'):
    """Bind a model wrapper to its dataset.

    In 'train' mode the model and trainer are built immediately from the
    dataset's config; in 'predict'/'eval' mode nothing is constructed
    up-front (the model is expected to be loaded separately).
    """
    self.model_name = model_name
    self.dataset = dataset
    self.transformer = dataset.transformer
    mode = dataset.mode
    if mode in ('predict', 'eval'):
        # nothing to set up now; load() provides the model later
        return
    if mode != 'train':
        logger.warning(
            'invalid mode name. Current only support "train" "eval" "predict"'
        )
        return
    self.config = self.dataset.config
    self.m_cfg = self.config['model'][self.model_name]
    self.seq_type = seq_type
    # bucket iterators pad per bucket, so no global max length is enforced
    if seq_type == 'bucket':
        self.config['maxlen'] = None
    self.model = self.get_model()
    self.model_trainer = self.get_trainer()
def __init__(self, mode, fname='', tran_fname='',
             config=None, task_type=None, data_format=''):
    """Initialize the dataset wrapper for training or inference.

    Args:
        mode: 'train' builds a new IndexTransformer from ``config``; any
            other mode loads a saved transformer from ``tran_fname``.
        fname: data file path; when empty, texts/labels start as empty lists.
        tran_fname: path of a previously saved IndexTransformer (non-train).
        config: in train mode the parsed config dict; when ``task_type`` is
            omitted it is instead matched as the config *filename* against
            ``config_<task>.yaml`` — note the dual use.
        task_type: explicit task name (e.g. 'classification',
            'sequence_labeling'); inferred from the config filename if absent.
        data_format: overrides the data file format when non-empty.
    """
    self.mode = mode
    self.fname = fname
    # feature toggles default to off; enabled below depending on config
    self.inner_char = False
    self.use_seg = False
    self.use_radical = False
    self.radical_dict = None
    if data_format != '':
        self.data_format = data_format
    if config:
        self.basic_token = config['data']['basic_token']
    # pattern used elsewhere to strip/recognize HTML-like snippets in raw text
    self.html_texts = re.compile(r'('+'|'.join(REGEX_STR)+')', re.UNICODE)
    if task_type:
        if mode == 'train' and config is None:
            logger.error('please specify the config file path')
            sys.exit()
        self.task_type = task_type
    else:
        # infer the task from the config filename: config_<task>.yaml
        try:
            self.task_type = re.findall(r'config_(\w+)\.yaml', config)[0]
        # NOTE(review): bare except also swallows TypeError when ``config``
        # is a dict — consider narrowing to (TypeError, IndexError)
        except:
            logger.error('please check your config filename')
            sys.exit()
    if mode == 'train':
        if 'data' in config:
            self.config = config
            self.data_config = config['data']
            self.embed_config = config['embed']
            if self.task_type == 'sequence':
                self.data_format = self.data_config['format']
            if self.basic_token == 'word':
                self.max_tokens = self.data_config['max_words']
                self.inner_char = self.data_config['inner_char']
            elif self.basic_token == 'char':
                self.max_tokens = self.data_config['max_chars']
                # force the metric callback that matches the task; warn when
                # the configured one is incompatible
                if self.task_type == 'sequence_labeling':
                    self.use_seg = self.data_config['use_seg']
                    self.use_radical = self.data_config['use_radical']
                    if self.config['train']['metric'] not in ['f1_seq']:
                        self.config['train']['metric'] = 'f1_seq'
                        logger.warning(
                            'sequence labeling task currently only support f1_seq callback')
                elif self.task_type == 'classification':
                    if self.config['train']['metric'] in ['f1_seq']:
                        self.config['train']['metric'] = 'f1'
                        logger.warning(
                            'text classification task not support f1_seq callback, changed to f1')
            else:
                logger.error('invalid token type, only support word and char')
                sys.exit()
        else:
            logger.error("please pass in the correct config dict")
            sys.exit()
        if self.basic_token == 'char':
            self.use_seg = config['data']['use_seg']
            self.use_radical = config['data']['use_radical']
            if self.use_radical:
                # radical lookup table shipped next to this module
                radical_file = Path(os.path.dirname(
                    os.path.realpath(__file__))) / 'data' / 'radical.txt'
                # NOTE(review): this handle is never closed — consider ``with``
                self.radical_dict = {line.split()[0]: line.split()[1].strip()
                                     for line in open(radical_file, encoding='utf8')}
        self.transformer = IndexTransformer(
            task_type=self.task_type,
            max_tokens=self.max_tokens,
            max_inner_chars=self.data_config['max_inner_chars'],
            use_inner_char=self.inner_char,
            use_seg=self.use_seg,
            use_radical=self.use_radical,
            radical_dict=self.radical_dict,
            basic_token=self.basic_token)
    elif mode != 'train':
        if len(tran_fname) > 0:
            logger.info('transformer loaded')
            self.transformer = IndexTransformer.load(tran_fname)
            # mirror the loaded transformer's settings onto the dataset
            self.basic_token = self.transformer.basic_token
            self.use_seg = self.transformer.use_seg
            self.use_radical = self.transformer.use_radical
            self.inner_char = self.transformer.use_inner_char
            self.max_tokens = self.transformer.max_tokens
        else:
            logger.error("please pass in the transformer's filepath")
            sys.exit()
    if fname:
        self.load_data()
        self.fit()
    else:
        self.texts = []
        self.labels = []