def init_dataset(opts, features, max_len, use_char_feature, max_len_char):
    """Build the test-set DataLoader and load the label vocabulary.

    Args:
        opts: parsed options object. The attributes read here are
            ``root_idx``, ``root_voc``, ``batch_size`` and ``nb_work``.
        features: list of int, the feature ids whose vocabulary files
            (``feature_<id>_2id.pkl`` under ``opts.root_voc``) are loaded.
        max_len: int, forwarded to ``SentenceDataUtil``.
        use_char_feature: bool, forwarded to ``SentenceDataUtil``.
        max_len_char: int, forwarded to ``SentenceDataUtil``.

    Returns:
        tuple: ``(data_loader_test, label2id_dict)`` -- a non-shuffled
        ``DataLoader`` over ``dataset.get_all_data()`` and the object read
        from ``label2id.pkl`` under ``opts.root_voc``.
    """
    root_idx = opts.root_idx
    root_voc = opts.root_voc
    path_num = os.path.join(root_idx, 'nums.txt')

    # One vocabulary file per requested feature, keyed by the feature id.
    feature2id_dict = {
        feature_i: read_pkl(
            os.path.join(root_voc, 'feature_{0}_2id.pkl'.format(feature_i)))
        for feature_i in features
    }
    label2id_dict = read_pkl(os.path.join(root_voc, 'label2id.pkl'))

    # Initialise the data. Shuffling is disabled both in the dataset helper
    # and in the loader (presumably so outputs stay aligned with the input
    # order -- confirm against the caller).
    dataset = SentenceDataUtil(
        path_num, root_idx, max_len, features, feature2id_dict,
        max_len_char=max_len_char, use_char_feature=use_char_feature,
        shuffle=False)
    dataset_test = dataset.get_all_data()
    data_loader_test = DataLoader(
        dataset_test, batch_size=opts.batch_size, shuffle=False,
        num_workers=opts.nb_work)
    return data_loader_test, label2id_dict
def init_model_args(self):
    """Derive model hyper-parameters from ``self.opts`` and store them.

    Reads the optional pretrained embedding (``word2vec.pkl`` under
    ``opts.root_embed``), computes per-feature vocabulary sizes and
    embedding dimensions, and stores the resulting keyword arguments in
    ``self.model_kwargs``. Also copies several training settings
    (``learn_rate``, ``nb_epoch``, ``max_patience``, ``root_model``,
    ``use_crf``) and the device settings onto the instance.

    Raises:
        AssertionError: if ``opts.requires_grad`` is neither ``'True'``
            nor ``'False'`` (it is expected as a string).
    """
    opts = self.opts

    # The pretrained embedding is optional: a missing file yields None.
    embed_path = os.path.join(opts.root_embed, 'word2vec.pkl')
    pretrained_embed = (
        read_pkl(embed_path) if os.path.exists(embed_path) else None)

    # Vocabulary length + 1 for every known feature (the extra slot is
    # presumably reserved for a padding index -- confirm with the model).
    feature_size_dict = {
        name: len(vocab) + 1
        for name, vocab in self.feature2id_dict.items()
    }

    # Dimensions are taken positionally from opts.feature_dim; features
    # beyond the end of that list fall back to 32.
    feature_dim_dict = {}
    for position, name in enumerate(self.features):
        has_configured_dim = position < len(opts.feature_dim)
        feature_dim_dict[name] = (
            opts.feature_dim[position] if has_configured_dim else 32)

    if pretrained_embed is not None:
        # The pretrained vector width takes precedence for the first feature.
        # NOTE(review): the key is str(features[0]) while the loop above uses
        # the raw feature value; if features are not strings this adds a
        # second key instead of overriding -- confirm which key the model
        # reads.
        first_feature_key = str(self.features[0])
        feature_dim_dict[first_feature_key] = pretrained_embed.shape[-1]

    self.use_cuda = opts.cuda
    self.device_ids = opts.device_ids
    self.multi_gpu = len(opts.device_ids) > 1

    # requires_grad arrives as the string 'True' / 'False'.
    assert opts.requires_grad in ('False', 'True')
    requires_grad = opts.requires_grad == 'True'

    self.model_kwargs = {
        'features': self.features,
        'lstm_units': opts.lstm_units,
        'layer_nums': opts.layer_nums,
        'feature_size_dict': feature_size_dict,
        'feature_dim_dict': feature_dim_dict,
        'pretrained_embed': pretrained_embed,
        'dropout_rate': opts.dropout,
        'max_len': self.max_len,
        'use_cuda': self.use_cuda,
        'use_char_feature': self.use_char_feature,
        'char_binary': self.char_binary,
        'dim_char': opts.dim_char,
        'filter_sizes': opts.filter_sizes,
        'filter_nums': opts.filter_nums,
        'requires_grad': requires_grad,
        'max_len_char': self.max_len_char,
        'use_crf': opts.use_crf,
        'learn_rate': opts.learn_rate,
    }

    # Training settings kept directly on the instance.
    self.learn_rate = opts.learn_rate
    self.nb_epoch = opts.nb_epoch
    self.max_patience = opts.max_patience
    self.root_model = opts.root_model
    self.use_crf = opts.use_crf
def init_data_args(self):
    """Copy data-related options from ``self.opts`` and load vocabularies.

    Sets the seed, index directories, ``nums.txt`` path, sequence length
    limits and loader settings on the instance. Loads one
    ``feature_<id>_2id.pkl`` per entry of ``opts.features`` plus
    ``label2id.pkl`` from ``opts.root_voc``; the label mapping is stored
    both as ``self.label2id_dict`` and under the ``'label'`` key of
    ``self.feature2id_dict``.
    """
    opts = self.opts

    self.seed = opts.seed
    self.root_idx_train = opts.root_idx_train
    self.root_idx_dev = opts.root_idx_dev
    self.path_num_train = os.path.join(opts.root_idx_train, 'nums.txt')
    self.max_len = opts.max_len
    self.root_voc = opts.root_voc
    self.features = opts.features

    # One vocabulary file per configured feature, keyed by the feature id.
    vocab_file_template = 'feature_{0}_2id.pkl'
    self.feature2id_dict = {}
    for feature_id in opts.features:
        vocab_path = os.path.join(
            self.root_voc, vocab_file_template.format(feature_id))
        self.feature2id_dict[feature_id] = read_pkl(vocab_path)

    # The label mapping lives alongside the feature vocabularies.
    label_path = os.path.join(self.root_voc, 'label2id.pkl')
    self.label2id_dict = read_pkl(label_path)
    self.feature2id_dict['label'] = self.label2id_dict
    self.has_label = True

    self.dev_size = opts.dev_size
    self.batch_size = opts.batch_size
    self.num_worker = opts.nb_work
    self.use_char_feature = opts.use_char_feature
    self.char_binary = opts.char_binary
    self.max_len_char = opts.max_len_char
# Script fragment: finishes command-line option parsing, then loads the
# vocabularies and splits the indexed dataset into train and dev parts.
# The first line is the tail of a call that starts above this chunk
# (presumably an op.add_option(...) for the GPU switch; its help text reads
# "whether to use GPU acceleration").
default=False, help='是否使用GPU加速')
# --nw: worker count for data loading (help text: "number of threads for
# loading data"), stored as opts.nb_work, default 8.
op.add_option('--nw', dest='nb_work', default=8, type='int',
              help='加载数据的线程数')
# In an interactive session no command-line arguments are parsed.
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)

# Initialise data parameters.
root_idx = opts.root_idx
path_num = os.path.join(root_idx, 'nums.txt')
max_len = opts.max_len
root_voc = opts.root_voc
features = opts.features
# One vocabulary file per feature, keyed by the feature id.
feature2id_dict = dict()
for feature_i in opts.features:
    path_f2id = os.path.join(root_voc, 'feature_{0}_2id.pkl'.format(feature_i))
    feature2id_dict[feature_i] = read_pkl(path_f2id)
label2id_dict = read_pkl(os.path.join(root_voc, 'label2id.pkl'))
# The label mapping is registered next to the feature vocabularies.
feature2id_dict['label'] = label2id_dict
# Not read within this chunk; presumably consumed further down -- confirm.
has_label = True
dev_size = opts.dev_size
batch_size = opts.batch_size
num_worker = opts.nb_work

# Initialise the data: build the dataset helper without shuffling and split
# it into train and dev sets using dev_size.
dataset = SentenceDataUtil(path_num, root_idx, max_len, features,
                           feature2id_dict, shuffle=False)
dataset_train, dataset_dev = dataset.split_train_and_dev(dev_size=dev_size)
# 加载模型 path_model = opts.path_model sl_model = torch.load(path_model) sl_model.set_use_cuda(opts.cuda) if opts.max_len: sl_model.max_len = opts.max_len # 初始化数据参数 root_idx = opts.root_idx path_num = os.path.join(root_idx, 'nums.txt') root_voc = opts.root_voc feature2id_dict = dict() for feature_i in sl_model.features: path_f2id = os.path.join(root_voc, 'feature_{0}_2id.pkl'.format(feature_i)) feature2id_dict[feature_i] = read_pkl(path_f2id) label2id_dict = read_pkl(os.path.join(root_voc, 'label2id.pkl')) has_label = False batch_size = opts.batch_size use_cuda = opts.cuda num_worker = opts.nb_work path_result = opts.output t0 = time() # 初始化数据 dataset = SentenceDataUtil( path_num, root_idx, sl_model.max_len, sl_model.features, feature2id_dict, shuffle=False) dataset_test = dataset.get_all_data() data_loader_test = DataLoader( dataset_test, batch_size=batch_size, shuffle=False, num_workers=num_worker)