Example #1
import os

from torch.utils.data import DataLoader

# read_pkl and SentenceDataUtil are project-local helpers, not shown here.


def init_dataset(opts, features, max_len, use_char_feature, max_len_char):
    """
    初始化数据参数

    Args:
        opts:
        features: list of int
        max_len: int
        use_char_feature: bool
        max_len_char: int

    Return:
        data_loader_test: DataLoader
    """
    root_idx = opts.root_idx
    path_num = os.path.join(root_idx, 'nums.txt')
    root_voc = opts.root_voc
    feature2id_dict = dict()
    for feature_i in features:
        path_f2id = os.path.join(root_voc,
                                 'feature_{0}_2id.pkl'.format(feature_i))
        feature2id_dict[feature_i] = read_pkl(path_f2id)
    label2id_dict = read_pkl(os.path.join(root_voc, 'label2id.pkl'))
    has_label = False
    batch_size = opts.batch_size
    use_cuda = opts.cuda
    num_worker = opts.nb_work
    path_result = opts.output

    # Initialize the data
    dataset = SentenceDataUtil(path_num,
                               root_idx,
                               max_len,
                               features,
                               feature2id_dict,
                               max_len_char=max_len_char,
                               use_char_feature=use_char_feature,
                               shuffle=False)
    dataset_test = dataset.get_all_data()
    data_loader_test = DataLoader(dataset_test,
                                  batch_size=batch_size,
                                  shuffle=False,
                                  num_workers=num_worker)

    return data_loader_test, label2id_dict
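
A minimal usage sketch for init_dataset, assuming opts comes from an option parser like the one in Example #4 (with root_idx, root_voc, batch_size, cuda, nb_work, and output set on it); the feature ids and length limits below are hypothetical:

data_loader_test, label2id_dict = init_dataset(
    opts,
    features=[1, 2],         # hypothetical feature ids
    max_len=100,             # hypothetical maximum sentence length
    use_char_feature=False,
    max_len_char=20)
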
Example #2
    def init_model_args(self):
        path_embed = os.path.join(self.opts.root_embed, 'word2vec.pkl')
        pretrained_embed = None
        if os.path.exists(path_embed):
            pretrained_embed = read_pkl(path_embed)
        feature_size_dict = dict()
        for feature_name in self.feature2id_dict:
            feature_size_dict[feature_name] = len(
                self.feature2id_dict[feature_name]) + 1
        feature_dim_dict = dict()
        for i, feature_name in enumerate(self.features):
            if i < len(self.opts.feature_dim):
                feature_dim_dict[feature_name] = self.opts.feature_dim[i]
            else:
                feature_dim_dict[feature_name] = 32  # default value 32
        if pretrained_embed is not None:  # the pretrained embedding dimension takes precedence
            feature_dim_dict[str(
                self.features[0])] = pretrained_embed.shape[-1]
        dropout_rate = self.opts.dropout
        dim_char = self.opts.dim_char
        filter_sizes = self.opts.filter_sizes
        filter_nums = self.opts.filter_nums
        self.use_cuda = self.opts.cuda
        self.device_ids = self.opts.device_ids
        self.multi_gpu = len(self.opts.device_ids) > 1
        assert self.opts.requires_grad in ('False', 'True')
        requires_grad = self.opts.requires_grad == 'True'

        self.model_kwargs = {
            'features': self.features,
            'lstm_units': self.opts.lstm_units,
            'layer_nums': self.opts.layer_nums,
            'feature_size_dict': feature_size_dict,
            'feature_dim_dict': feature_dim_dict,
            'pretrained_embed': pretrained_embed,
            'dropout_rate': dropout_rate,
            'max_len': self.max_len,
            'use_cuda': self.use_cuda,
            'use_char_feature': self.use_char_feature,
            'char_binary': self.char_binary,
            'dim_char': dim_char,
            'filter_sizes': filter_sizes,
            'filter_nums': filter_nums,
            'requires_grad': requires_grad,
            'max_len_char': self.max_len_char,
            'use_crf': self.opts.use_crf,
            'learn_rate': self.opts.learn_rate
        }

        self.learn_rate = self.opts.learn_rate
        self.nb_epoch = self.opts.nb_epoch
        self.max_patience = self.opts.max_patience
        self.root_model = self.opts.root_model
        self.use_crf = self.opts.use_crf
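
The assembled model_kwargs dict is presumably unpacked into the model constructor elsewhere; a hedged sketch, where SequenceLabelingModel is a hypothetical stand-in for the project's model class (these snippets only ever show an already-loaded model object, sl_model, in Example #5):

# SequenceLabelingModel is a hypothetical name; the real class is not shown here
model = SequenceLabelingModel(**self.model_kwargs)
if self.use_cuda:
    model = model.cuda()
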
Example #3
    def init_data_args(self):
        self.seed = self.opts.seed
        self.root_idx_train = self.opts.root_idx_train
        self.root_idx_dev = self.opts.root_idx_dev
        self.path_num_train = os.path.join(self.opts.root_idx_train,
                                           'nums.txt')
        self.max_len = self.opts.max_len
        self.root_voc = self.opts.root_voc
        self.features = self.opts.features
        self.feature2id_dict = dict()
        for feature_i in self.opts.features:
            path_f2id = os.path.join(self.root_voc,
                                     'feature_{0}_2id.pkl'.format(feature_i))
            self.feature2id_dict[feature_i] = read_pkl(path_f2id)
        self.label2id_dict = read_pkl(
            os.path.join(self.root_voc, 'label2id.pkl'))
        self.feature2id_dict['label'] = self.label2id_dict
        self.has_label = True
        self.dev_size = self.opts.dev_size
        self.batch_size = self.opts.batch_size
        self.num_worker = self.opts.nb_work
        self.use_char_feature = self.opts.use_char_feature
        self.char_binary = self.opts.char_binary
        self.max_len_char = self.opts.max_len_char
Example #4
              default=False,
              help='whether to use GPU acceleration')
op.add_option('--nw', dest='nb_work', default=8, type='int', help='number of worker threads for data loading')
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)

# Initialize data parameters
root_idx = opts.root_idx
path_num = os.path.join(root_idx, 'nums.txt')
max_len = opts.max_len
root_voc = opts.root_voc
features = opts.features
feature2id_dict = dict()
for feature_i in opts.features:
    path_f2id = os.path.join(root_voc, 'feature_{0}_2id.pkl'.format(feature_i))
    feature2id_dict[feature_i] = read_pkl(path_f2id)
label2id_dict = read_pkl(os.path.join(root_voc, 'label2id.pkl'))
feature2id_dict['label'] = label2id_dict
has_label = True
dev_size = opts.dev_size
batch_size = opts.batch_size
num_worker = opts.nb_work

# Initialize the data
dataset = SentenceDataUtil(path_num,
                           root_idx,
                           max_len,
                           features,
                           feature2id_dict,
                           shuffle=False)
dataset_train, dataset_dev = dataset.split_train_and_dev(dev_size=dev_size)
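
Following the DataLoader pattern used for the test split in Example #1, the train and dev splits would presumably be wrapped the same way; a sketch, where shuffling the training split is an assumption not shown in the original snippet:

data_loader_train = DataLoader(dataset_train,
                               batch_size=batch_size,
                               shuffle=True,  # assumption: shuffle training data
                               num_workers=num_worker)
data_loader_dev = DataLoader(dataset_dev,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=num_worker)
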
Example #5
import os
from time import time

import torch
from torch.utils.data import DataLoader

# read_pkl and SentenceDataUtil are project-local helpers, not shown here.

# Load the model
path_model = opts.path_model
sl_model = torch.load(path_model)
sl_model.set_use_cuda(opts.cuda)
if opts.max_len:
    sl_model.max_len = opts.max_len

# Initialize data parameters
root_idx = opts.root_idx
path_num = os.path.join(root_idx, 'nums.txt')
root_voc = opts.root_voc
feature2id_dict = dict()
for feature_i in sl_model.features:
    path_f2id = os.path.join(root_voc, 'feature_{0}_2id.pkl'.format(feature_i))
    feature2id_dict[feature_i] = read_pkl(path_f2id)
label2id_dict = read_pkl(os.path.join(root_voc, 'label2id.pkl'))
has_label = False
batch_size = opts.batch_size
use_cuda = opts.cuda
num_worker = opts.nb_work
path_result = opts.output

t0 = time()

# Initialize the data
dataset = SentenceDataUtil(
    path_num, root_idx, sl_model.max_len, sl_model.features, feature2id_dict, shuffle=False)
dataset_test = dataset.get_all_data()
data_loader_test = DataLoader(
    dataset_test, batch_size=batch_size, shuffle=False, num_workers=num_worker)
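
The snippet ends before the prediction loop; a minimal sketch of what typically follows, under the assumption that sl_model is a torch.nn.Module whose forward pass accepts the batches produced by the loader (the exact batch layout from SentenceDataUtil is not shown in these snippets):

sl_model.eval()
with torch.no_grad():
    for batch in data_loader_test:
        outputs = sl_model(batch)  # batch layout depends on SentenceDataUtil
print('inference finished in {0:.1f}s'.format(time() - t0))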