def train(config):
    # Build Train Data
    data = TrainData(config.graph_work_path)

    train_iter = BatchGraphGenerator(
        graph_wrappers=[1],
        batch_size=config.batch_size,
        data=data,
        samples=config.samples,
        num_workers=config.sample_workers,
        feed_name_list=None,
        use_pyreader=False,
        phase="train",
        graph_data_path=config.graph_work_path,
        shuffle=True,
        neg_type=config.neg_type)
    train_ds = Dataset.from_generator_func(train_iter).repeat(config.epochs)
    dev_ds = Dataset.from_generator_func(train_iter)

    ernie_cfg_dict, ernie_param_path = PretrainedModelLoader.from_pretrained(
        config.ernie_name)
    # Warm start from the downloaded ERNIE parameters unless the config
    # provides its own warm-start directory.
    if "warm_start_from" not in config:
        warm_start_from = ernie_param_path
    else:
        warm_start_from = config.warm_start_from
    if "ernie_config" not in config:
        config.ernie_config = ernie_cfg_dict

    ws = propeller.WarmStartSetting(
        predicate_fn=lambda v: os.path.exists(
            os.path.join(warm_start_from, v.name)),
        from_dir=warm_start_from)

    train_ds.name = "train"
    train_ds.data_shapes = [[-1] + list(shape[1:])
                            for shape in train_ds.data_shapes]
    dev_ds.name = "dev"
    dev_ds.data_shapes = [[-1] + list(shape[1:])
                          for shape in dev_ds.data_shapes]

    tokenizer = load_tokenizer(config.ernie_name)
    config.cls_id = tokenizer.cls_id

    propeller.train.train_and_eval(
        model_class_or_model_fn=ERNIESageLinkPredictModel,
        params=config,
        run_config=config,
        train_dataset=train_ds,
        eval_dataset={"eval": dev_ds},
        warm_start_setting=ws)
def infer(args):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    test_ds = MolDataset(args, raw_dataset, mode="test")
    fn = MgfCollateFn(args, mode="test")

    test_loader = Dataloader(
        test_ds,
        batch_size=args.batch_size,
        num_workers=1,
        collate_fn=fn)
    test_loader = PDataset.from_generator_func(test_loader)

    est = propeller.Learner(MgfModel, args, args.model_config)

    mgf_list = []
    for soft_mgf in est.predict(
            test_loader,
            ckpt_path=args.model_path_for_infer,
            split_batch=True):
        mgf_list.append(soft_mgf)

    mgf = np.concatenate(mgf_list)
    log.info("saving features")
    np.save(
        "dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_")),
        mgf)
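# Hypothetical follow-up (not in the original code): a minimal sketch of how
# the soft fingerprint features saved by infer() could be reloaded. The path
# mirrors the np.save call above, with "-" in the dataset name replaced by "_";
# the helper name and its argument are assumptions of this sketch.
def load_soft_mgf_feat(dataset_name):
    return np.load("dataset/%s/soft_mgf_feat.npy" %
                   dataset_name.replace("-", "_"))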
def interleave(ds1, ds2):
    def gen():
        for i, j in six.moves.zip_longest(iter(ds1), iter(ds2)):
            if i is not None:
                yield i
            if j is not None:
                yield j

    return Dataset.from_generator_func(gen)
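# Illustrative only (not part of the original code): interleave() follows
# zip_longest order, alternating between the two inputs and skipping whichever
# one runs out first, e.g. interleaving [1, 3, 5] with [2, 4] yields 1, 2, 3, 4, 5.
def _demo_interleave_order():
    merged = [
        x for pair in six.moves.zip_longest([1, 3, 5], [2, 4]) for x in pair
        if x is not None
    ]
    assert merged == [1, 2, 3, 4, 5]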
def predict(config):
    # Build Predict Data
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    num_nodes = int(
        np.load(os.path.join(config.graph_work_path, "num_nodes.npy")))

    data = PredictData(num_nodes)

    predict_iter = BatchGraphGenerator(
        graph_wrappers=[1],
        batch_size=config.infer_batch_size,
        data=data,
        samples=config.samples,
        num_workers=config.sample_workers,
        feed_name_list=None,
        use_pyreader=False,
        phase="predict",
        graph_data_path=config.graph_work_path,
        shuffle=False,
        neg_type=config.neg_type)
    predict_ds = Dataset.from_generator_func(predict_iter)

    predict_ds.name = "predict"
    predict_ds.data_shapes = [[-1] + list(shape[1:])
                              for shape in predict_ds.data_shapes]

    tokenizer = load_tokenizer(config.ernie_name)
    config.cls_id = tokenizer.cls_id

    ernie_cfg_dict, ernie_param_path = PretrainedModelLoader.from_pretrained(
        config.ernie_name)
    config.ernie_config = ernie_cfg_dict

    est = propeller.Learner(ERNIESageLinkPredictModel, config, config)

    id2str = io.open(
        os.path.join(config.graph_work_path, "terms.txt"),
        encoding=config.encoding).readlines()

    fout = io.open(
        "%s/part-%s" % (config.model_dir, trainer_id), "w", encoding="utf8")

    if "infer_model" in config:
        predict_result_iter = est.predict(
            predict_ds, ckpt_path=config["infer_model"])
    else:
        predict_result_iter = est.predict(predict_ds, ckpt=-1)

    for user_feat, user_real_index in predict_result_iter:
        sri = id2str[int(user_real_index)].strip("\n")
        line = "{}\t{}\n".format(sri, tostr(user_feat))
        fout.write(line)
    fout.close()
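# Hypothetical helper (not in the original code) for consuming predict()'s
# output: each line of a "part-<trainer_id>" file is "<term>\t<embedding>",
# where the term comes from terms.txt and the embedding string is produced by
# tostr(user_feat). The helper name and the path argument are illustrative,
# e.g. os.path.join(config.model_dir, "part-0").
def read_part_file(path):
    with io.open(path, encoding="utf8") as f:
        for line in f:
            term, feat_str = line.rstrip("\n").split("\t", 1)
            yield term, feat_str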
def make_pretrain_dataset(name, dir, vocab, hparams, args):
    gz_files = glob(dir)
    if not gz_files:
        raise ValueError('train data not found in %s' % dir)

    log.info('read from %s' % '\n'.join(gz_files))
    max_input_seqlen = args.max_seqlen
    max_pretrain_seqlen = lambda: max_input_seqlen if r.random() > 0.15 else r.randint(
        1, max_input_seqlen)  # short sentence rate

    def _parse_gz(record_str):  # function that takes python_str as input
        ex = propeller.data.example_pb2.SequenceExample()
        ex.ParseFromString(record_str)
        doc = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['txt'].feature
        ]
        doc_seg = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['segs'].feature
        ]
        return doc, doc_seg

    def bb_to_segments(filename):
        ds = Dataset.from_record_file(filename).map(_parse_gz)
        iterable = iter(ds)

        def gen():
            buf, size = [], 0
            iterator = iter(ds)
            while 1:
                doc, doc_seg = next(iterator)
                for line, line_seg in zip(doc, doc_seg):
                    #line = np.array(sp_model.SampleEncodeAsIds(line, -1, 0.1), dtype=np.int64) # 0.1 means large variance on sentence piece result
                    if len(line) == 0:
                        continue
                    line = np.array(line)
                    line_seg = np.array(line_seg)
                    size += len(line)
                    buf.append(np.stack([line, line_seg]).transpose())
                    if size > max_input_seqlen:
                        yield buf,
                        buf, size = [], 0
            if len(buf) != 0:
                yield buf,
                buf, size = [], 0

        return Dataset.from_generator_func(gen)

    def sample_negative(dataset):
        def gen():
            iterator = iter(dataset)
            while True:
                chunk_a, = next(iterator)
                #chunk_b, = next(iterator)

                seqlen = max_pretrain_seqlen()
                seqlen_a = r.randint(1, seqlen)
                seqlen_b = seqlen - seqlen_a
                len_a = list(accumulate([len(c) for c in chunk_a]))
                buf_a = [c for c, l in zip(chunk_a, len_a)
                         if l < seqlen_a]  #always take the first one
                buf_b = [
                    c for c, l in zip(chunk_a, len_a)
                    if seqlen_a <= l < seqlen
                ]

                if r.random() < 0.5:  #pos or neg
                    label = np.int64(1)
                else:
                    label = np.int64(0)
                    buf_a, buf_b = buf_b, buf_a

                if not (len(buf_a) and len(buf_b)):
                    continue
                a = np.concatenate(buf_a)
                b = np.concatenate(buf_b)
                #log.debug(a)
                #log.debug(b)
                sample, seg_info, token_type = build_pair(
                    a, b, args.max_seqlen,
                    vocab)  #negative sample might exceed max seqlen
                yield sample, seg_info, token_type, label

        ds = propeller.data.Dataset.from_generator_func(gen)
        return ds

    def after(sentence, seg_info, segments, label):
        batch_size, seqlen = sentence.shape
        sentence, mask_pos, mlm_label = apply_mask(sentence, seg_info,
                                                   args.mask_rate,
                                                   hparams.vocab_size, vocab)
        ra = r.random()
        if ra < args.check:
            print('***')
            print('\n'.join([
                str(j) + '\t' + '|'.join(map(str, i))
                for i, j in zip(sentence.tolist(), label)
            ]))
            print('***')
            print('\n'.join(
                ['|'.join(map(str, i)) for i in seg_info.tolist()]))
            print('***')
            print('|'.join(map(str, mlm_label.tolist())))
            print('***')
        return sentence, segments, mlm_label, mask_pos, label

    # pretrain pipeline
    dataset = Dataset.from_list(gz_files)
    if propeller.train.distribution.status.mode == propeller.train.distribution.DistributionMode.NCCL:
        log.info('Apply sharding in distribution env')
        dataset = dataset.shard(
            propeller.train.distribution.status.num_replica,
            propeller.train.distribution.status.replica_id)
    dataset = dataset.repeat().shuffle(buffer_size=len(gz_files))

    dataset = dataset.interleave(
        map_fn=bb_to_segments, cycle_length=len(gz_files), block_length=1)
    dataset = dataset.shuffle(
        buffer_size=1000)  #must shuffle to ensure negative sample randomness
    dataset = sample_negative(dataset)
    dataset = dataset.padded_batch(hparams.batch_size,
                                   (0, 0, 0, 0)).map(after)
    dataset.name = name
    return dataset
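# Toy recreation (not part of the original code) of the buf_a / buf_b split
# inside sample_negative(): lines go to the A side while their cumulative
# length stays below seqlen_a, and to the B side up to the sampled pair
# length seqlen. All values below are illustrative.
def _demo_pair_split():
    from itertools import accumulate
    chunk_a = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]  # token-id lines from one buffer
    seqlen, seqlen_a = 7, 4  # sampled pair length and A-side budget
    len_a = list(accumulate(len(c) for c in chunk_a))  # [3, 5, 9]
    buf_a = [c for c, l in zip(chunk_a, len_a) if l < seqlen_a]
    buf_b = [c for c, l in zip(chunk_a, len_a) if seqlen_a <= l < seqlen]
    assert buf_a == [[1, 2, 3]] and buf_b == [[4, 5]]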
def train(args, pretrained_model_config=None):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    train_ds = MolDataset(args, raw_dataset)

    args.eval_steps = math.ceil(len(train_ds) / args.batch_size)
    log.info("Total %s steps (eval_steps) every epoch." % (args.eval_steps))

    fn = MgfCollateFn(args)

    train_loader = Dataloader(
        train_ds,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle=args.shuffle,
        stream_shuffle_size=args.shuffle_size,
        collate_fn=fn)

    # for evaluating
    eval_train_loader = train_loader
    eval_train_loader = PDataset.from_generator_func(eval_train_loader)

    train_loader = multi_epoch_dataloader(train_loader, args.epochs)
    train_loader = PDataset.from_generator_func(train_loader)

    if args.warm_start_from is not None:
        # warm start setting
        def _fn(v):
            if not isinstance(v, F.framework.Parameter):
                return False
            if os.path.exists(os.path.join(args.warm_start_from, v.name)):
                return True
            else:
                return False

        ws = propeller.WarmStartSetting(
            predicate_fn=_fn, from_dir=args.warm_start_from)
    else:
        ws = None

    def cmp_fn(old, new):
        if old['eval'][args.metrics] - new['eval'][args.metrics] > 0:
            log.info("best %s eval result: %s" % (args.metrics, new['eval']))
            return True
        else:
            return False

    if args.log_id is not None:
        save_best_model = int(args.log_id) == 5
    else:
        save_best_model = True
    best_exporter = propeller.exporter.BestResultExporter(
        args.output_dir, (cmp_fn, save_best_model))

    eval_datasets = {"eval": eval_train_loader}

    propeller.train.train_and_eval(
        model_class_or_model_fn=MgfModel,
        params=pretrained_model_config,
        run_config=args,
        train_dataset=train_loader,
        eval_dataset=eval_datasets,
        warm_start_setting=ws,
        exporters=[best_exporter])