Example #1
def main(args):
    """main"""
    paddle.enable_static()

    model_config = json.load(open(args.model_config, 'r'))

    exe_params = default_exe_params(False, args.use_cuda, args.thread_num)
    exe = exe_params['exe']
    gpu_id = exe_params['gpu_id']
    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
    else:
        place = fluid.CPUPlace()

    task = model_config['task']

    model = TAPEModel(model_config=model_config, name=task)

    test_program = fluid.Program()
    test_startup = fluid.Program()
    with fluid.program_guard(test_program, test_startup):
        with fluid.unique_name.guard():
            model.forward(True)
            exe.run(test_startup)

    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, test_program)
    else:
        raise RuntimeError('Please set init_model.')

    tokenizer = ProteinTokenizer()
    test_fetch_list = model.get_fetch_list(is_inference=True)

    examples = []
    for line in sys.stdin:
        if len(line.strip()) == 0:
            continue
        examples.append(line.strip())

    for i in range(0, len(examples), args.batch_size):
        batch = examples[i: i + args.batch_size]
        inputs = gen_batch_data(batch, tokenizer, place)
        results = exe.run(
                program=test_program,
                feed=inputs,
                fetch_list=test_fetch_list,
                return_numpy=False)
        pred = np.array(results[0])
        print(pred)
        show_results(batch, pred, task)
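The inference loop above walks the stdin examples in fixed-size steps; slicing past the end of the list is safe because Python clamps slices. A minimal, self-contained sketch of the same batching pattern (the names here are illustrative, not part of the script):

def iter_batches(items, batch_size):
    """Yield successive slices of at most batch_size items."""
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]

# The last batch may be shorter than batch_size.
for batch in iter_batches(list(range(10)), 4):
    print(batch)  # [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]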
Example #2
def main(args):
    """main"""
    paddle.enable_static()

    model_config = json.load(open(args.model_config, 'r'))

    exe_params = default_exe_params(False, args.use_cuda, args.thread_num)
    exe = exe_params['exe']
    trainer_num = exe_params['trainer_num']
    trainer_id = exe_params['trainer_id']
    places = exe_params['places']

    task = model_config['task']

    model = TAPEModel(model_config=model_config, name=task)

    test_program = fluid.Program()
    test_startup = fluid.Program()
    with fluid.program_guard(test_program, test_startup):
        with fluid.unique_name.guard():
            model.forward(True)
            model.cal_loss()
            test_data_loader = setup_data_loader(model.input_list,
                                                 model_config, args.test_data,
                                                 trainer_id, trainer_num,
                                                 places, args.batch_size)
            exe.run(test_startup)
    test_metric = get_metric(task)

    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, test_program)
    else:
        raise RuntimeError('Please set init_model.')

    test_fetch_list = model.get_fetch_list()
    for data in test_data_loader():
        results = exe.run(program=test_program,
                          feed=data,
                          fetch_list=test_fetch_list,
                          return_numpy=False)
        update_metric(task, test_metric, results)
    test_metric.show()
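get_metric, update_metric, and test_metric.show() follow an accumulate-then-report pattern: every batch's fetched results are collected, and the metric is computed once at the end. The real helpers are project-specific and not shown here; the following is only a hedged sketch of the shape such a metric object could take (class and method names hypothetical):

import numpy as np

class RegressionMetric(object):
    """Hypothetical accumulator: collect batch outputs, report once."""
    def __init__(self):
        self.preds, self.labels = [], []

    def update(self, pred, label):
        self.preds.append(np.asarray(pred).ravel())
        self.labels.append(np.asarray(label).ravel())

    def show(self):
        pred = np.concatenate(self.preds)
        label = np.concatenate(self.labels)
        print('mse: %f' % np.mean((pred - label) ** 2))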
Example #3
def main(args):
    """
    Call the configuration function of the model, build the model and load data, then start training.

    model_config:
        a json file  with the hyperparameters,such as dropout rate ,learning rate,num tasks and so on;

    num_tasks:
        it means the number of task that each dataset contains, it's related to the dataset;
    
    DownstreamModel:
        It means the PretrainGNNModel for different strategies and it is an supervised GNN model which predicts the tasks.

    """
    model_config = json.load(open(args.model_config, 'r'))
    if args.dropout_rate is not None:
        model_config['dropout_rate'] = args.dropout_rate
    task_names = get_downstream_task_names(args.dataset_name, args.data_path)
    model_config['num_tasks'] = len(task_names)

    ### build model
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = DownstreamModel(model_config)
            model.train()
            opt = fluid.optimizer.Adam(learning_rate=args.lr)
            opt.minimize(model.loss)
    with fluid.program_guard(test_prog, fluid.Program()):
        with fluid.unique_name.guard():
            model = DownstreamModel(model_config)
            model.train(is_test=True)

    # Use CUDAPlace for GPU training, or use CPUPlace for CPU training.
    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if not args.init_model is None and not args.init_model == "":
        load_partial_params(exe, args.init_model, train_prog)

    ### load data
    # featurizer:
    #     Generates features from the raw data and returns graph data;
    #     collates the graph-data features and returns the feed dictionary.
    # splitter:
    #     Split type of the dataset: random, scaffold, or random scaffold; chosen here via args.split_type.
    #     `ScaffoldSplitter` first orders the compounds by Bemis-Murcko scaffold,
    #     then takes the first `frac_train` proportion as the train set, the next `frac_valid` proportion
    #     as the valid set, and the rest as the test set. `ScaffoldSplitter` better evaluates the
    #     generalization ability of the model on out-of-distribution samples. Note that other splitters
    #     like `RandomSplitter`, `RandomScaffoldSplitter`, and `IndexSplitter` are also available.

    featurizer = DownstreamFeaturizer(model.graph_wrapper)
    dataset = get_dataset(args.dataset_name, args.data_path, task_names,
                          featurizer)
    splitter = create_splitter(args.split_type)
    train_dataset, valid_dataset, test_dataset = splitter.split(dataset,
                                                                frac_train=0.8,
                                                                frac_valid=0.1,
                                                                frac_test=0.1)
    print("Train/Valid/Test num: %s/%s/%s" %
          (len(train_dataset), len(valid_dataset), len(test_dataset)))

    ### start train
    # Run the train function and compute the train loss in each epoch.
    # The loop runs up to max_epoch; adjust it as needed.
    # Then the train loss, valid AUC, test AUC, and the test AUC at the best
    # validation epoch are computed and printed, and the parameters of each
    # epoch are saved under the model directory.
    list_val_auc, list_test_auc = [], []
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, exe, train_prog, model, train_dataset,
                           featurizer)
        val_auc = evaluate(args, exe, test_prog, model, valid_dataset,
                           featurizer)
        test_auc = evaluate(args, exe, test_prog, model, test_dataset,
                            featurizer)

        list_val_auc.append(val_auc)
        list_test_auc.append(test_auc)
        test_auc_by_eval = list_test_auc[np.argmax(list_val_auc)]
        print("epoch:%s train/loss:%s" % (epoch_id, train_loss))
        print("epoch:%s val/auc:%s" % (epoch_id, val_auc))
        print("epoch:%s test/auc:%s" % (epoch_id, test_auc))
        print("epoch:%s test/auc_by_eval:%s" % (epoch_id, test_auc_by_eval))
        fluid.io.save_params(exe, '%s/epoch%d' % (args.model_dir, epoch_id),
                             train_prog)

    best_epoch_id = np.argmax(list_val_auc)
    fluid.io.load_params(exe, '%s/epoch%d' % (args.model_dir, best_epoch_id),
                         train_prog)
    fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir), train_prog)
    return list_test_auc[best_epoch_id]
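The script above only requires model_config to be a JSON object carrying the hyperparameters it reads; dropout_rate may be overridden from the command line and num_tasks is overwritten with len(task_names). A hypothetical config of that shape, purely for illustration (any keys beyond these two depend on the actual DownstreamModel):

example_model_config = {
    "dropout_rate": 0.2,  # overridden by --dropout_rate when given
    "num_tasks": 12       # overwritten with len(task_names)
}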
Example #4
def main(args):
    """tbd"""
    model_config = json.load(open(args.model_config, 'r'))
    model_config['context_pooling'] = args.context_pooling

    ### build model
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = PreGNNContextpredModel(model_config)
            model.forward()
            opt = fluid.optimizer.Adam(learning_rate=args.lr)
            if args.distributed:
                opt = get_distributed_optimizer(opt)
            opt.minimize(model.loss)
    with fluid.program_guard(test_prog, fluid.Program()):
        with fluid.unique_name.guard():
            model = PreGNNContextpredModel(model_config)
            model.forward(is_test=True)

    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) \
            if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, train_prog)

    ### load data
    k = model_config['layer_num']
    l1 = k - 1
    l2 = l1 + args.context_size
    featurizer = PreGNNContextPredFeaturizer(
            model.substruct_graph_wrapper, 
            model.context_graph_wrapper, 
            k, l1, l2)
    dataset = load_zinc_dataset(args.data_path, featurizer=featurizer)

    splitter = RandomSplitter()
    train_dataset, _, test_dataset = splitter.split(
            dataset, frac_train=0.9, frac_valid=0, frac_test=0.1)
    if args.distributed:
        indices = list(range(fleet.worker_index(), len(train_dataset), fleet.worker_num()))
        train_dataset = train_dataset[indices]
    print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset)))

    ### start train
    list_test_loss = []
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, exe, train_prog, model, train_dataset, featurizer)
        test_loss = evaluate(args, exe, test_prog, model, test_dataset, featurizer)
        if not args.distributed or fleet.worker_index() == 0:
            fluid.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id), train_prog)
            list_test_loss.append(test_loss)
            print("epoch:%d train/loss:%s" % (epoch_id, train_loss))
            print("epoch:%d test/loss:%s" % (epoch_id, test_loss))

    if not args.distributed or fleet.worker_index() == 0:
        best_epoch_id = np.argmin(list_test_loss)
        fluid.io.load_params(exe, '%s/epoch%d' % (args.model_dir, best_epoch_id), train_prog)
        fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir), train_prog)
        return list_test_loss[best_epoch_id]
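The distributed branch shards the training set round-robin: worker i keeps indices i, i + worker_num, i + 2 * worker_num, and so on, so the shards are disjoint and nearly equal in size. A standalone sketch of that indexing (helper name illustrative):

def shard_indices(num_samples, worker_index, worker_num):
    """Round-robin shard: each worker gets a disjoint, near-equal slice."""
    return list(range(worker_index, num_samples, worker_num))

for worker in range(3):
    print(worker, shard_indices(10, worker, 3))
# 0 [0, 3, 6, 9]
# 1 [1, 4, 7]
# 2 [2, 5, 8]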
Example #5
def main(args):
    """
    Call the configuration function of the model, build the model and load data, then start training.

    model_config:
        a json file  with the  model configurations,such as dropout rate ,learning rate,num tasks and so on;
    task_num:
        It means the number of chembl filtered task;
    
    PreGNNSupervisedModel:
        It means the PretrainGNNModel for supervised strategy.
        Graph-level multi-task supervised pre-training to jointly predict a diverse set of supervised labels of individual graphs.
    """
    model_config = json.load(open(args.model_config, 'r'))
    if args.dropout_rate is not None:
        model_config['dropout_rate'] = args.dropout_rate
    task_num = get_chembl_filtered_task_num()
    model_config['task_num'] = task_num

    ### build model
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = PreGNNSupervisedModel(model_config)
            model.forward()
            opt = fluid.optimizer.Adam(learning_rate=args.lr)
            if args.distributed:
                opt = get_distributed_optimizer(opt)
            opt.minimize(model.loss)
    with fluid.program_guard(test_prog, fluid.Program()):
        with fluid.unique_name.guard():
            model = PreGNNSupervisedModel(model_config)
            model.forward(is_test=True)
    """
    Use CUDAPlace for GPU training, or use CPUPlace for CPU training.

    """

    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) \
            if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, train_prog)

    ### load data
    """
    PreGNNSupervisedFeaturizer:
        It is used along with `PreGNNSupervised`. It inherits from the super class `Featurizer` which is used for feature extractions. The `Featurizer` has two functions: `gen_features` for converting from a single raw smiles to a single graph data, `collate_fn` for aggregating a sublist of graph data into a big batch.
        
    splitter:
        split type of the dataset:random,scaffold,random with scaffold. Here is randomsplit.
        `ScaffoldSplitter` will firstly order the compounds according to Bemis-Murcko scaffold, 
        then take the first `frac_train` proportion as the train set, the next `frac_valid` proportion as the valid set 
        and the rest as the test set. `ScaffoldSplitter` can better evaluate the generalization ability of the model on 
        out-of-distribution samples. Note that other splitters like `RandomSplitter`, `RandomScaffoldSplitter` 
        and `IndexSplitter` is also available."
    
    """
    featurizer = PreGNNSupervisedFeaturizer(model.graph_wrapper)
    dataset = load_chembl_filtered_dataset(args.data_path,
                                           featurizer=featurizer)

    splitter = RandomSplitter()
    train_dataset, _, test_dataset = splitter.split(dataset,
                                                    frac_train=0.9,
                                                    frac_valid=0,
                                                    frac_test=0.1)
    if args.distributed:
        indices = list(
            range(fleet.worker_index(), len(train_dataset),
                  fleet.worker_num()))
        train_dataset = train_dataset[indices]
    print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset)))

    ### start train
    """
    Load the train function and calculate the train loss and test loss in each epoch.
    Here we set the epoch is in range of max epoch,you can change it if you want. 

    Then we will calculate the train loss ,test loss and print them.
    Finally we save the best epoch to the model according to the dataset.

    """
    list_test_loss = []
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, exe, train_prog, model, train_dataset,
                           featurizer)
        test_loss = evaluate(args, exe, test_prog, model, test_dataset,
                             featurizer)
        if not args.distributed or fleet.worker_index() == 0:
            fluid.io.save_params(exe,
                                 '%s/epoch%s' % (args.model_dir, epoch_id),
                                 train_prog)
            list_test_loss.append(test_loss)
            print("epoch:%d train/loss:%s" % (epoch_id, train_loss))
            print("epoch:%d test/loss:%s" % (epoch_id, test_loss))

    if not args.distributed or fleet.worker_index() == 0:
        best_epoch_id = np.argmin(list_test_loss)
        fluid.io.load_params(exe,
                             '%s/epoch%d' % (args.model_dir, best_epoch_id),
                             train_prog)
        fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir),
                             train_prog)
        return list_test_loss[best_epoch_id]
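Model selection here is by lowest test loss: np.argmin picks the epoch whose saved parameters are then reloaded and re-saved as epoch_best. A toy illustration of that selection (loss values are made up):

import numpy as np

list_test_loss = [0.91, 0.74, 0.69, 0.72]
best_epoch_id = int(np.argmin(list_test_loss))
print(best_epoch_id, list_test_loss[best_epoch_id])  # 2 0.69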
Example #6
def main(args):
    """tbd"""
    model_config = json.load(open(args.model_config, 'r'))
    task_names = get_downstream_task_names(args.dataset_name, args.data_path)
    model_config['num_tasks'] = len(task_names)

    ### build model
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = DownstreamModel(model_config)
            model.forward()
            opt = fluid.optimizer.Adam(learning_rate=args.lr)
            opt.minimize(model.loss)
    with fluid.program_guard(test_prog, fluid.Program()):
        with fluid.unique_name.guard():
            model = DownstreamModel(model_config)
            model.forward(is_test=True)

    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, train_prog)

    ### load data
    featurizer = DownstreamFeaturizer(model.graph_wrapper)
    dataset = get_dataset(args.dataset_name, args.data_path, task_names,
                          featurizer)
    splitter = create_splitter(args.split_type)
    train_dataset, valid_dataset, test_dataset = splitter.split(dataset,
                                                                frac_train=0.8,
                                                                frac_valid=0.1,
                                                                frac_test=0.1)
    print("Train/Valid/Test num: %s/%s/%s" %
          (len(train_dataset), len(valid_dataset), len(test_dataset)))

    ### start train
    list_val_auc, list_test_auc = [], []
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, exe, train_prog, model, train_dataset,
                           featurizer)
        val_auc = evaluate(args, exe, test_prog, model, valid_dataset,
                           featurizer)
        test_auc = evaluate(args, exe, test_prog, model, test_dataset,
                            featurizer)

        list_val_auc.append(val_auc)
        list_test_auc.append(test_auc)
        test_auc_by_eval = list_test_auc[np.argmax(list_val_auc)]
        print("epoch:%s train/loss:%s" % (epoch_id, train_loss))
        print("epoch:%s val/auc:%s" % (epoch_id, val_auc))
        print("epoch:%s test/auc:%s" % (epoch_id, test_auc))
        print("epoch:%s test/auc_by_eval:%s" % (epoch_id, test_auc_by_eval))
        fluid.io.save_params(exe, '%s/epoch%d' % (args.model_dir, epoch_id),
                             train_prog)

    best_epoch_id = np.argmax(list_val_auc)
    fluid.io.load_params(exe, '%s/epoch%d' % (args.model_dir, best_epoch_id),
                         train_prog)
    fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir), train_prog)
    return list_test_auc[best_epoch_id]
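test/auc_by_eval reports the test AUC at the epoch with the best validation AUC so far, which selects the model on validation data instead of peeking at the test set. A toy illustration (AUC values are made up):

import numpy as np

list_val_auc = [0.70, 0.78, 0.75]
list_test_auc = [0.68, 0.74, 0.80]
# Validation peaks at epoch 1, so 0.74 is reported rather than the
# test-set maximum 0.80.
print(list_test_auc[int(np.argmax(list_val_auc))])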
Example #7
def main(args):
    # Enable static graph mode.
    paddle.enable_static()

    with open(args.config, 'r') as f:
        config = json.load(f)

    logging.info('Load data ...')
    train_data_list = load_data(args.train_data)
    valid_data_list = load_data(args.valid_data)
    test_data_list = load_data(args.test_data)

    logging.info("Data loaded.")
    logging.info("Train Examples: %s" % len(train_data_list))
    logging.info("Val Examples: %s" % len(valid_data_list))
    logging.info("Test Examples: %s" % len(test_data_list))
    logging.info("Num Tasks: %s" % args.num_tasks)
    sys.stdout.flush()

    train_prog = F.Program()
    test_prog = F.Program()
    startup_prog = F.Program()
    with F.program_guard(train_prog, startup_prog):
        with F.unique_name.guard():
            graph_label = L.data(name="label",
                                 shape=[None, args.num_tasks],
                                 dtype="float32")
            agent = create_model(args, config, graph_label)
            test_prog = train_prog.clone(for_test=True)

            opt = F.optimizer.Adam(learning_rate=args.lr)
            opt.minimize(agent.loss)

    place = F.CUDAPlace(0) if args.use_cuda else F.CPUPlace()
    exe = F.Executor(place)
    exe.run(startup_prog)

    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, train_prog)
        logging.info('Loaded %s' % args.init_model)

    list_val_auc, list_test_auc, best_val_auc = [], [], 0
    best_model = os.path.join(args.model_dir, 'best_model')
    for epoch_id in range(args.max_epoch):
        train(args, exe, train_prog, agent, train_data_list, epoch_id)
        val_auc = evaluate(args, exe, test_prog, agent, valid_data_list,
                           epoch_id)
        test_auc = evaluate(args, exe, test_prog, agent, test_data_list,
                            epoch_id)
        list_val_auc.append(val_auc)
        list_test_auc.append(test_auc)

        if best_val_auc < val_auc:
            if os.path.exists(best_model):
                shutil.rmtree(best_model)

            F.io.save_params(exe, best_model, train_prog)
            best_val_auc = val_auc

        test_auc_by_eval = list_test_auc[np.argmax(list_val_auc)]
        logging.info('%s Epoch %d val/auc: %f' % (args.exp, epoch_id, val_auc))
        logging.info('%s Epoch %d test/auc: %f' %
                     (args.exp, epoch_id, test_auc))
        logging.info('%s Epoch %d test/auc_by_eval: %f' % \
                     (args.exp, epoch_id, test_auc_by_eval))

    logging.info('%s final/test/auc_by_eval: %f' %
                 (args.exp, test_auc_by_eval))

    with open(os.path.join(args.log_dir, 'metric.json'), 'w') as f:
        best_epoch = int(np.argmax(list_val_auc))
        metric = {
            'val_auc': list_val_auc,
            'test_auc': list_test_auc,
            'best_epoch': best_epoch,
            'best_test_auc': list_test_auc[best_epoch],
            'init_model': '' if args.init_model is None else args.init_model
        }
        f.write(json.dumps(metric))
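This example builds its evaluation graph with train_prog.clone(for_test=True) before the optimizer is created, so the clone shares parameters with the training program but contains no optimizer ops and runs layers such as dropout in inference mode. A minimal sketch of that fluid 1.x-style static-graph pattern; the tiny network below is an assumption for illustration only:

import paddle
import paddle.fluid as F
import paddle.fluid.layers as L

paddle.enable_static()
train_prog, startup_prog = F.Program(), F.Program()
with F.program_guard(train_prog, startup_prog):
    x = L.data(name='x', shape=[None, 4], dtype='float32')
    y = L.data(name='y', shape=[None, 1], dtype='float32')
    h = L.dropout(L.fc(x, size=8), dropout_prob=0.5)
    pred = L.fc(h, size=1)
    loss = L.reduce_mean(L.square_error_cost(pred, y))
    # Clone BEFORE minimize so the test program keeps only the forward graph.
    test_prog = train_prog.clone(for_test=True)
    F.optimizer.Adam(learning_rate=1e-3).minimize(loss)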
Example #8
def main(args):
    paddle.enable_static()

    model_config = json.load(open(args.model_config, 'r'))

    exe_params = default_exe_params(args.is_distributed, args.use_cuda,
                                    args.thread_num)
    exe = exe_params['exe']
    trainer_num = exe_params['trainer_num']
    trainer_id = exe_params['trainer_id']
    dist_strategy = exe_params['dist_strategy']
    places = exe_params['places']

    task = model_config['task']

    model = TAPEModel(model_config=model_config, name=task)

    train_program = fluid.Program()
    train_startup = fluid.Program()
    with fluid.program_guard(train_program, train_startup):
        with fluid.unique_name.guard():
            model.forward(False)
            model.cal_loss()

            optimizer = default_optimizer(args.lr, args.warmup_steps,
                                          args.max_grad_norm)
            setup_optimizer(optimizer, model, args.use_cuda,
                            args.is_distributed, dist_strategy)

            optimizer.minimize(model.loss)

            train_data_loader = setup_data_loader(model.input_list,
                                                  model_config,
                                                  args.train_data, trainer_id,
                                                  trainer_num, places,
                                                  args.batch_size)
            exe.run(train_startup)

    train_metric = get_metric(task)
    train_fetch_list = model.get_fetch_list()

    if args.test_data is not None:
        test_program = fluid.Program()
        test_startup = fluid.Program()
        with fluid.program_guard(test_program, test_startup):
            with fluid.unique_name.guard():
                model.forward(True)
                model.cal_loss()
                test_data_loader = setup_data_loader(model.input_list,
                                                     model_config,
                                                     args.test_data,
                                                     trainer_id, trainer_num,
                                                     places, args.batch_size)
                exe.run(test_startup)
        test_metric = get_metric(task)
        test_fetch_list = model.get_fetch_list()

    # Load pretrained parameters before compiling, so the loader receives a
    # plain Program (test_program may not even exist when no test data is given).
    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, train_program)

    if not args.is_distributed:
        train_program = fluid.compiler.CompiledProgram(
            train_program).with_data_parallel(loss_name=model.loss.name)
        if args.test_data is not None and args.test_data != "":
            test_program = fluid.compiler.CompiledProgram(
                test_program).with_data_parallel(loss_name=model.loss.name)

    for epoch_id in range(args.max_epoch):
        print(time.time(), 'Start epoch %d' % epoch_id)
        print('Train:')
        train_metric.clear()
        for data in train_data_loader():
            results = exe.run(program=train_program,
                              feed=data,
                              fetch_list=train_fetch_list,
                              return_numpy=False)
            update_metric(task, train_metric, results)
            train_metric.show()
        print()

        if args.test_data is not None and args.test_data != "":
            print('Test:')
            test_metric.clear()
            for data in test_data_loader():
                results = exe.run(program=test_program,
                                  feed=data,
                                  fetch_list=test_fetch_list,
                                  return_numpy=False)
                update_metric(task, test_metric, results)
            test_metric.show()
            print()

        if trainer_id == 0:
            print(time.time(), "Save model epoch%d." % epoch_id)

            if not os.path.exists(args.model_dir):
                os.makedirs(args.model_dir)
            fluid.io.save_params(exe,
                                 '%s/epoch%d' % (args.model_dir, epoch_id),
                                 train_program)
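Only trainer 0 writes checkpoints, which keeps several distributed workers from racing on the same files; each epoch goes into its own <model_dir>/epoch<N> directory. A hedged sketch of that guard as a helper (helper name hypothetical; only fluid.io.save_params is taken from the script above):

import os
import paddle.fluid as fluid

def save_checkpoint(exe, program, model_dir, epoch_id, trainer_id):
    """Sketch: only trainer 0 writes, into <model_dir>/epoch<N>."""
    if trainer_id != 0:
        return
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    fluid.io.save_params(exe, os.path.join(model_dir, 'epoch%d' % epoch_id),
                         program)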
Example #9
def main(args):
    """
    Call the configuration function of the model, build the model and load data, then start training.

    model_config:
        a json file  with the  model configurations,such as dropout rate ,learning rate,num tasks and so on;

    context_pooling:
        it means the pooling type of context prediction;
    
    PreGNNContextpredModel:
        It is an unsupervised pretraining model which use subgraphs to predict their surrounding graph structures. Our goal is to pre-train a GNN so that it maps nodes appearing in similar structural contexts to nearby embeddings.

    """
    model_config = json.load(open(args.model_config, 'r'))
    if args.dropout_rate is not None:
        model_config['dropout_rate'] = args.dropout_rate
    model_config['context_pooling'] = args.context_pooling

    ### build model
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = PreGNNContextpredModel(model_config)
            model.forward()
            opt = fluid.optimizer.Adam(learning_rate=args.lr)
            if args.distributed:
                opt = get_distributed_optimizer(opt)
            opt.minimize(model.loss)
    with fluid.program_guard(test_prog, fluid.Program()):
        with fluid.unique_name.guard():
            model = PreGNNContextpredModel(model_config)
            model.forward(is_test=True)

    # Use CUDAPlace for GPU training, or use CPUPlace for CPU training.
    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) \
            if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, train_prog)

    ### load data
    # PreGNNContextPredFeaturizer:
    #     Used along with `PreGNNContextpredModel`. It inherits from the base class `Featurizer`, which is
    #     used for feature extraction. `Featurizer` has two functions: `gen_features` converts a single raw
    #     SMILES string into a single graph datum, and `collate_fn` aggregates a sublist of graph data into
    #     one big batch.
    # k is the number of layers; l1 and l2 are the inner and outer context radii, usually l1 < l2
    # (see the worked numbers after this example).
    # splitter:
    #     Split type of the dataset: random, scaffold, or random scaffold. A random split is used here.
    #     `ScaffoldSplitter` first orders the compounds by Bemis-Murcko scaffold,
    #     then takes the first `frac_train` proportion as the train set, the next `frac_valid` proportion
    #     as the valid set, and the rest as the test set. `ScaffoldSplitter` better evaluates the
    #     generalization ability of the model on out-of-distribution samples. Note that other splitters
    #     like `RandomSplitter`, `RandomScaffoldSplitter`, and `IndexSplitter` are also available.
    k = model_config['layer_num']
    l1 = k - 1
    l2 = l1 + args.context_size
    featurizer = PreGNNContextPredFeaturizer(
            model.substruct_graph_wrapper, 
            model.context_graph_wrapper, 
            k, l1, l2)
    dataset = load_zinc_dataset(args.data_path, featurizer=featurizer)

    splitter = RandomSplitter()
    train_dataset, _, test_dataset = splitter.split(
            dataset, frac_train=0.9, frac_valid=0, frac_test=0.1)
    if args.distributed:
        indices = list(range(fleet.worker_index(), len(train_dataset), fleet.worker_num()))
        train_dataset = train_dataset[indices]
    print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset)))

    ### start train
    # Run the train function and compute the train loss and test loss in each epoch.
    # The loop runs up to max_epoch; adjust it as needed.
    # Then the train loss and test loss are computed and printed, and the best
    # epoch's parameters are finally re-saved as the best model.
    list_test_loss = []
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, exe, train_prog, model, train_dataset, featurizer)
        test_loss = evaluate(args, exe, test_prog, model, test_dataset, featurizer)
        if not args.distributed or fleet.worker_index() == 0:
            fluid.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id), train_prog)
            list_test_loss.append(test_loss)
            print("epoch:%d train/loss:%s" % (epoch_id, train_loss))
            print("epoch:%d test/loss:%s" % (epoch_id, test_loss))

    if not args.distributed or fleet.worker_index() == 0:
        best_epoch_id = np.argmin(list_test_loss)
        fluid.io.load_params(exe, '%s/epoch%d' % (args.model_dir, best_epoch_id), train_prog)
        fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir), train_prog)
        return list_test_loss[best_epoch_id]
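Concretely, with layer_num k = 5 and --context_size 3, the inner radius is l1 = k - 1 = 4 and the outer radius is l2 = l1 + 3 = 7, so the context is the ring of nodes between 4 and 7 hops from the center node:

k = 5                   # model_config['layer_num']
context_size = 3        # args.context_size
l1 = k - 1              # inner context radius: 4
l2 = l1 + context_size  # outer context radius: 7
assert l1 < l2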
Example #10
def main(args):
    # Enable static graph mode.
    paddle.enable_static()

    model_config = json.load(open(args.model_config, 'r'))

    exe_params = default_exe_params(args.is_distributed, args.use_cuda, args.thread_num)
    exe = exe_params['exe']

    selector = None if not args.use_val else lambda l: l[:int(len(l)*0.8)]
    train_dataset = DTADataset(
        args.train_data, exe_params['trainer_id'],
        exe_params['trainer_num'],
        max_protein_len=model_config['protein']['max_protein_len'],
        subset_selector=selector)
    test_dataset = DTADataset(
        args.test_data, exe_params['trainer_id'],
        exe_params['trainer_num'],
        max_protein_len=model_config['protein']['max_protein_len'])

    if args.use_val:
        selector = lambda l: l[int(len(l)*0.8):]
        val_dataset = DTADataset(
            args.train_data, exe_params['trainer_id'],
            exe_params['trainer_num'],
            max_protein_len=model_config['protein']['max_protein_len'],
            subset_selector=selector)

    train_program = fluid.Program()
    train_startup = fluid.Program()
    with fluid.program_guard(train_program, train_startup):
        with fluid.unique_name.guard():
            model = DTAModel(
                model_config=model_config,
                use_pretrained_compound_gnns=args.use_pretrained_compound_gnns)
            model.train()
            test_program = train_program.clone(for_test=True)
            optimizer = fluid.optimizer.Adam(learning_rate=args.lr)
            setup_optimizer(optimizer, args.use_cuda, args.is_distributed)
            optimizer.minimize(model.loss)

    exe.run(train_startup)
    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, train_program)

    config = os.path.basename(args.model_config)
    best_mse, best_mse_, best_ci, best_ep = np.inf, np.inf, 0, 0
    best_model = os.path.join(args.model_dir, 'best_model')
    for epoch_id in range(1, args.max_epoch + 1):
        logging.info('========== Epoch {} =========='.format(epoch_id))
        train_loss = train(args, exe, train_program, model, train_dataset)
        logging.info('#{} Epoch: {}, Train loss: {}'.format(
            config, epoch_id, train_loss))
        metrics = evaluate(
            args, exe, test_program, model, test_dataset, best_mse_,
            val_dataset=None if not args.use_val else val_dataset)

        if args.use_val:
            mse, test_mse, test_ci = metrics
        else:
            mse, ci = metrics

        if mse < best_mse_:
            best_ep = epoch_id
            if os.path.exists(best_model):
                shutil.rmtree(best_model)
            fluid.io.save_params(exe, best_model, train_program)

        if not args.use_val and mse < best_mse_:
            best_mse, best_mse_, best_ci = mse, mse, ci
            save_metric(args.model_dir, epoch_id, best_mse, best_ci)
        elif args.use_val and mse < best_mse_:
            best_mse, best_mse_, best_ci = test_mse, mse, test_ci
            save_metric(args.model_dir, epoch_id, best_mse, best_ci)
        else:
            logging.info('No improvement in epoch {}'.format(epoch_id))
            with open(os.path.join(args.model_dir, 'eval.txt')) as eval_f:
                metric = eval_f.read()
            logging.info('===== Current best:\n{}'.format(metric))
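When --use_val is set, the training file is split 80/20 into train and validation parts through the subset_selector callables. Assuming the selector receives a plain list of loaded examples, the two selectors behave like this:

examples = list(range(10))  # stand-in for the loaded training examples
train_selector = lambda l: l[:int(len(l) * 0.8)]
val_selector = lambda l: l[int(len(l) * 0.8):]
print(train_selector(examples))  # [0, 1, 2, 3, 4, 5, 6, 7]
print(val_selector(examples))    # [8, 9]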