# NOTE: these snippets assume the surrounding project's imports, e.g. os,
# typing.Tuple, preprocess_task_for_model, train, evaluate_model, s3_sync,
# the models module, and the model classes referenced below.
def test_train_model_on_task_with_FITBNameGraphVocabGGNN(self):
     preprocess_task_for_model(
         234,
         'FITBTask',
         self.task_filepath,
         'FITBNameGraphVocabGGNN',
         dataset_output_dir=self.output_dataset_dir,
         n_jobs=30,
         excluded_edge_types=frozenset(),
         data_encoder='new',
         data_encoder_kwargs=dict(max_name_encoding_length=10),
         instance_to_datapoints_kwargs=dict(max_nodes_per_graph=100))
     train(seed=1523,
           log_dir=self.log_dir,
           gpu_ids=(0, 1, 2, 3),
           model_name='FITBNameGraphVocabGGNN',
           data_encoder_filepath=os.path.join(
               self.output_dataset_dir,
               '{}.pkl'.format(FITBNameGraphVocabGGNN.DataEncoder.__name__)),
           model_kwargs=dict(hidden_size=50,
                             type_emb_size=15,
                             name_emb_size=15,
                             n_msg_pass_iters=3),
           init_fxn_name='Xavier',
           init_fxn_kwargs=dict(),
           loss_fxn_name='FITBLoss',
           loss_fxn_kwargs=dict(),
           optimizer_name='Adam',
           optimizer_kwargs={'learning_rate': 0.0002},
           train_data_directory=self.output_dataset_dir,
           val_fraction=0.15,
           n_workers=4,
           n_epochs=2,
           evaluation_metrics=('evaluate_FITB_accuracy', ),
           n_batch=256,
           debug=True)

 def test_train_model_on_task_memorize_minibatch_no_subtoken_edges_with_VarNamingNameGraphVocabGGNN(
         self):
     preprocess_task_for_model(
         234,
         'VarNamingTask',
         self.task_filepath,
         'VarNamingNameGraphVocabGGNN',
         dataset_output_dir=self.output_dataset_dir,
         n_jobs=30,
         excluded_edge_types=frozenset(),
         data_encoder='new',
         data_encoder_kwargs=dict(max_name_encoding_length=30,
                                  add_edges=False),
         instance_to_datapoints_kwargs=dict(max_nodes_per_graph=100))
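     # Keep only the first minibatch_size datapoint files (the DataEncoder
     # pickle is exempt) so the model can memorize a single minibatch.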
     for f in [
             os.path.join(self.output_dataset_dir, f)
             for f in os.listdir(self.output_dataset_dir)
             if 'DataEncoder' not in f
     ][self.minibatch_size:]:
         os.remove(f)
     _, wordwise_accuracy = train(
         seed=1525,
         log_dir=self.log_dir,
         gpu_ids=(0, 1),
         model_name='VarNamingNameGraphVocabGGNN',
         data_encoder_filepath=os.path.join(
             self.output_dataset_dir, '{}.pkl'.format(
                 VarNamingNameGraphVocabGGNN.DataEncoder.__name__)),
         model_kwargs=dict(hidden_size=128,
                           type_emb_size=30,
                           name_emb_size=30,
                           n_msg_pass_iters=3,
                           max_name_length=8),
         init_fxn_name='Xavier',
         init_fxn_kwargs=dict(),
         loss_fxn_name='VarNamingGraphVocabLoss',
         loss_fxn_kwargs=dict(),
         optimizer_name='Adam',
         optimizer_kwargs={'learning_rate': 0.0005},
         train_data_directory=self.output_dataset_dir,
         val_fraction=0.15,
         n_workers=4,
         n_epochs=15,
         evaluation_metrics=('evaluate_full_name_accuracy', ),
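         # batch size = 10x the surviving datapoint files; the -1 excludes
         # the DataEncoder pickle that os.listdir also counts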
         n_batch=(len(os.listdir(self.output_dataset_dir)) - 1) * 10,
         test=True)
     self.assertGreaterEqual(wordwise_accuracy, 0.8)

 def test_train_model_on_task_memorize_minibatch_with_FITBFixedVocabGAT(
         self):
     preprocess_task_for_model(
         234,
         'FITBTask',
         self.task_filepath,
         'FITBFixedVocabGAT',
         dataset_output_dir=self.output_dataset_dir,
         n_jobs=30,
         excluded_edge_types=frozenset(),
         data_encoder='new',
         data_encoder_kwargs=dict(),
         instance_to_datapoints_kwargs=dict(max_nodes_per_graph=100))
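     # Keep only the first minibatch_size datapoint files (the DataEncoder
     # pickle is exempt) so the model can memorize a single minibatch.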
     for f in [
             os.path.join(self.output_dataset_dir, f)
             for f in os.listdir(self.output_dataset_dir)
             if 'DataEncoder' not in f
     ][self.minibatch_size:]:
         os.remove(f)
     _, accuracy = train(
         seed=1525,
         log_dir=self.log_dir,
         gpu_ids=(0, 1),
         model_name='FITBFixedVocabGAT',
         data_encoder_filepath=os.path.join(
             self.output_dataset_dir,
             '{}.pkl'.format(FITBFixedVocabGAT.DataEncoder.__name__)),
         model_kwargs=dict(hidden_size=128,
                           n_multi_attention_heads=2,
                           type_emb_size=30,
                           name_emb_size=30,
                           n_msg_pass_iters=2),
         init_fxn_name='Xavier',
         init_fxn_kwargs=dict(),
         loss_fxn_name='FITBLoss',
         loss_fxn_kwargs=dict(),
         optimizer_name='Adam',
         optimizer_kwargs={'learning_rate': 0.00075},
         train_data_directory=self.output_dataset_dir,
         val_fraction=0.15,
         n_workers=4,
         n_epochs=8,
         evaluation_metrics=('evaluate_FITB_accuracy', ),
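         # batch size = 10x the surviving datapoint files; the -1 excludes
         # the DataEncoder pickle that os.listdir also counts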
         n_batch=(len(os.listdir(self.output_dataset_dir)) - 1) * 10,
         test=True)
     self.assertGreaterEqual(accuracy, 0.7)

def train_model_for_experiment(dataset_name: str,
                               experiment_name: str,
                               experiment_run_log_id: str,
                               seed: int,
                               gpu_ids: Tuple[int, ...],
                               model_name: str,
                               model_label: str,
                               model_kwargs: dict,
                               init_fxn_name: str,
                               init_fxn_kwargs: dict,
                               loss_fxn_name: str,
                               loss_fxn_kwargs: dict,
                               optimizer_name: str,
                               optimizer_kwargs: dict,
                               val_fraction: float,
                               n_workers: int,
                               n_epochs: int,
                               evaluation_metrics: Tuple[str, ...],
                               n_batch: int,
                               debug: bool = False,
                               skip_s3_sync: bool = False,
                               test: bool = False):
    # Assumes we've already preprocessed the data for the experiment, and we're pulling it from s3
    train_data_dir_suffix = os.path.join(dataset_name, 'experiments',
                                         experiment_name, 'seen_repos',
                                         'train_graphs')
    if test:
        s3shared_local_path = test_s3shared_path
    else:
        from experiments import s3shared_local_path, s3shared_cloud_path
        if not skip_s3_sync:
            s3_sync(
                os.path.join(
                    s3shared_cloud_path, train_data_dir_suffix,
                    '_'.join([model_name, model_label, 'preprocessed_data'])),
                os.path.join(
                    s3shared_local_path, train_data_dir_suffix,
                    '_'.join([model_name, model_label, 'preprocessed_data'])))
    local_train_dir = os.path.join(s3shared_local_path, train_data_dir_suffix)

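    # Resolve the model class by name from the models module.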
    model_class = getattr(models, model_name)

    log_dir_suffix = os.path.join('logs', experiment_run_log_id,
                                  '_'.join([model_name, model_label]))
    log_dir = os.path.join(local_train_dir, log_dir_suffix)
    train_data_dir = os.path.join(
        local_train_dir,
        '_'.join([model_name, model_label, 'preprocessed_data']))
    if test:
        s3_cloud_log_path = None
    else:
        s3_cloud_log_path = os.path.join(s3shared_cloud_path,
                                         train_data_dir_suffix, log_dir_suffix)
    train(seed=seed,
          log_dir=log_dir,
          gpu_ids=gpu_ids,
          model_name=model_name,
          data_encoder_filepath=os.path.join(
              train_data_dir,
              '{}.pkl'.format(model_class.DataEncoder.__name__)),
          model_kwargs=model_kwargs,
          init_fxn_name=init_fxn_name,
          init_fxn_kwargs=init_fxn_kwargs,
          loss_fxn_name=loss_fxn_name,
          loss_fxn_kwargs=loss_fxn_kwargs,
          optimizer_name=optimizer_name,
          optimizer_kwargs=optimizer_kwargs,
          train_data_directory=train_data_dir,
          val_fraction=val_fraction,
          n_workers=n_workers,
          n_epochs=n_epochs,
          evaluation_metrics=evaluation_metrics,
          n_batch=n_batch,
          s3shared_cloud_log_path=s3_cloud_log_path,
          debug=debug)
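
# A minimal usage sketch for train_model_for_experiment. All argument values
# below are hypothetical placeholders (the dataset, experiment, and label
# names are not from the original code); hyperparameters mirror the tests above.
if __name__ == '__main__':
    train_model_for_experiment(
        dataset_name='example_dataset',
        experiment_name='example_experiment',
        experiment_run_log_id='run_0001',
        seed=1523,
        gpu_ids=(0, ),
        model_name='FITBFixedVocabGGNN',
        model_label='baseline',
        model_kwargs=dict(hidden_size=64,
                          type_emb_size=30,
                          name_emb_size=30,
                          n_msg_pass_iters=3),
        init_fxn_name='Xavier',
        init_fxn_kwargs=dict(),
        loss_fxn_name='FITBLoss',
        loss_fxn_kwargs=dict(),
        optimizer_name='Adam',
        optimizer_kwargs={'learning_rate': 0.0002},
        val_fraction=0.15,
        n_workers=4,
        n_epochs=2,
        evaluation_metrics=('evaluate_FITB_accuracy', ),
        n_batch=256,
        test=True)  # test=True uses the local test path and skips the s3 sync
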
 def test_evaluate_model_with_VarNamingNameGraphVocabGGNN(self):
     preprocess_task_for_model(
         seed=234,
         task_class_name='VarNamingTask',
         task_filepath=self.train_task_filepath,
         model_name='VarNamingNameGraphVocabGGNN',
         dataset_output_dir=self.train_output_dataset_dir,
         n_jobs=30,
         excluded_edge_types=frozenset(['LAST_WRITE']),
         data_encoder='new',
         data_encoder_kwargs=dict(max_name_encoding_length=10),
         instance_to_datapoints_kwargs=dict(max_nodes_per_graph=100))
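     # Reuse the DataEncoder pickled during train-set preprocessing for the
     # test set.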
     data_encoder = os.path.join(self.train_output_dataset_dir,
                                 'VarNamingNameGraphVocabDataEncoder.pkl')
     preprocess_task_for_model(
         seed=235,
         task_class_name='VarNamingTask',
         task_filepath=self.test_task_filepath,
         model_name='VarNamingNameGraphVocabGGNN',
         dataset_output_dir=self.test_output_dataset_dir,
         n_jobs=30,
         excluded_edge_types=frozenset(),
         data_encoder=data_encoder)
     train(seed=1523,
           log_dir=self.train_log_dir,
           gpu_ids=(0, 1, 2),
           model_name='VarNamingNameGraphVocabGGNN',
           data_encoder_filepath=os.path.join(
               self.train_output_dataset_dir, '{}.pkl'.format(
                   VarNamingNameGraphVocabGGNN.DataEncoder.__name__)),
           model_kwargs=dict(hidden_size=21,
                             type_emb_size=23,
                             name_emb_size=17,
                             n_msg_pass_iters=2,
                             max_name_length=4),
           init_fxn_name='Xavier',
           init_fxn_kwargs=dict(),
           loss_fxn_name='VarNamingLoss',
           loss_fxn_kwargs=dict(),
           optimizer_name='Adam',
           optimizer_kwargs={'learning_rate': 0.0002},
           train_data_directory=self.train_output_dataset_dir,
           val_fraction=0.15,
           n_workers=4,
           n_epochs=2,
           evaluation_metrics=('evaluate_full_name_accuracy',
                               'evaluate_subtokenwise_accuracy',
                               'evaluate_edit_distance'),
           n_batch=63)
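     # Paths written by train(): the pickled model and its 'best.params'
     # checkpoint (presumably the best validation performance).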
     model_checkpoint_path = os.path.join(self.train_log_dir, 'model.pkl')
     model_params_path = os.path.join(self.train_log_dir, 'best.params')
     evaluate_model(seed=619,
                    log_dir=self.test_log_dir,
                    gpu_ids=(0, 1),
                    model_name='VarNamingNameGraphVocabGGNN',
                    model_filepath=model_checkpoint_path,
                    model_params_filepath=model_params_path,
                    test_data_directory=self.test_output_dataset_dir,
                    n_workers=5,
                    n_batch=68,
                    evaluation_metrics=('evaluate_full_name_accuracy',
                                        'evaluate_subtokenwise_accuracy',
                                        'evaluate_edit_distance'))
 def test_evaluate_gives_the_same_results_as_in_training_loop_with_FITBFixedVocabGGNN(
         self):
     preprocess_task_for_model(
         seed=234,
         task_class_name='FITBTask',
         task_filepath=self.train_task_filepath,
         model_name='FITBFixedVocabGGNN',
         dataset_output_dir=self.train_output_dataset_dir,
         n_jobs=30,
         excluded_edge_types=frozenset(),
         data_encoder='new',
         data_encoder_kwargs=dict(),
         instance_to_datapoints_kwargs=dict(max_nodes_per_graph=100))
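     # Reuse the DataEncoder pickled during train-set preprocessing for the
     # test set.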
     data_encoder = os.path.join(self.train_output_dataset_dir,
                                 'FITBFixedVocabDataEncoder.pkl')
     preprocess_task_for_model(
         seed=235,
         task_class_name='FITBTask',
         task_filepath=self.test_task_filepath,
         model_name='FITBFixedVocabGGNN',
         dataset_output_dir=self.test_output_dataset_dir,
         n_jobs=30,
         excluded_edge_types=frozenset(),
         data_encoder=data_encoder)
     val_data, train_FITB_eval_accuracy = train(
         seed=1523,
         log_dir=self.train_log_dir,
         gpu_ids=(0, 1, 2),
         model_name='FITBFixedVocabGGNN',
         data_encoder_filepath=os.path.join(
             self.train_output_dataset_dir,
             '{}.pkl'.format(FITBFixedVocabGGNN.DataEncoder.__name__)),
         model_kwargs=dict(hidden_size=21,
                           type_emb_size=23,
                           name_emb_size=17,
                           n_msg_pass_iters=2),
         init_fxn_name='Xavier',
         init_fxn_kwargs=dict(),
         loss_fxn_name='FITBLoss',
         loss_fxn_kwargs=dict(),
         optimizer_name='Adam',
         optimizer_kwargs={'learning_rate': 0.0002},
         train_data_directory=self.train_output_dataset_dir,
         val_fraction=0.15,
         n_workers=4,
         n_epochs=2,
         evaluation_metrics=('evaluate_FITB_accuracy', ),
         n_batch=63)
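     # Remove the training files, keeping only the validation datapoints and
     # the DataEncoder pickle, so evaluate_model sees exactly the val set.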
     for f in [
             os.path.join(self.train_output_dataset_dir, f)
             for f in os.listdir(self.train_output_dataset_dir)
     ]:
         if f not in val_data and f != os.path.join(
                 self.train_output_dataset_dir,
                 'FITBFixedVocabDataEncoder.pkl'):
             os.remove(f)
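     # Evaluate first with the epoch-1 checkpoint (presumably the final
     # epoch's parameters, given n_epochs=2 with zero-indexed epochs).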
     model_checkpoint_path = os.path.join(self.train_log_dir, 'model.pkl')
     model_params_path = os.path.join(self.train_log_dir,
                                      'model_checkpoint_epoch_1.params')
     test_FITB_eval_accuracy = evaluate_model(
         seed=619,
         log_dir=self.test_log_dir,
         gpu_ids=(0, 1),
         model_name='FITBFixedVocabGGNN',
         model_filepath=model_checkpoint_path,
         model_params_filepath=model_params_path,
         test_data_directory=self.train_output_dataset_dir,
         n_workers=5,
         n_batch=68,
         evaluation_metrics=('evaluate_FITB_accuracy', ))
     self.assertEqual(train_FITB_eval_accuracy, test_FITB_eval_accuracy)
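     # The standalone evaluation should also match when loading the
     # 'best.params' checkpoint.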
     model_params_path = os.path.join(self.train_log_dir, 'best.params')
     test_FITB_eval_accuracy = evaluate_model(
         seed=214,
         log_dir=self.test_log_dir,
         gpu_ids=(0, ),
         model_name='FITBFixedVocabGGNN',
         model_filepath=model_checkpoint_path,
         model_params_filepath=model_params_path,
         test_data_directory=self.train_output_dataset_dir,
         n_workers=5,
         n_batch=55,
         evaluation_metrics=('evaluate_FITB_accuracy', ))
     self.assertEqual(train_FITB_eval_accuracy, test_FITB_eval_accuracy)