def test_train_model_on_task_with_FITBNameGraphVocabGGNN(self):
    preprocess_task_for_model(
        234,
        'FITBTask',
        self.task_filepath,
        'FITBNameGraphVocabGGNN',
        dataset_output_dir=self.output_dataset_dir,
        n_jobs=30,
        excluded_edge_types=frozenset(),
        data_encoder='new',
        data_encoder_kwargs=dict(max_name_encoding_length=10),
        instance_to_datapoints_kwargs=dict(max_nodes_per_graph=100))
    train(seed=1523,
          log_dir=self.log_dir,
          gpu_ids=(0, 1, 2, 3),
          model_name='FITBNameGraphVocabGGNN',
          data_encoder_filepath=os.path.join(
              self.output_dataset_dir,
              '{}.pkl'.format(FITBNameGraphVocabGGNN.DataEncoder.__name__)),
          model_kwargs=dict(hidden_size=50,
                            type_emb_size=15,
                            name_emb_size=15,
                            n_msg_pass_iters=3),
          init_fxn_name='Xavier',
          init_fxn_kwargs=dict(),
          loss_fxn_name='FITBLoss',
          loss_fxn_kwargs=dict(),
          optimizer_name='Adam',
          optimizer_kwargs={'learning_rate': 0.0002},
          train_data_directory=self.output_dataset_dir,
          val_fraction=0.15,
          n_workers=4,
          n_epochs=2,
          evaluation_metrics=('evaluate_FITB_accuracy',),
          n_batch=256,
          debug=True)
def test_train_model_on_task_memorize_minibatch_no_subtoken_edges_with_VarNamingNameGraphVocabGGNN(
        self):
    preprocess_task_for_model(
        234,
        'VarNamingTask',
        self.task_filepath,
        'VarNamingNameGraphVocabGGNN',
        dataset_output_dir=self.output_dataset_dir,
        n_jobs=30,
        excluded_edge_types=frozenset(),
        data_encoder='new',
        data_encoder_kwargs=dict(max_name_encoding_length=30,
                                 add_edges=False),
        instance_to_datapoints_kwargs=dict(max_nodes_per_graph=100))
    # Keep only one minibatch's worth of datapoint files (plus the pickled
    # DataEncoder) so the model can memorize them
    for f in [
            os.path.join(self.output_dataset_dir, f)
            for f in os.listdir(self.output_dataset_dir)
            if 'DataEncoder' not in f
    ][self.minibatch_size:]:
        os.remove(f)
    _, wordwise_accuracy = train(
        seed=1525,
        log_dir=self.log_dir,
        gpu_ids=(0, 1),
        model_name='VarNamingNameGraphVocabGGNN',
        data_encoder_filepath=os.path.join(
            self.output_dataset_dir,
            '{}.pkl'.format(VarNamingNameGraphVocabGGNN.DataEncoder.__name__)),
        model_kwargs=dict(hidden_size=128,
                          type_emb_size=30,
                          name_emb_size=30,
                          n_msg_pass_iters=3,
                          max_name_length=8),
        init_fxn_name='Xavier',
        init_fxn_kwargs=dict(),
        loss_fxn_name='VarNamingGraphVocabLoss',
        loss_fxn_kwargs=dict(),
        optimizer_name='Adam',
        optimizer_kwargs={'learning_rate': 0.0005},
        train_data_directory=self.output_dataset_dir,
        val_fraction=0.15,
        n_workers=4,
        n_epochs=15,
        evaluation_metrics=('evaluate_full_name_accuracy',),
        # -1 excludes the DataEncoder pickle from the datapoint file count
        n_batch=(len(os.listdir(self.output_dataset_dir)) - 1) * 10,
        test=True)
    self.assertGreaterEqual(wordwise_accuracy, 0.8)
def test_train_model_on_task_memorize_minibatch_with_FITBFixedVocabGAT(
        self):
    preprocess_task_for_model(
        234,
        'FITBTask',
        self.task_filepath,
        'FITBFixedVocabGAT',
        dataset_output_dir=self.output_dataset_dir,
        n_jobs=30,
        excluded_edge_types=frozenset(),
        data_encoder='new',
        data_encoder_kwargs=dict(),
        instance_to_datapoints_kwargs=dict(max_nodes_per_graph=100))
    # Keep only one minibatch's worth of datapoint files (plus the pickled
    # DataEncoder) so the model can memorize them
    for f in [
            os.path.join(self.output_dataset_dir, f)
            for f in os.listdir(self.output_dataset_dir)
            if 'DataEncoder' not in f
    ][self.minibatch_size:]:
        os.remove(f)
    _, accuracy = train(
        seed=1525,
        log_dir=self.log_dir,
        gpu_ids=(0, 1),
        model_name='FITBFixedVocabGAT',
        data_encoder_filepath=os.path.join(
            self.output_dataset_dir,
            '{}.pkl'.format(FITBFixedVocabGAT.DataEncoder.__name__)),
        model_kwargs=dict(hidden_size=128,
                          n_multi_attention_heads=2,
                          type_emb_size=30,
                          name_emb_size=30,
                          n_msg_pass_iters=2),
        init_fxn_name='Xavier',
        init_fxn_kwargs=dict(),
        loss_fxn_name='FITBLoss',
        loss_fxn_kwargs=dict(),
        optimizer_name='Adam',
        optimizer_kwargs={'learning_rate': 0.00075},
        train_data_directory=self.output_dataset_dir,
        val_fraction=0.15,
        n_workers=4,
        n_epochs=8,
        evaluation_metrics=('evaluate_FITB_accuracy',),
        # -1 excludes the DataEncoder pickle from the datapoint file count
        n_batch=(len(os.listdir(self.output_dataset_dir)) - 1) * 10,
        test=True)
    self.assertGreaterEqual(accuracy, 0.7)
def train_model_for_experiment(dataset_name: str,
                               experiment_name: str,
                               experiment_run_log_id: str,
                               seed: int,
                               gpu_ids: Tuple[int, ...],
                               model_name: str,
                               model_label: str,
                               model_kwargs: dict,
                               init_fxn_name: str,
                               init_fxn_kwargs: dict,
                               loss_fxn_name: str,
                               loss_fxn_kwargs: dict,
                               optimizer_name: str,
                               optimizer_kwargs: dict,
                               val_fraction: float,
                               n_workers: int,
                               n_epochs: int,
                               evaluation_metrics: Tuple[str, ...],
                               n_batch: int,
                               debug: bool = False,
                               skip_s3_sync: bool = False,
                               test: bool = False):
    """Assumes the data for this experiment has already been preprocessed, and pulls it from S3."""
    train_data_dir_suffix = os.path.join(dataset_name, 'experiments',
                                         experiment_name, 'seen_repos',
                                         'train_graphs')
    if test:
        # In test mode everything is read from the local test path; there is
        # no cloud path, so no S3 syncing happens
        s3shared_local_path = test_s3shared_path
    else:
        from experiments import s3shared_local_path, s3shared_cloud_path
        if not skip_s3_sync:
            s3_sync(
                os.path.join(
                    s3shared_cloud_path, train_data_dir_suffix,
                    '_'.join([model_name, model_label, 'preprocessed_data'])),
                os.path.join(
                    s3shared_local_path, train_data_dir_suffix,
                    '_'.join([model_name, model_label, 'preprocessed_data'])))
    local_train_dir = os.path.join(s3shared_local_path, train_data_dir_suffix)
    model_class = models.__dict__[model_name]
    log_dir_suffix = os.path.join('logs', experiment_run_log_id,
                                  '_'.join([model_name, model_label]))
    log_dir = os.path.join(local_train_dir, log_dir_suffix)
    train_data_dir = os.path.join(
        local_train_dir,
        '_'.join([model_name, model_label, 'preprocessed_data']))
    if test:
        s3_cloud_log_path = None
    else:
        s3_cloud_log_path = os.path.join(s3shared_cloud_path,
                                         train_data_dir_suffix,
                                         log_dir_suffix)
    train(seed=seed,
          log_dir=log_dir,
          gpu_ids=gpu_ids,
          model_name=model_name,
          data_encoder_filepath=os.path.join(
              train_data_dir,
              '{}.pkl'.format(model_class.DataEncoder.__name__)),
          model_kwargs=model_kwargs,
          init_fxn_name=init_fxn_name,
          init_fxn_kwargs=init_fxn_kwargs,
          loss_fxn_name=loss_fxn_name,
          loss_fxn_kwargs=loss_fxn_kwargs,
          optimizer_name=optimizer_name,
          optimizer_kwargs=optimizer_kwargs,
          train_data_directory=train_data_dir,
          val_fraction=val_fraction,
          n_workers=n_workers,
          n_epochs=n_epochs,
          evaluation_metrics=evaluation_metrics,
          n_batch=n_batch,
          s3shared_cloud_log_path=s3_cloud_log_path,
          debug=debug)
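# A minimal sketch (not part of the original suite) of how
# train_model_for_experiment might be invoked. The dataset name, experiment
# name, run id, model label, and hyperparameter values below are hypothetical
# placeholders; only the model, loss, initializer, optimizer, and metric names
# are taken from the surrounding tests.
if __name__ == '__main__':
    train_model_for_experiment(
        dataset_name='example_dataset',        # hypothetical
        experiment_name='example_experiment',  # hypothetical
        experiment_run_log_id='run_001',       # hypothetical
        seed=1523,
        gpu_ids=(0,),
        model_name='FITBFixedVocabGGNN',
        model_label='baseline',                # hypothetical
        model_kwargs=dict(hidden_size=64,
                          type_emb_size=30,
                          name_emb_size=30,
                          n_msg_pass_iters=3),
        init_fxn_name='Xavier',
        init_fxn_kwargs=dict(),
        loss_fxn_name='FITBLoss',
        loss_fxn_kwargs=dict(),
        optimizer_name='Adam',
        optimizer_kwargs={'learning_rate': 0.0002},
        val_fraction=0.15,
        n_workers=4,
        n_epochs=2,
        evaluation_metrics=('evaluate_FITB_accuracy',),
        n_batch=256,
        skip_s3_sync=True,  # don't touch S3 in this sketch
        test=True)          # read from the local test path instead of S3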
def test_evaluate_model_with_VarNamingNameGraphVocabGGNN(self):
    preprocess_task_for_model(
        seed=234,
        task_class_name='VarNamingTask',
        task_filepath=self.train_task_filepath,
        model_name='VarNamingNameGraphVocabGGNN',
        dataset_output_dir=self.train_output_dataset_dir,
        n_jobs=30,
        excluded_edge_types=frozenset(['LAST_WRITE']),
        data_encoder='new',
        data_encoder_kwargs=dict(max_name_encoding_length=10),
        instance_to_datapoints_kwargs=dict(max_nodes_per_graph=100))
    data_encoder = os.path.join(self.train_output_dataset_dir,
                                'VarNamingNameGraphVocabDataEncoder.pkl')
    # Preprocess the test data with the DataEncoder fit on the training data
    preprocess_task_for_model(
        seed=235,
        task_class_name='VarNamingTask',
        task_filepath=self.test_task_filepath,
        model_name='VarNamingNameGraphVocabGGNN',
        dataset_output_dir=self.test_output_dataset_dir,
        n_jobs=30,
        excluded_edge_types=frozenset(),
        data_encoder=data_encoder)
    train(seed=1523,
          log_dir=self.train_log_dir,
          gpu_ids=(0, 1, 2),
          model_name='VarNamingNameGraphVocabGGNN',
          data_encoder_filepath=os.path.join(
              self.train_output_dataset_dir,
              '{}.pkl'.format(
                  VarNamingNameGraphVocabGGNN.DataEncoder.__name__)),
          model_kwargs=dict(hidden_size=21,
                            type_emb_size=23,
                            name_emb_size=17,
                            n_msg_pass_iters=2,
                            max_name_length=4),
          init_fxn_name='Xavier',
          init_fxn_kwargs=dict(),
          loss_fxn_name='VarNamingLoss',
          loss_fxn_kwargs=dict(),
          optimizer_name='Adam',
          optimizer_kwargs={'learning_rate': 0.0002},
          train_data_directory=self.train_output_dataset_dir,
          val_fraction=0.15,
          n_workers=4,
          n_epochs=2,
          evaluation_metrics=('evaluate_full_name_accuracy',
                              'evaluate_subtokenwise_accuracy',
                              'evaluate_edit_distance'),
          n_batch=63)
    # Evaluate the trained model's best parameters on the held-out test data
    model_checkpoint_path = os.path.join(self.train_log_dir, 'model.pkl')
    model_params_path = os.path.join(self.train_log_dir, 'best.params')
    evaluate_model(seed=619,
                   log_dir=self.test_log_dir,
                   gpu_ids=(0, 1),
                   model_name='VarNamingNameGraphVocabGGNN',
                   model_filepath=model_checkpoint_path,
                   model_params_filepath=model_params_path,
                   test_data_directory=self.test_output_dataset_dir,
                   n_workers=5,
                   n_batch=68,
                   evaluation_metrics=('evaluate_full_name_accuracy',
                                       'evaluate_subtokenwise_accuracy',
                                       'evaluate_edit_distance'))
def test_evaluate_gives_the_same_results_as_in_training_loop_with_FITBFixedVocabGGNN(
        self):
    preprocess_task_for_model(
        seed=234,
        task_class_name='FITBTask',
        task_filepath=self.train_task_filepath,
        model_name='FITBFixedVocabGGNN',
        dataset_output_dir=self.train_output_dataset_dir,
        n_jobs=30,
        excluded_edge_types=frozenset(),
        data_encoder='new',
        data_encoder_kwargs=dict(),
        instance_to_datapoints_kwargs=dict(max_nodes_per_graph=100))
    data_encoder = os.path.join(self.train_output_dataset_dir,
                                'FITBFixedVocabDataEncoder.pkl')
    # Preprocess the test data with the DataEncoder fit on the training data
    preprocess_task_for_model(
        seed=235,
        task_class_name='FITBTask',
        task_filepath=self.test_task_filepath,
        model_name='FITBFixedVocabGGNN',
        dataset_output_dir=self.test_output_dataset_dir,
        n_jobs=30,
        excluded_edge_types=frozenset(),
        data_encoder=data_encoder)
    val_data, train_FITB_eval_accuracy = train(
        seed=1523,
        log_dir=self.train_log_dir,
        gpu_ids=(0, 1, 2),
        model_name='FITBFixedVocabGGNN',
        data_encoder_filepath=os.path.join(
            self.train_output_dataset_dir,
            '{}.pkl'.format(FITBFixedVocabGGNN.DataEncoder.__name__)),
        model_kwargs=dict(hidden_size=21,
                          type_emb_size=23,
                          name_emb_size=17,
                          n_msg_pass_iters=2),
        init_fxn_name='Xavier',
        init_fxn_kwargs=dict(),
        loss_fxn_name='FITBLoss',
        loss_fxn_kwargs=dict(),
        optimizer_name='Adam',
        optimizer_kwargs={'learning_rate': 0.0002},
        train_data_directory=self.train_output_dataset_dir,
        val_fraction=0.15,
        n_workers=4,
        n_epochs=2,
        evaluation_metrics=('evaluate_FITB_accuracy',),
        n_batch=63)
    # Remove everything except the validation datapoints (and the pickled
    # DataEncoder), so evaluate_model runs on exactly the validation set
    for f in [
            os.path.join(self.train_output_dataset_dir, f)
            for f in os.listdir(self.train_output_dataset_dir)
    ]:
        if f not in val_data and f != os.path.join(
                self.train_output_dataset_dir,
                'FITBFixedVocabDataEncoder.pkl'):
            os.remove(f)
    # Evaluating the saved epoch-1 checkpoint on the validation set should
    # reproduce the accuracy computed in the training loop
    model_checkpoint_path = os.path.join(self.train_log_dir, 'model.pkl')
    model_params_path = os.path.join(self.train_log_dir,
                                     'model_checkpoint_epoch_1.params')
    test_FITB_eval_accuracy = evaluate_model(
        seed=619,
        log_dir=self.test_log_dir,
        gpu_ids=(0, 1),
        model_name='FITBFixedVocabGGNN',
        model_filepath=model_checkpoint_path,
        model_params_filepath=model_params_path,
        test_data_directory=self.train_output_dataset_dir,
        n_workers=5,
        n_batch=68,
        evaluation_metrics=('evaluate_FITB_accuracy',))
    self.assertEqual(train_FITB_eval_accuracy, test_FITB_eval_accuracy)
    # Evaluating with best.params under a different seed, batch size, and GPU
    # count should give the same accuracy
    model_params_path = os.path.join(self.train_log_dir, 'best.params')
    test_FITB_eval_accuracy = evaluate_model(
        seed=214,
        log_dir=self.test_log_dir,
        gpu_ids=(0,),
        model_name='FITBFixedVocabGGNN',
        model_filepath=model_checkpoint_path,
        model_params_filepath=model_params_path,
        test_data_directory=self.train_output_dataset_dir,
        n_workers=5,
        n_batch=55,
        evaluation_metrics=('evaluate_FITB_accuracy',))
    self.assertEqual(train_FITB_eval_accuracy, test_FITB_eval_accuracy)