    'TARGET_UPDATE_STEPS': 100,
    'LEARNING_RATE': 1e-3,
    'REPLAY_BUFFER_SIZE': 1000,
    'MIN_REPLAY_BUFFER_SIZE': 100,
    'EPSILON_START': 1,
    'EPSILON_END': 0.1,
    'EPSILON_DECAY_DURATION': 5000,
}

# Allow changing hyperparameters from command-line arguments
args = get_args(default_args=args_dict)

# Create wrapped environment
env = make_env(args.ENV_ID)

# Set seed
set_seed(env, args.SEED)

# GPU or CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Create agent
agent = Agent(env, device, args)

# Train agent for args.NB_FRAMES frames
agent.train()

# Save agent
agent.save()

# Test agent
agent.test(render=False)
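# Usage sketch (hypothetical): assuming get_args exposes every key of args_dict
# (plus ENV_ID, SEED, NB_FRAMES) verbatim as a command-line flag, the defaults
# above can be overridden like so; the script name is illustrative:
#
#   python train_dqn.py --ENV_ID CartPole-v1 --SEED 0 --LEARNING_RATE 5e-4
#
# Any hyperparameter not passed on the command line keeps its value from args_dict.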
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")
    parser.add_argument("--mode", default='transfer',
                        choices=['transfer', 'meta', 'lrcmeta', 'svimeta'],
                        help="Training mode.")
    parser.add_argument("--meta_emb_dim", default=100, type=int,
                        help="Dimensionality of task and language embeddings.")
    parser.add_argument("--n_samples", default=3, type=int,
                        help="Number of samples in the Bayesian mode.")
    parser.add_argument("--scaling", default='uniform', type=str,
                        choices=['uniform', 'linear_annealing', 'logistic_annealing'],
                        help="Scaling for KL term in VI.")
    parser.add_argument("--max_patience", default=10, type=int,
                        help="Maximum patience for early stopping.")
    parser.add_argument("--weight_by_size", action='store_true',
                        help="Sample task-language examples according to data size, weight lr accordingly.")
    parser.add_argument("--num_hidden_layers", default=6, type=int,
                        help="Number of hidden layers for the functions psi and phi.")
    parser.add_argument("--rank_cov", default=0, type=int,
                        help="Rank of the factored covariance matrix. Diagonal if < 1.")
    parser.add_argument("--typ_dist", default="src/typ_feats.tab", type=str,
                        help="File containing pre-computed typological distances between languages.")
    parser.add_argument("--largest_source", action='store_true',
                        help="Always choose the source language with the largest number of examples for transfer.")
    parser.add_argument("--model_averaging", action='store_true',
                        help="Predict through model averaging rather than plugging in the mean.")

    # Experiment
    parser.add_argument("--model_name_or_path", default="bert-base-multilingual-cased", type=str,
                        help="Path to pre-trained model or shortcut name.")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where to store the pre-trained models downloaded from s3.")
    parser.add_argument("--no_train", action='store_true',
                        help="Whether to skip training.")
    parser.add_argument("--no_eval", action='store_true',
                        help="Whether to skip evaluation on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--debug", action='store_true',
                        help="Whether to debug gradient flow.")
    parser.add_argument("--max_seq_length", default=250, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate", default=5e-6, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=8.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument("--warmup_proportion", default=0., type=float,
                        help="Proportion of training steps for linear learning-rate warmup.")
    parser.add_argument('--logging_steps', type=int, default=2500,
                        help="Log every X update steps.")
    parser.add_argument('--save_steps', type=int, default=2500,
                        help="Save a checkpoint every X update steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with the step number.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available.")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory.")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets.")
    parser.add_argument('--seed', type=int, default=42,
                        help="Random seed for initialization.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on GPUs.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit.")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    args = parser.parse_args()

    if args.largest_source:
        assert args.mode == "transfer"

    args.tasks = ['pos', 'ner']
    args.languages = sorted([
        'aii', 'am', 'ar', 'bm', 'cy', 'et', 'eu', 'fi', 'fo', 'gl', 'gun',
        'he', 'hsb', 'hu', 'hy', 'id', 'kk', 'kmr', 'ko', 'kpv', 'mt', 'myv',
        'sme', 'ta', 'te', 'th', 'tl', 'tr', 'ug', 'vi', 'wo', 'yo', 'yue'
    ])
    # NER changes zh-yue -> yue, gn -> gun, sme -> sm, arc -> aii, ku -> kmr, kv -> kpv
    # find_overlapping_languages(args.tasks)

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and not args.no_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Creating partitions of observed and held-out data for each iteration
    cartesian_product = list(
        itertools.product(sorted(args.tasks), sorted(args.languages)))
    partitions = {
        ti: sorted([
            cartesian_product[pi]
            for pi in range(ti, len(cartesian_product), len(args.tasks))
        ])
        for ti in range(len(args.tasks))
    }

    for partition, heldout_pairs in partitions.items():
        logger.info("Partition: {}".format(partition))
        logger.info("Held-out task-language pairs: {}".format(heldout_pairs))
        observed_pairs = sorted(list(set(cartesian_product) - set(heldout_pairs)))
        lang_nns = find_nearest_languages(heldout_pairs, observed_pairs, args.typ_dist)

        # Load pretrained model and tokenizer
        if args.local_rank not in [-1, 0]:
            # Make sure only the first process in distributed training will download model & vocab
            torch.distributed.barrier()

        config_class, tokenizer_class, model_class = BertConfig, BertTokenizer, MultiTaskBert

        # Data and model
        config = config_class.from_pretrained(args.model_name_or_path)
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path,
                                                    do_lower_case=args.do_lower_case)
        dataloaders, num_batches = load_and_cache_examples(
            args, tokenizer,
            observed_pairs if not args.largest_source else
            [('pos', 'en'), ('ner', 'en_{}'.format(partition))], partition)
        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config,
            mode=args.mode,
            languages=args.languages,
            n_classes={t: len(c) for t, c in CLASSES_PER_TASK.items()},
            emb_dim=args.meta_emb_dim,
            n_samples=args.n_samples,
            num_hidden_layers=args.num_hidden_layers,
            rank_cov=args.rank_cov,
            largest_source=args.largest_source)

        if args.local_rank == 0:
            # Make sure only the first process in distributed training will download model & vocab
            torch.distributed.barrier()

        model.to(args.device)
        logger.info("Training/evaluation parameters %s", args)

        # Training
        if not args.no_train:
            global_step, tr_loss = train(args, dataloaders, model, tokenizer,
                                         num_batches, observed_pairs,
                                         heldout_pairs, partition, lang_nns)
            logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
            model = model_class.from_pretrained(
                os.path.join(args.output_dir,
                             'checkpoint-best-{}'.format(partition)),
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config,
                mode=args.mode,
                languages=args.languages,
                n_classes={t: len(c) for t, c in CLASSES_PER_TASK.items()},
                emb_dim=args.meta_emb_dim,
                n_samples=args.n_samples,
                num_hidden_layers=args.num_hidden_layers,
                rank_cov=args.rank_cov,
                largest_source=args.largest_source)

        # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
        if not args.no_eval and args.local_rank in [-1, 0]:
            # Evaluate
            with torch.no_grad():
                result = evaluate(args, dataloaders, model, tokenizer, 'test',
                                  heldout_pairs, lang_nns, partition,
                                  prefix=partition, sample=args.model_averaging)
            logger.info("Results: {}".format(result))

        del model
        gc.collect()
        torch.cuda.empty_cache()
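# Illustrative invocation (a sketch, not a verified command; the file name and the
# values are assumptions, the flags are those defined in main() above):
#
#   python main.py --output_dir runs/svimeta --mode svimeta \
#       --meta_emb_dim 100 --n_samples 3 --per_gpu_train_batch_size 8 \
#       --learning_rate 5e-6 --num_train_epochs 8 --overwrite_output_dir
#
# Note on the partition loop: with 2 tasks and 33 languages, the cartesian product
# contains 66 task-language pairs, and the two partitions split them by alternating
# indices, so every pair is held out in exactly one partition while its language
# still appears in the observed set through the other task.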