Exemple #1
0
    def __init__(self, args, dictionary, left_pad=False):
        super().__init__(dictionary)
        self.dropout = args.dropout

        from pytorch_transformers import RobertaModel, BertModel
        from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
        from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer

        if args.pretrained_bert_model.startswith('roberta'):
            self.embed = RobertaModel.from_pretrained(
                args.pretrained_bert_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            # self.context = RobertaModel.from_pretrained(args.pretrained_bert_model,
            #         cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
            self.config = RobertaConfig.from_pretrained(
                args.pretrained_bert_model)
            self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        else:
            self.embed = BertModel.from_pretrained(
                args.pretrained_bert_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            # self.context = BertModel.from_pretrained(args.pretrained_bert_model,
            #         cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
            self.config = BertConfig.from_pretrained(
                args.pretrained_bert_model)

            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.padding_idx = self.tokenizer.convert_tokens_to_ids(
            self.tokenizer.pad_token)
def main():

    bert_base_config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
    bert_base_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=bert_base_config)
    count = 0
    for name, param in bert_base_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in bert_base_uncased: ', count)

    roberta_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base',config=roberta_config)
    count = 0
    for name, param in roberta_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in roberta: ', count)

    albert_config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=2)
    albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', config=albert_config)
    count = 0
    for name, param in albert_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in albert: ', count)
        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = RobertaConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range)

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
Exemple #4
0
from mspan_roberta_gcn.drop_roberta_dataset import DropReader
from pytorch_transformers import RobertaTokenizer, RobertaModel, RobertaConfig


parser = argparse.ArgumentParser("Bert inference task.")
options.add_bert_args(parser)
options.add_model_args(parser)
options.add_inference_args(parser)

args = parser.parse_args()

args.cuda = torch.cuda.device_count() > 0


print("Build bert model.")
bert_model = RobertaModel(RobertaConfig().from_pretrained(args.roberta_model))
print("Build Drop model.")
network = NumericallyAugmentedBertNet(bert_model,
                hidden_size=bert_model.config.hidden_size,
                dropout_prob=0.0,
                use_gcn=args.use_gcn,
                gcn_steps=args.gcn_steps)

if args.cuda: network.cuda()
print("Load from pre path {}.".format(args.pre_path))
network.load_state_dict(torch.load(args.pre_path))

print("Load data from {}.".format(args.inf_path))
tokenizer = RobertaTokenizer.from_pretrained(args.roberta_model)
inf_iter = DropBatchGen(args, tokenizer, DropReader(tokenizer, passage_length_limit=463, question_length_limit=46)._read(args.inf_path))
    def __init__(self, args, dictionary, embed_tokens, left_pad=False):
        super().__init__(dictionary)
        self.dropout = args.dropout

        # from pytorch_transformers import RobertaModel
        from fairseq.modules.roberta_causal_mask import RobertaCasulMaskModel, BertCasulMaskModel
        from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
        from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer

        if args.roberta_model.startswith('roberta'):
            self.roberta = RobertaCasulMaskModel.from_pretrained(
                args.roberta_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            self.config = RobertaConfig.from_pretrained(args.roberta_model)
            self.tokenizer = RobertaTokenizer.from_pretrained(
                args.roberta_model)
        else:
            self.roberta = BertCasulMaskModel.from_pretrained(
                args.roberta_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            self.config = BertConfig.from_pretrained(args.roberta_model)
            self.tokenizer = BertTokenizer.from_pretrained(args.roberta_model)
        self.config.output_attentions = True
        self.roberta.pooler.dense.weight.requires_grad = False
        self.roberta.pooler.dense.bias.requires_grad = False

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx

        # self.embed_tokens = embed_tokens
        # self.embed_scale = math.sqrt(embed_dim)

        self.args = args

        # if args.sentence_transformer_arch == 'fairseq':
        #     self.padding_idx = embed_tokens.padding_idx

        #     self.sent_embed_positions = PositionalEmbedding(
        #         1024, embed_dim, self.padding_idx,
        #         left_pad=False,
        #         learned=args.encoder_learned_pos,
        #     )

        #     self.doc_layers = nn.ModuleList([])
        #     self.doc_layers.extend([
        #         TransformerEncoderLayer(args)
        #         for i in range(args.encoder_layers)
        #     ])
        if args.sentence_transformer_arch == 'bert':
            # from pytorch_transformers import RobertaConfig, RobertaTokenizer

            # self.config = RobertaConfig.from_pretrained(args.roberta_model)
            # self.config.output_attentions = True
            # self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

            embed_dim = self.config.hidden_size
            print('*** padding idx before ***', embed_tokens.padding_idx)
            self.padding_idx = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.pad_token)
            print('*** padding idx after ***', self.padding_idx)

            # let's assume each document has at most 128-self.padding_idx-1 sentences
            # in case of roberta, it is 126
            self.sent_position_embeddings = nn.Embedding(128, embed_dim)
            if args.encoder_layers:
                self.config.num_hidden_layers = args.encoder_layers
            if args.dropout:
                self.config.hidden_dropout_prob = args.dropout
            if args.attention_dropout:
                self.config.attention_probs_dropout_prob = args.attention_dropout
            if args.attn_type == 'attn_score':
                self.sent_encoder = AttnScoreBertEncoder(self.config)
            elif args.attn_type == 'attn_prob':
                self.sent_encoder = BertEncoder(self.config)
            else:
                raise Exception('--attn-type doesn\'t support {} yet !'.format(
                    args.attn_type))
            self.sent_encoder.apply(self._init_weights)

            print('*** sentence encoder config ***')
            print(self.config)
        else:
            raise Exception(
                '--sentence-transformer-arch doesn\'t support {} yet!'.format(
                    args.sentence_transformer_arch))
Exemple #6
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default='/hdd/lujunyu/dataset/multi_turn_corpus/ubuntu/',
                        type=str,
                        required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name",
                        default='ubuntu',
                        type=str,
                        required=False,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default='/hdd/lujunyu/model/ubuntu_roberta_new/',
                        type=str,
                        required=False,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--init_checkpoint",
                        default='/hdd/lujunyu/model/ubuntu_roberta_new/model.pt',
                        type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")

    ## Other parameters
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length",
                        default=256,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")

    parser.add_argument("--eval_batch_size",
                        default=750,
                        type=int,
                        help="Total batch size for eval.")

    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    bert_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2, type_vocab_size=2)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    test_dataset = UbuntuDatasetForRoberta(
        file_path=os.path.join(args.data_dir, "test.txt"),
        max_seq_length=args.max_seq_length,
        tokenizer=tokenizer
    )
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=args.eval_batch_size,
                                                sampler=SequentialSampler(test_dataset), num_workers=8)

    state_dict = torch.load(args.init_checkpoint, map_location='cpu')
    model = RobertaForSequenceClassification.from_pretrained(args.init_checkpoint, config=bert_config)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)


    logger.info("***** Running testing *****")
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    f = open(os.path.join(args.output_dir, 'logits_test.txt'), 'w')

    model.eval()
    test_loss = 0
    nb_test_steps, nb_test_examples = 0, 0
    for input_ids, segment_ids, label_ids in tqdm(test_dataloader, desc="Step"):
        input_ids = input_ids.to(device)
        segment_ids = segment_ids.to(device)

        with torch.no_grad():
            tmp_test_loss, logits = model(input_ids, token_type_ids=segment_ids, labels=label_ids)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()

        for logit, label in zip(logits, label_ids):
            logit = '{},{}'.format(logit[0], logit[1])
            f.write('_\t{}\t{}\n'.format(logit, label))

        test_loss += tmp_test_loss.mean().item()

        nb_test_examples += input_ids.size(0)
        nb_test_steps += 1

    f.close()
    test_loss = test_loss / nb_test_steps
    result = evaluate(os.path.join(args.output_dir, 'logits_test.txt'))
    result.update({'test_loss':test_loss})

    output_eval_file = os.path.join(args.output_dir, "results_test.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Test results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
Exemple #7
0
def main():
    parser = ArgumentParser()
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default='roberta', type=str, required=True,
                        help="Model type selected in the list")
    parser.add_argument("--model_name_or_path", default='roberta-large', type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: ")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--comment", default='', type=str,
                        help="The comment")
    parser.add_argument('--output_dir', type=Path, default="output")
    parser.add_argument("--restore", type=bool, default=True, help="Whether restore from the last checkpoint, is nochenckpoints, start from scartch")

    parser.add_argument("--max_seq_length", type=int, default=256, help="max lenght of token sequence")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", type=bool, default=False,
                        help="Rul evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--adapter_transformer_layers", default=2, type=int,
                        help="The transformer layers of adapter.")
    parser.add_argument("--adapter_size", default=768, type=int,
                        help="The hidden size of adapter.")
    parser.add_argument("--adapter_list", default="0,11,23", type=str,
                        help="The layer where add an adapter")
    parser.add_argument("--adapter_skip_layers", default=0, type=int,
                        help="The skip_layers of adapter according to bert layers")
    parser.add_argument('--meta_adapter_model', type=str, help='the pretrained adapter model')

    parser.add_argument("--per_gpu_train_batch_size", default=32, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps',type=int,default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=3e-5,type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps', type=int, default=10,
                        help="How often do we snapshot losses, for inclusion in the progress dump? (0 = disable)")
    parser.add_argument('--save_steps', type=int, default=1000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument('--eval_steps', type=int, default=None,
                        help="eval every X updates steps.")
    parser.add_argument('--max_save_checkpoints', type=int, default=500,
                        help="The max amounts of checkpoint saving. Bigger than it will delete the former checkpoints")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    parser.add_argument('--negative_sample', type=int, default=0, help='how many negative samples to select')

    # args
    args = parser.parse_args()

    args.adapter_list = args.adapter_list.split(',')
    args.adapter_list = [int(i) for i in args.adapter_list]

    name_prefix = 'maxlen-'+str(args.max_seq_length)+'_'+'epoch-'+str(args.num_train_epochs)+'_'+'batch-'+str(args.per_gpu_train_batch_size)+'_'+'lr-'+str(args.learning_rate)+'_'+'warmup-'+str(args.warmup_steps)+'_'+str(args.comment)
    args.my_model_name = args.task_name+'_'+name_prefix
    args.output_dir = os.path.join(args.output_dir, args.my_model_name)

    if args.eval_steps is None:
        args.eval_steps = args.save_steps*2

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    args.output_mode = output_modes[args.task_name]

    processor = processors[args.task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    config = RobertaConfig.from_pretrained('roberta-large', output_attentions=True)

    pretrained_model = PretrainedModel()
    adapter_model = AdapterModel(args, pretrained_model.config)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    pretrained_model.to(args.device)
    adapter_model.to(args.device)
    model = (pretrained_model, adapter_model)

    logger.info("Training/evaluation parameters %s", args)
    val_dataset = load_and_cache_examples(args, args.task_name, tokenizer, 'dev', evaluate=True)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, 'train', evaluate=False)

        global_step, tr_loss = train(args, train_dataset, val_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        model_to_save = model.module if hasattr(adapter_model, 'module') else adapter_model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default='/hdd/lujunyu/dataset/multi_turn_corpus/ubuntu/',
                        type=str,
                        required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name",
                        default='ubuntu',
                        type=str,
                        required=False,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default='/hdd/lujunyu/model/chatbert/check/',
                        type=str,
                        required=False,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--data_augmentation",
                        default=False,
                        action='store_true',
                        help="Whether to use augmentation")
    parser.add_argument("--max_seq_length",
                        default=256,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test",
                        default=True,
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--train_batch_size",
                        default=400,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=100,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=20.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps",
                        default=0.0,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=1e-3,
                        type=float,
                        help="weight_decay")
    parser.add_argument("--save_checkpoints_steps",
                        default=3125,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=5,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
            args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        if args.do_train:
            raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    if args.data_augmentation:
        train_dataset = UbuntuDatasetForRoberta(
            file_path=os.path.join(args.data_dir, "train_augment_ubuntu.txt"),
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer
        )
    else:
        train_dataset = UbuntuDatasetForRoberta(
            file_path=os.path.join(args.data_dir, "train.txt"),
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer
        )
    eval_dataset = UbuntuDatasetForRoberta(
        file_path=os.path.join(args.data_dir, "valid.txt"),  ### TODO:change
        max_seq_length=args.max_seq_length,
        tokenizer=tokenizer
    )

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size,
                                                sampler=RandomSampler(train_dataset), num_workers=8)
    eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=args.eval_batch_size,
                                                sampler=SequentialSampler(eval_dataset), num_workers=8)

    model = RobertaForSequenceClassification.from_pretrained('roberta-base',config=bert_config)
    model.to(device)

    num_train_steps = None
    if args.do_train:
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        # remove pooler, which is not used thus it produce None grad that break apex
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay}, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0}]

        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_steps)
    else:
        optimizer = None
        scheduler = None

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name, param.data)

    global_step = 0
    best_metric = 0.0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, label_ids = batch
                loss, _ = model(input_ids, labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()    # We have accumulated enought gradients
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

                if (step + 1) % args.save_checkpoints_steps == 0:
                    model.eval()
                    f = open(os.path.join(args.output_dir, 'logits_dev.txt'), 'w')
                    eval_loss = 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    logits_all = []
                    for input_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss, logits = model(input_ids, labels=label_ids)

                        logits = logits.detach().cpu().numpy()
                        logits_all.append(logits)
                        label_ids = label_ids.cpu().numpy()

                        for logit, label in zip(logits, label_ids):
                            logit = '{},{}'.format(logit[0], logit[1])
                            f.write('_\t{}\t{}\n'.format(logit, label))

                        eval_loss += tmp_eval_loss.mean().item()

                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    f.close()
                    logits_all = np.concatenate(logits_all,axis=0)
                    eval_loss = eval_loss / nb_eval_steps

                    result = evaluate(os.path.join(args.output_dir, 'logits_dev.txt'))
                    result.update({'eval_loss': eval_loss})

                    output_eval_file = os.path.join(args.output_dir, "eval_results_dev.txt")
                    with open(output_eval_file, "a") as writer:
                        logger.info("***** Eval results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))

                    ### Save the best checkpoint
                    if best_metric < result['R10@1'] + result['R10@2']:
                        try:  ### Remove 'module' prefix when using DataParallel
                            state_dict = model.module.state_dict()
                        except AttributeError:
                            state_dict = model.state_dict()
                        torch.save(state_dict, os.path.join(args.output_dir, "model.pt"))
                        best_metric = result['R10@1'] + result['R10@2']
                        logger.info('Saving the best model in {}'.format(os.path.join(args.output_dir, "model.pt")))

                        ### visualize bad cases of the best model
                        logger.info('Saving Bad cases...')
                        visualize_bad_cases(
                            logits=logits_all,
                            input_file_path=os.path.join(args.data_dir, 'valid.txt'),
                            output_file_path=os.path.join(args.output_dir, 'valid_bad_cases.txt')
                        )

                    model.train()