Example #1
    def __init__(self, root_dir, ids_file, mode='train', length=None):
        '''
        root_dir: directory with the tokenized articles stored as json files (gpt2_1024_data)
        ids_file: path to the json file holding the train/valid/test index lists
        mode: one of 'train', 'valid' or 'test'
        length: optional cap on the number of samples; defaults to the full split
        '''
        self.root_dir = root_dir  # folder with the data stored as json files (gpt2_1024_data)
        self.tokenizer = add_special_tokens()
        self.pad = self.tokenizer.encode(self.tokenizer.pad_token)
        self.files = np.sort(
            [x for x in os.listdir(root_dir) if x.endswith('.json')])
        self.mode = mode
        with open(ids_file, 'r') as f:
            self.data = json.load(f)
        if mode == 'train':
            self.idxs = self.data['train_ids']
        elif mode == 'valid':
            self.idxs = self.data['valid_ids']
        else:
            self.idxs = self.data['test_ids']

        if length is None:
            self.len = len(self.idxs)
        else:
            self.len = length
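The example above only shows the dataset constructor. Below is a minimal sketch of the matching __len__/__getitem__ pair, assuming each json file stores 'article' and 'abstract' token-id lists and that a sample is the article, a separator token and the abstract, right-padded to 1024 positions; the field names and padding scheme are assumptions, not code from this page, and torch/os/json are assumed to be imported at module level.

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        # Map the split-local index to the json file index, then load the token ids.
        file_idx = self.idxs[idx]
        with open(os.path.join(self.root_dir, str(file_idx) + '.json'), 'r') as f:
            data = json.load(f)  # assumed keys: 'article', 'abstract'
        sep = self.tokenizer.encode(self.tokenizer.sep_token)
        content = data['article'] + sep + data['abstract']
        text = content + self.pad * (1024 - len(content))  # right-pad to the full context length
        # 'sum_idx' marks the position of the first summary token.
        return {'article': torch.tensor(text), 'sum_idx': len(data['article']) + len(sep)}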
Example #2
    def __init__(self, root_dir, ids_file, mode='train', length=None):
        self.root_dir = root_dir
        self.tokenizer = add_special_tokens()
        with open(ids_file, 'r') as f:
            data = json.load(f)  # read the ids file once and pick the split below
        if mode == 'train':
            self.idxs = data['train_ids']
        elif mode == 'valid':
            self.idxs = data['valid_ids']
        else:
            self.idxs = data['test_ids']
        if length is None:
            self.len = len(self.idxs)
        else:
            self.len = length
Example #3
def main(file_names, directory):
    """ Reads txt files, extract articles and summaries, tokenize them and save as json files
        Args:
            file_names: list, all the articles with total no of tokens less than 1024
            directory: string, directory where files in file_names is stored
    """
    tokenizer = add_special_tokens()
    print("Execution Started...")
    train_ids = []
    file_id_map = {}
    i = 0
    for file in file_names:
        file = os.path.join(os.getcwd(), directory, file)
        with open(file, 'r', encoding='utf-8') as f:
            lines = f.read().split('\n\n')
        article, abstract = get_art_abs(lines)
        article, abstract = tokenizer.encode(article), tokenizer.encode(abstract)
        if len(article) > 0 and len(abstract) > 0: #and (len(article) + len(abstract)) <= 1023:
            if len(article) > 923:
                article = article[:923]
            if len(abstract) > 100:
                abstract = abstract[:100]
            train_ids.append(i)
            write_json(i, article, abstract)
            file_id_map[i] = os.path.basename(file).replace('.story', '')
            i += 1
            if i % 100 == 0:
                print(i, " files written")

    x, y = int(len(train_ids) * 0.8), int(len(train_ids) * 0.9)
    valid_ids = train_ids[x:y]
    test_ids = train_ids[y:]
    train_ids = train_ids[:x]
    with open("ids.json", 'w') as f:
        js = dict()
        js['train_ids'] = train_ids
        js['valid_ids'] = valid_ids
        js['test_ids'] = test_ids
        json.dump(js, f)

    # file_id_map maps the json file ids to actual cnn/dm file names ending with ".story"
    print("saving file_id_map...")
    with open("file_id_map.pickle", 'wb') as f:
        pickle.dump(file_id_map, f)
    print("file_id_map saved.")
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", default=5e-5, type=float, required=False, help="learning rate")
    parser.add_argument("--seed", default=42, type=int, required=False, help="seed to replicate results")
    parser.add_argument("--num_workers", default=4, type=int, required=False, help="num of cpus available")
    parser.add_argument("--device", default=3, required=False, help="torch.device object")
    parser.add_argument("--output_dir", default='./output', type=str, required=True,
                        help="path to save evaluation results")
    parser.add_argument("--model_dir", default='./weights', type=str, required=True, help="path to save trained model")
    parser.add_argument("--root_dir", default='./CNN-DM/gpt2_1024_data', type=str, help="location of json dataset.")
    parser.add_argument("--ids_file", default='./CNN-DM/ids.json', type=str,
                        help="location of train, valid and test file indexes")

    all_args = parser.parse_args()
    dataset = GPT21024Dataset(all_args.root_dir, all_args.ids_file, mode='test')
    tokenizer = add_special_tokens()
    model = GPT2LMHeadModel.from_pretrained(all_args.model_dir)
    all_args.device = torch.device('cuda:'+str(all_args.device))
    model.to(all_args.device)

    test(all_args, model, tokenizer, dataset)
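The test() routine called here is not shown. Below is a rough sketch of greedy summary generation for one dataset sample, assuming the __getitem__ layout sketched under Example #1; the 'article'/'sum_idx' keys and the decoding scheme are assumptions.

def generate_summary(args, model, tokenizer, sample, gen_len=100):
    # Use the article (plus separator) as the prompt and extend it greedily, one token at a time.
    generated = sample['article'][:sample['sum_idx']].unsqueeze(0).to(args.device)
    model.eval()
    with torch.no_grad():
        for _ in range(gen_len):
            logits = model(generated)[0][:, -1, :]              # next-token logits
            next_token = torch.argmax(logits, dim=-1, keepdim=True)
            generated = torch.cat([generated, next_token], dim=1)
    # Decode only the generated continuation, dropping pad/sep tokens.
    return tokenizer.decode(generated[0, sample['sum_idx']:].tolist(), skip_special_tokens=True)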
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", default=5e-5, type=float, required=False, help="learning rate")
    parser.add_argument("--seed", default=42, type=int, required=False, help="seed to replicate results")
    parser.add_argument("--n_gpu", default=1, type=int, required=False, help="no of gpu available")
    parser.add_argument("--gradient_accumulation_steps", default=4, type=int, required=True,
                        help="gradient_accumulation_steps")
    parser.add_argument("--batch_size", default=1, type=int, required=True, help="batch_size")
    parser.add_argument("--num_workers", default=2, type=int, required=False, help="num of cpus available")
    parser.add_argument("--device", default=0, required=False, help="torch.device object")
    parser.add_argument("--num_train_epochs", default=5, type=int, required=True, help="no of epochs of training")
    parser.add_argument("--output_dir", default='./output', type=str, required=True,
                        help="path to save evaluation results")
    parser.add_argument("--model_dir", default='./weights', type=str, required=True, help="path to save trained model")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="max gradient norm.")
    parser.add_argument("--root_dir", default='./CNN-DM/gpt2_1024_data', type=str, help="location of json dataset.")
    parser.add_argument("--ids_file", default='./CNN-DM/ids.json', type=str,
                        help="location of train, valid and test file indexes")
    all_args = parser.parse_args()

    # load the train and validation datasets and the tokenizer
    train_data = GPT21024Dataset(all_args.root_dir, all_args.ids_file, mode='train')
    valid_data = GPT21024Dataset(all_args.root_dir, all_args.ids_file, mode='valid', length=500)
    tokenizer = add_special_tokens()
    ignore_idx = tokenizer.pad_token_id

    # load gpt2-small
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.resize_token_embeddings(len(tokenizer))

    if all_args.n_gpu > 1:
        model = SaveModelDataParallel(model, device_ids=[i for i in range(all_args.n_gpu)])
    all_args.device = torch.device('cuda:' + str(all_args.device))
    model.to(all_args.device)

    train(all_args, model, tokenizer, train_data, valid_data, ignore_idx)
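The train() function invoked above is not reproduced on this page. A compressed sketch of its core loop is shown below, under the same assumptions as the Example #1 sketch: the loss is computed only over the summary span, pad positions are ignored via ignore_index, and batch_size=1 is assumed to match the defaults above.

def train_one_epoch(args, model, train_dl, ignore_idx, optimizer):
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignore_idx)  # pad tokens contribute no loss
    model.train()
    for step, batch in enumerate(train_dl):
        inputs = batch['article'].to(args.device)
        idx = batch['sum_idx'].item()                              # position of the first summary token
        logits = model(inputs)[0]
        shift_logits = logits[..., idx - 1:-1, :].contiguous()     # predictions for the summary span
        shift_labels = inputs[..., idx:].contiguous()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        (loss / args.gradient_accumulation_steps).backward()
        if (step + 1) % args.gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()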
Example #6
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument("--lr",default=5e-5, type=float, required=True, help="learning rate")
	parser.add_argument("--seed",default=42, type=int, required=False, help="seed to replicate results")
	parser.add_argument("--n_gpu",default=1, type=int, required=False, help="no of gpu available")
	parser.add_argument("--gradient_accumulation_steps",default=32, type=int, required=True, help="gradient_accumulation_steps")
	parser.add_argument("--batch_size",default=1, type=int, required=True, help="batch_size")
	parser.add_argument("--num_workers",default=4, type=int, required=False, help="num of cpus available")
	parser.add_argument("--device",default=torch.device('cpu'), required=False, help="torch.device object")
	parser.add_argument("--num_train_epochs",default=1, type=int, required=True, help="no of epochs of training")
	parser.add_argument("--output_dir",default=./output, type=str, required=True, help="path to save evaluation results")
	parser.add_argument("--model_dir",default=./weights, type=str, required=True, help="path to save trained model")
	parser.add_argument("--fp16",default=True, type=bool, required=False, help="whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
	parser.add_argument("--fp16_opt_level",default='O0', type=str, required=False, help="apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3'].")
	parser.add_argument("--max_grad_norm",default=1.0, type=float, help="max gradient norm.")
	parser.add_argument("--root_dir",default='./CNN/gpt2_1024_data', type=str, help="location of json dataset.")
	parser.add_argument("--ids_file",default='./CNN/ids.json', type=str, help="location of train, valid and test file indexes")
	args = parser.parse_args()

	train_data = GPT21024Dataset(args.root_dir,args.ids_file,mode='train',length=3000) #training on only 3000 datasets
	valid_data = GPT21024Dataset(args.root_dir,args.ids_file,mode='valid',length=500)  #validation on only 500 datasets
	tokenizer = add_special_tokens()
	ignore_idx = tokenizer.pad_token_id
	model = GPT2LMHeadModel.from_pretrained('gpt2')
Example #7
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument("--lr",default=5e-5, type=float, required=True, help="learning rate")
	parser.add_argument("--seed",default=42, type=int, required=False, help="seed to replicate results")
	parser.add_argument("--n_gpu",default=1, type=int, required=False, help="no of gpu available")
	parser.add_argument("--gradient_accumulation_steps",default=32, type=int, required=True, help="gradient_accumulation_steps")
	parser.add_argument("--batch_size",default=1, type=int, required=True, help="batch_size")
	parser.add_argument("--num_workers",default=4, type=int, required=False, help="num of cpus available")
	parser.add_argument("--device",default=torch.device('cuda'), required=False, help="torch.device object")
	parser.add_argument("--num_train_epochs",default=5, type=int, required=True, help="no of epochs of training")
	parser.add_argument("--output_dir",default=./output, type=str, required=True, help="path to save evaluation results")
	parser.add_argument("--model_dir",default=./weights, type=str, required=True, help="path to save trained model")
	parser.add_argument("--fp16",default=True, type=bool, required=False, help="whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
	parser.add_argument("--fp16_opt_level",default='O0', type=str, required=False, help="apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3'].")
	parser.add_argument("--max_grad_norm",default=1.0, type=float, help="max gradient norm.")
	parser.add_argument("--root_dir",default='./CNN/gpt2_1024_data', type=str, help="location of json dataset.")
	parser.add_argument("--ids_file",default='./CNN/ids.json', type=str, help="location of train, valid and test file indexes")
	args = parser.parse_args()

	train_data = GPT21024Dataset(args.root_dir,args.ids_file,mode='train',length=3000) #training on only 3000 datasets
	valid_data = GPT21024Dataset(args.root_dir,args.ids_file,mode='valid',length=500)  #validation on only 500 datasets
	tokenizer = add_special_tokens()
	ignore_idx = tokenizer.pad_token_id
	model = GPT2LMHeadModel.from_pretrained('gpt2')
	model.resize_token_embeddings(len(tokenizer))
	model.to(args.device)

	start = time.time()
	train(args, model, tokenizer, train_data, valid_data, ignore_idx)
	print('total time: ', (time.time()-start)/60, " minutes", end='\n\n')

	print('Saving trained model...')
	model_file = os.path.join(args.model_dir, 'model_{}_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.bin'.format(args.fp16_opt_level, 3000, args.num_train_epochs))
	config_file = os.path.join(args.model_dir, 'config_{}_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.json'.format(args.fp16_opt_level, 3000, args.num_train_epochs))
	torch.save(model.state_dict(), model_file)
	model.config.to_json_file(config_file)
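To reuse a checkpoint saved this way, the config and state dict can be loaded back into a fresh GPT2LMHeadModel; a short sketch using the file paths produced above:

from transformers import GPT2Config, GPT2LMHeadModel

# Rebuild the model from the saved config, then load the fine-tuned weights.
config = GPT2Config.from_json_file(config_file)
model = GPT2LMHeadModel(config)
model.load_state_dict(torch.load(model_file, map_location='cpu'))
model.eval()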
Example #8
lr = 5e-5
num_train_epochs = 5
max_grad_norm = 1.0
txt_gen_len = 100
num_workers = 2  # not defined in the original snippet but used by the DataLoaders below; value assumed
ppo_config = {'batch_size': 1, 'forward_batch_size': 1}
batch_size = ppo_config['batch_size']

train_dataset = GPT21024Dataset('CNN/gpt2_1024_data',
                                'CNN/ids.json',
                                mode='train',
                                length=3000)
val_dataset = GPT21024Dataset('CNN/gpt2_1024_data',
                              'CNN/ids.json',
                              mode='valid',
                              length=500)
tokenizer = add_special_tokens()
ignore_idx = tokenizer.pad_token_id

train_sampler = RandomSampler(train_dataset)
train_dl = DataLoader(train_dataset,
                      sampler=train_sampler,
                      batch_size=batch_size,
                      num_workers=num_workers)
val_sampler = RandomSampler(val_dataset)
val_dl = DataLoader(val_dataset,
                    sampler=val_sampler,
                    batch_size=batch_size,
                    num_workers=num_workers)

gpt2_model = GPT2HeadWithValueModel.from_pretrained(
    './weights/partial_masked/')
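GPT2HeadWithValueModel and the batch_size/forward_batch_size dictionary come from the early trl library, which suggests this snippet feeds a PPO fine-tuning loop. A heavily abbreviated sketch of how those pieces are usually wired together follows; the trl import paths, the respond_to_batch signature and the compute_reward scorer are assumptions, not code from this page.

from trl.ppo import PPOTrainer            # assumed early-trl module layout
from trl.gpt2 import respond_to_batch     # samples a continuation from the policy model

device = torch.device('cuda')
gpt2_model_ref = GPT2HeadWithValueModel.from_pretrained('./weights/partial_masked/')  # frozen reference policy
gpt2_model.to(device)
gpt2_model_ref.to(device)
ppo_trainer = PPOTrainer(gpt2_model, gpt2_model_ref, **ppo_config)

for batch in train_dl:
    query = batch['article'][:, :batch['sum_idx'].item()].to(device)   # article + separator as the query
    response = respond_to_batch(gpt2_model, query, txt_len=txt_gen_len)
    reward = torch.tensor([compute_reward(query, response)])           # compute_reward: user-defined scorer (e.g. ROUGE)
    train_stats = ppo_trainer.step(query, response, reward)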
Example #9
def main():
    # used from arguments.py
    args = argparser().parse_args()
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count() if torch.cuda.is_available() and not args.no_cuda else 0
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare our task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_path,
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    add_special_tokens(model, tokenizer, processor)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(tokenizer, "train", args)
        global_step, tr_loss = train(model, tokenizer, train_dataset,
                                     processor, args)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForSequenceClassification.from_pretrained(
            args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    assert not (args.do_test and args.do_eval)
    results = {}
    if (args.do_eval or args.do_test) and args.local_rank in [-1, 0]:
        mode = "dev" if args.do_eval else "test"
        tokenizer = AutoTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate(%s) the following checkpoints: %s", mode,
                    checkpoints)
        for checkpoint in checkpoints:
            logger.info("Checkpoint: %s", checkpoint)
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                "/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelForSequenceClassification.from_pretrained(
                checkpoint)
            model.to(args.device)
            result = evaluate(model,
                              tokenizer,
                              processor,
                              mode,
                              args,
                              prefix=prefix)
            result = dict(
                (k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)
    return results