def multi_train(args):
    init_logger()
    gpu_number = args.world_size
    mp = torch.multiprocessing.get_context('spawn')

    # Create an error queue and a handler thread to listen for errors
    # raised in the child processes.
    error_queue = mp.SimpleQueue()
    error_handler = ErrorHandler(error_queue)

    # Train with multiprocessing: one process per GPU.
    procs = []
    for i in range(gpu_number):
        device_id = i
        procs.append(
            mp.Process(target=run,
                       args=(args, device_id, error_queue),
                       daemon=True))
        procs[i].start()
        logger.info("Starting process pid: {:d}".format(procs[i].pid))
        error_handler.add_child(procs[i].pid)
    for p in procs:
        p.join()
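# For reference, a minimal worker compatible with the `target=run` call in
# multi_train might look like the sketch below. This is an assumption, not
# the project's actual `run`; it only illustrates the expected contract:
# bind the device, train, and report any traceback through error_queue.
def run(args, device_id, error_queue):
    try:
        torch.cuda.set_device(device_id)  # bind this process to its GPU
        single_train(args, device_id)
    except KeyboardInterrupt:
        pass  # the parent process handles shutdown
    except Exception:
        import traceback
        error_queue.put((device_id, traceback.format_exc()))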
def create_app():
    load_dotenv()
    login_manager = LoginManager()
    _app = Flask(__name__)
    config = os.environ['APP_SETTINGS']
    _app.config.from_object(config)
    login_manager.init_app(_app)
    CORS(_app, supports_credentials=True, origins=os.environ['CORS_ORIGINS'])
    init_logger(_app)
    info_log("starting app with config: {}".format(config))

    # Set up the scheduler for running the invoice generation job.
    # scheduler = APScheduler()
    # scheduler.api_enabled = True
    # scheduler.init_app(_app)
    # scheduler.start()
    # scheduler.add_job(id='invoice_task_id',
    #                   func=generate_invoices_job(_app),
    #                   trigger='interval',
    #                   seconds=30)  # TODO: change this to something less frequent

    # Set up the user loader for flask_login.
    @login_manager.user_loader
    def load_user(user_id):
        return User.query.get(user_id)

    return _app
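# Hypothetical entry point showing how the factory above is typically
# used; the host and port arguments are illustrative only.
if __name__ == "__main__":
    app = create_app()
    app.run(host="0.0.0.0", port=5000)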
def save_json(save_path, file_id, samples):
    init_logger()
    for i, sample in enumerate(samples):
        save_ = os.path.join(save_path, "{:s}_{:d}.json".format(file_id, i))
        with open(save_, 'w') as file:
            json.dump(sample, file)
        logger.info("{:s} saved at {:s}".format(save_, save_path))
def write_mapping(params):
    init_logger(
        "/sdc/xli/Datasets/cnn_daily/data_nsp/shard/mapping/mapping.log")
    paths, save_file = params
    with open(save_file, 'w') as file:
        for path in paths:
            file.write(path + "\n")
    logger.info("{:d} file paths written to the mapping file".format(
        len(paths)))
def shard(self):
    init_logger()

    def check_file_exists(root_path):
        # Despite its name, this helper deletes every existing JSON file
        # under root_path so stale shards are not mixed with new ones.
        for f in glob.glob(os.path.join(root_path, "*.json")):
            file_path = pathlib.Path(f)
            if file_path.exists():
                os.unlink(file_path)

    pairs_train_mapping, pairs_test_mapping = \
        self.args.pairs_train_mapping, self.args.pairs_test_mapping
    train_files, test_files = map(
        self.read_mapping, (pairs_train_mapping, pairs_test_mapping))
    divided_corpus = {'train': train_files, 'test': test_files}

    # Delete all files under the save_path before writing.
    check_file_exists(self.args.save_path)

    pool = Pool(mp.cpu_count())
    for corpus_type in ['train', 'test']:
        files = divided_corpus.get(corpus_type)
        dataset = []
        file_no = 0
        for d in pool.imap_unordered(self.load_pairs, files):
            if d is None:
                continue
            dataset.append(d)
            if len(dataset) > self.args.shard_size:
                pt_file = os.path.join(
                    self.args.save_path,
                    "{:s}/cd_{:s}_{:d}.json".format(corpus_type, corpus_type,
                                                    file_no))
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                logger.info("cd_{:s}_{:d}.json saved at {:s}/{:s}.".format(
                    corpus_type, file_no, self.args.save_path, corpus_type))
                file_no += 1
                dataset = []
        # Flush the last partial shard.
        if len(dataset) > 0:
            pt_file = os.path.join(
                self.args.save_path,
                "{:s}/cd_{:s}_{:d}.json".format(corpus_type, corpus_type,
                                                file_no))
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
            file_no += 1
    pool.close()
    pool.join()
    logger.info("Shard task is finished!")
def shard(self):
    init_logger("/sdc/xli/Datasets/cnn_daily/data_nsp/logs/shard.log")
    pairs_train_mapping, pairs_test_mapping, pairs_valid_mapping = \
        self.args.pairs_train_mapping, self.args.pairs_test_mapping, \
        self.args.pairs_valid_mapping
    # train_files, test_files, valid_files = map(
    #     self.read_mapping,
    #     (pairs_train_mapping, pairs_test_mapping, pairs_valid_mapping))
    train_files = self.read_mapping(pairs_train_mapping)
    test_files = self.read_mapping(pairs_test_mapping)
    valid_files = self.read_mapping(pairs_valid_mapping)
    divided_corpus = {
        'train': train_files,
        'test': test_files,
        'valid': valid_files
    }

    pool = Pool(mp.cpu_count())
    for corpus_type in ['train', 'test', 'valid']:
        files = divided_corpus.get(corpus_type)
        dataset = []
        file_no = 0
        for d in pool.imap_unordered(self.load_pairs, files):
            if d is None:
                continue
            dataset.append(d)
            if len(dataset) >= self.args.shard_size:
                pt_file = os.path.join(
                    self.args.save_path,
                    "{:s}/cd_{:s}_{:d}.json".format(corpus_type, corpus_type,
                                                    file_no))
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                logger.info("{:s} saved at {:s}/{:s}".format(
                    pt_file.split("/")[-1], self.args.save_path, corpus_type))
                file_no += 1
                dataset = []
        # Flush the last partial shard.
        if len(dataset) > 0:
            pt_file = os.path.join(
                self.args.save_path,
                "{:s}/cd_{:s}_{:d}.json".format(corpus_type, corpus_type,
                                                file_no))
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
            logger.info("{:s} saved at {:s}/{:s}".format(
                pt_file.split("/")[-1], self.args.save_path, corpus_type))
            file_no += 1
    pool.close()
    pool.join()
    logger.info("Shard task is finished!")
def delete_tgt():
    init_logger()
    root_path = "/sdc/xli/Datasets/cnn_daily/tgts"
    for root, dirs, file_list in os.walk(root_path):
        for file in file_list:
            file_path = os.path.join(root, file)
            os.unlink(file_path)
            logger.info("{:s} deleted from {:s}".format(file, root))
    # Note: os.removedirs only removes empty directories, so this assumes
    # root_path contains no nested subdirectories of its own.
    os.removedirs(root_path)
    logger.info("{:s} dir deleted.".format(root_path))
def check_and_delete(self, path):
    init_logger()
    # file_path = pathlib.Path(path)
    # if file_path.exists():
    #     os.unlink(file_path)
    #     logger.info("{:s} deleted".format(path))
    for f in glob.glob(os.path.join(path, "*.json")):
        file_path = pathlib.Path(f)
        if file_path.exists():
            os.unlink(file_path)
            logger.info("{:s} deleted from {:s}".format(f, path))
def save_pair(pairs, coherence, mark, file_id, save_path):
    init_logger()
    if len(pairs) > 0:
        for i, pair in enumerate(pairs):
            pair_dict = {"pair": pair, "coherence": coherence}
            save_file = os.path.join(
                save_path, "{:s}_{:s}_{:d}.json".format(file_id, mark, i))
            with open(save_file, 'w') as file:
                json.dump(pair_dict, file)
            logger.info("{:s} saved".format(save_file))
def tgt_samples(self, params):
    """
    Construct positive tgt samples and negative tgt samples, where a
    negative sample is a randomly shuffled version of a positive one.
    :param params: tuple of (json_file, save_path)
    :return:
    """

    def save_json(save_path, file_id, samples):
        init_logger()
        for i, sample in enumerate(samples):
            save_ = os.path.join(save_path,
                                 "{:s}_{:d}.json".format(file_id, i))
            with open(save_, 'w') as file:
                json.dump(sample, file)
            logger.info("{:s} saved at {:s}".format(save_, save_path))

    json_file, save_path = params
    init_logger()
    _, tgt = self.load_json(json_file)
    file_id = json_file.split("/")[-1].split(".")[0]
    if self.args.min_sents_num <= len(tgt) <= self.args.max_sents_num:
        tgt_ = list(tgt)
        random.seed(66)
        random.shuffle(tgt_)

        # Make sentence pairs; each pair is written to its own file.
        positive_sents = tgt
        positive_pairs = [(positive_sents[i], positive_sents[i + 1])
                          for i in range(len(positive_sents) - 1)]
        negative_sents = tgt_
        negative_pairs = [(negative_sents[i], negative_sents[i + 1])
                          for i in range(len(negative_sents) - 1)]
        positive_samples = [{
            "tgt": pair,
            "coherence": 0
        } for pair in positive_pairs]  # 0 represents coherent
        negative_samples = [{
            "tgt": pair,
            "coherence": 1
        } for pair in negative_pairs]  # 1 represents incoherent
        save_json(save_path, file_id, positive_samples)
        save_json(save_path, file_id + "_r", negative_samples)
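# Toy, self-contained illustration of the adjacent-pair construction used
# in tgt_samples; the sentence strings are placeholders.
import random

tgt = ["s0", "s1", "s2", "s3"]  # a four-sentence target
tgt_ = list(tgt)
random.seed(66)
random.shuffle(tgt_)  # shuffled copy used for negative samples

positive_pairs = [(tgt[i], tgt[i + 1]) for i in range(len(tgt) - 1)]
negative_pairs = [(tgt_[i], tgt_[i + 1]) for i in range(len(tgt_) - 1)]

print(positive_pairs)  # [('s0', 's1'), ('s1', 's2'), ('s2', 's3')]
print(negative_pairs)  # adjacent pairs from the shuffled order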
def _format_to_bert(self, params):
    init_logger(
        "/sdc/xli/Datasets/cnn_daily/data_nsp/logs/_format_to_bert_one_sample.log"
    )
    tokenizer, mapping_file, save_file = params
    logger.info("Processing {:s}".format(mapping_file))
    with open(mapping_file, 'r') as m_file:
        json_paths = [line.strip() for line in m_file.readlines()]
    samples = []
    for json_file in json_paths:
        with open(json_file, 'r') as j_file:
            sample = json.load(j_file)
        pair = sample['pair']
        label = sample['coherence']
        try:
            encode = tokenizer(pair[0],
                               pair[1],
                               return_tensors='pt',
                               is_pretokenized=True)
            if encode['input_ids'].numel() <= self.args.bert_max_position:
                sample_dict = {
                    'input_ids': encode['input_ids'].to('cuda'),
                    'token_type_ids': encode['token_type_ids'].to('cuda'),
                    'attention_mask': encode['attention_mask'].to('cuda')
                }
                samples.append((sample_dict, label))
            else:
                logger.info("Sample too long, skipped (length: {})".format(
                    encode['input_ids'].numel()))
        except ValueError:
            logger.warning("ValueError! Offending data: {}".format(pair))
    torch.save(samples, save_file)
    logger.info("{:s} converted and saved at {:s}".format(
        mapping_file, save_file))
    del samples
    gc.collect()
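# Minimal sketch of consuming a shard written by _format_to_bert. The
# file name is illustrative; each entry is a (tensor dict, label) tuple.
import torch

samples = torch.load("cd_train_0.pt")
for sample_dict, label in samples:
    input_ids = sample_dict['input_ids']            # shape (1, seq_len)
    token_type_ids = sample_dict['token_type_ids']
    attention_mask = sample_dict['attention_mask']
    # feed the three tensors plus the coherence label to the model ...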
def single_train(args, device_id):
    init_logger(args.log_file)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        # Use the specified GPU and re-seed its RNG.
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        torch.backends.cudnn.deterministic = True

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
    else:
        checkpoint = None

    def train_iter_method():
        return DataLoaderBert(load_dataset(args, 'train', shuffle=True),
                              args.batch_size,
                              shuffle=True,
                              is_test=False)

    model = NextSentencePrediction(args, device, checkpoint)
    optim = build_optim(args, model, checkpoint)
    logger.info(model)
    trainer = build_trainer(args, device_id, model, optim)
    trainer.train(train_iter_method, args.train_steps)
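# The seeding block in single_train is duplicated for the GPU branch; a
# helper like the following (a sketch, not part of this codebase) would
# consolidate it without changing behavior.
def set_seed(seed, device_id=-1):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(seed)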
def parse_args(self):
    """Parses arguments and initializes the logger.

    Returns:
        argparse.Namespace -- config arguments
    """
    args = self.parser.parse_args()

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    args.best_model_path = os.path.join(args.save_dir, 'best_model')
    args.last_model_path = os.path.join(args.save_dir, 'last_model')
    args.vocabulary_file = os.path.join(args.save_dir, 'vocabulary.pickle')
    args.model_checkpoints_dir = os.path.join(args.save_dir, 'model_ckpts')
    if not os.path.exists(args.model_checkpoints_dir):
        os.makedirs(args.model_checkpoints_dir)
    if args.tensorboard:
        args.tensorboard_dir = os.path.join(args.save_dir, 'tb_output')
        if not os.path.exists(args.tensorboard_dir):
            os.makedirs(args.tensorboard_dir)

    assert os.path.exists(args.data_dir), \
        f"dataset directory {args.data_dir} does not exist!"
    for file in ['train_file', 'dev_file', 'test_file']:
        if getattr(args, file, None) is not None:
            setattr(args, file,
                    os.path.join(args.data_dir, getattr(args, file, None)))
    if getattr(args, 'log_file', None) is not None:
        args.log_file = os.path.join(args.save_dir, args.log_file)
        assert not os.path.exists(args.log_file), \
            f"log file {args.log_file} already exists!"

    init_logger(root_log_level=getattr(args, 'root_log_level',
                                       logging.DEBUG),
                console_log_level=getattr(args, 'console_log_level',
                                          logging.NOTSET),
                log_file=getattr(args, 'log_file', None),
                log_file_level=getattr(args, 'log_file_level',
                                       logging.NOTSET))
    return args
def parse_args(self):
    """Parses arguments and initializes the logger.

    Returns:
        argparse.Namespace -- config arguments
    """
    cfg = self.parser.parse_args()
    init_logger(root_log_level=getattr(cfg, 'root_log_level',
                                       logging.DEBUG),
                console_log_level=getattr(cfg, 'console_log_level',
                                          logging.NOTSET),
                log_file=getattr(cfg, 'log_file', None),
                log_file_level=getattr(cfg, 'log_file_level',
                                       logging.NOTSET))
    if not os.path.exists(cfg.save_dir):
        os.makedirs(cfg.save_dir)
    if not os.path.exists(cfg.train_model_dir):
        os.makedirs(cfg.train_model_dir)
    return cfg
def _format_to_bert_one_sample(self, params):
    init_logger(
        "/sdc/xli/Datasets/cnn_daily/data_nsp/logs/_format_to_bert_one_sample.log"
    )
    tokenizer, file, save_file, sample_type = params
    with open(file, 'r') as json_file:
        sample = json.load(json_file)
    pair, coherence = sample['pair'], sample['coherence']
    if isinstance(pair, list) and len(pair) > 0 \
            and isinstance(pair[0][0], type(pair[1][0])):
        os.environ["CUDA_VISIBLE_DEVICES"] = "1"
        encode = tokenizer(pair[0],
                           pair[1],
                           return_tensors='pt',
                           is_pretokenized=True)
        if encode['input_ids'].numel() <= self.args.bert_max_position:
            sample_dict = {
                'input_ids': encode['input_ids'],
                'token_type_ids': encode['token_type_ids'],
                'attention_mask': encode['attention_mask']
            }
            sample_tuple = (sample_dict, coherence)
            torch.save(sample_tuple, save_file)
            logger.info("{:s} converted and saved at {:s}".format(
                file, save_file))
            # Move the processed source file out of the input directory.
            file_name = file.split("/")[-1]
            dst_file = os.path.join(
                "/sdc/xli/Datasets/cnn_daily/data_nsp/pts_and_back/processed",
                "{:s}/{:s}".format(sample_type, file_name))
            shutil.move(file, dst_file)
            logger.info("{:s} moved to {:s}".format(file_name, dst_file))
    gc.collect()
parser.add_argument('-report_every', default=1, type=int)

args = parser.parse_args()

# gpu_ranks is the list of GPU ranks, one per visible GPU.
args.gpu_ranks = list(range(len(args.visible_gpus.split(','))))
print("args.gpu_ranks: {}".format(args.gpu_ranks))
args.world_size = len(args.gpu_ranks)
print("args.world_size: {}".format(args.world_size))
print("args.visible_gpus: {}".format(args.visible_gpus))
"""
Ways to select GPUs for training in PyTorch:
- 1. Set directly in the terminal:
    >>> CUDA_VISIBLE_DEVICES=1 python train_scripts.py
- 2. Set in Python code (the officially recommended method):
    >>> import os
    >>> os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
- 3. Use torch.cuda.set_device():
    >>> import torch
    >>> torch.cuda.set_device(gpu_id)
"""
os.environ['CUDA_VISIBLE_DEVICES'] = args.visible_gpus
print("Current id: {}".format(torch.cuda.current_device()))
init_logger(args.log_file)

device = 'cpu' if args.visible_gpus == "-1" else 'cuda'
device_id = 0 if device == 'cuda' else -1

if args.mode == 'train':
    train(args, device_id)
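# General CUDA caveat (not specific to this script): CUDA_VISIBLE_DEVICES
# is read when the CUDA context is first initialized, so it must be set
# before the first call that touches the GPU.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"  # set this first

import torch
print(torch.cuda.device_count())  # now reports only GPUs 2 and 3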
def save(self, data):
    init_logger()
    with open(self.args.save_file, 'a+') as file:
        json.dump(data, file)
    logger.info("appended a record to {:s}".format(self.args.save_file))