Example #1
def multi_train(args):
    init_logger()

    gpu_number = args.world_size
    mp = torch.multiprocessing.get_context('spawn')

    # Create a thread to listen for errors in the child processes.
    error_queue = mp.SimpleQueue()
    error_handler = ErrorHandler(error_queue)

    # Train with multiprocessing
    procs = []
    for i in range(gpu_number):
        device_id = i
        procs.append(
            mp.Process(target=run,
                       args=(
                           args,
                           device_id,
                           error_queue,
                       ),
                       daemon=True))
        procs[i].start()
        logger.info("Starting process pid: {:d} ".format(procs[i].pid))
        error_handler.add_child(procs[i].pid)
    for p in procs:
        p.join()
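
The run target and the ErrorHandler class used above are not part of this listing. A minimal sketch of the usual pattern, assuming the worker simply wraps per-GPU training (for example the single_train shown in Example #12) and forwards crashes through error_queue; the names and bodies below are assumptions, not the original implementation:

import os
import signal
import threading
import traceback

def run(args, device_id, error_queue):
    # Hypothetical worker: train on one GPU and report failures to the parent.
    try:
        single_train(args, device_id)
    except KeyboardInterrupt:
        pass  # the parent process decides when to stop
    except Exception:
        error_queue.put((device_id, traceback.format_exc()))

class ErrorHandler(object):
    # Hypothetical handler: a background thread waits for a traceback from any
    # child process and then interrupts the remaining children.
    def __init__(self, error_queue):
        self.error_queue = error_queue
        self.children_pids = []
        threading.Thread(target=self.error_listener, daemon=True).start()

    def add_child(self, pid):
        self.children_pids.append(pid)

    def error_listener(self):
        device_id, trace = self.error_queue.get()
        logger.error("Child process for device %d failed:\n%s", device_id, trace)
        for pid in self.children_pids:
            os.kill(pid, signal.SIGINT)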
Example #2
def create_app():
  load_dotenv()
  login_manager = LoginManager()
  _app = Flask(__name__)
  config = os.environ['APP_SETTINGS']
  _app.config.from_object(config)
  login_manager.init_app(_app)
  CORS(_app, supports_credentials=True, origins=os.environ['CORS_ORIGINS'])
  init_logger(_app)
  info_log("starting app with config: {}".format(config))

  # set up the scheduler for running the invoice generation job
  # scheduler = APScheduler()
  # scheduler.api_enabled = True
  # scheduler.init_app(_app)
  # scheduler.start()
  # scheduler.add_job(id='invoice_task_id',
  #                   func=generate_invoices_job(_app),
  #                   trigger='interval',
  #                   seconds=30)  # TODO: change this to something less frequent

  # set up the user loader for flask_login
  @login_manager.user_loader
  def load_user(user_id):
    return User.query.get(user_id)

  return _app
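
A minimal, hypothetical way to run this factory during development; the run_dev.py name, host, and port are illustrative, and APP_SETTINGS / CORS_ORIGINS are assumed to come from the environment or the .env file read by load_dotenv:

# run_dev.py -- hypothetical entry point, not part of the original application
from app import create_app  # assumed module name

app = create_app()

if __name__ == "__main__":
    app.run(host="127.0.0.1", port=5000, debug=True)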
Example #3
def save_json(save_path, file_id, samples):
    init_logger()
    for i, sample in enumerate(samples):
        save_ = os.path.join(save_path,
                             "{:s}_{:d}.json".format(file_id, i))
        with open(save_, 'w') as file:
            json.dump(sample, file)
        logger.info("{:s} saved at {:s}".format(save_, save_path))
Example #4
def write_mapping(params):
    init_logger(
        "/sdc/xli/Datasets/cnn_daily/data_nsp/shard/mapping/mapping.log")
    paths, save_file = params
    with open(save_file, 'w') as file:
        for path in paths:
            file.write(path + "\n")
        logger.info("{:d} files has write in mapping file".format(len(paths)))
Example #5
    def shard(self):
        init_logger()

        def check_file_exists(root_path):
            for f in glob.glob(os.path.join(root_path, "*.json")):
                file_path = pathlib.Path(f)
                if file_path.exists():
                    os.unlink(file_path)

        pairs_train_mapping, pairs_test_mapping = self.args.pairs_train_mapping, self.args.pairs_test_mapping
        train_files, test_files = map(
            self.read_mapping, (pairs_train_mapping, pairs_test_mapping))

        divided_corpus = {'train': train_files, 'test': test_files}

        # delete all existing files under save_path before writing new shards
        check_file_exists(self.args.save_path)

        pool = Pool(mp.cpu_count())
        for corpus_type in ['train', 'test']:
            files = divided_corpus.get(corpus_type)
            dataset = []
            file_no = 0
            for d in pool.imap_unordered(self.load_pairs, files):
                if d is not None:
                    dataset.append(d)

                    if len(dataset) > self.args.shard_size:
                        pt_file = os.path.join(
                            self.args.save_path,
                            "{:s}/cd_{:s}_{:d}.json".format(
                                corpus_type, corpus_type, file_no))
                        with open(pt_file, 'w') as save:
                            save.write(json.dumps(dataset))

                        logger.info(
                            "cd_{:s}_{:d}.json saved at {:s}/{:s}.".format(
                                corpus_type, file_no, self.args.save_path,
                                corpus_type))
                        file_no += 1
                        dataset = []

                else:
                    continue

            if len(dataset) > 0:
                pt_file = os.path.join(
                    self.args.save_path,
                    "{:s}/cd_{:s}_{:d}.json".format(corpus_type, corpus_type,
                                                    file_no))
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                file_no += 1
        pool.close()
        pool.join()

        print("Shard task is finished!")
Example #6
    def shard(self):
        init_logger("/sdc/xli/Datasets/cnn_daily/data_nsp/logs/shard.log")

        pairs_train_mapping, pairs_test_mapping, pairs_valid_mapping = \
            self.args.pairs_train_mapping, self.args.pairs_test_mapping, self.args.pairs_valid_mapping
        # train_files, test_files, valid_files = map(self.read_mapping, (pairs_train_mapping, pairs_test_mapping, pairs_valid_mapping))
        train_files = self.read_mapping(pairs_train_mapping)
        test_files = self.read_mapping(pairs_test_mapping)
        valid_files = self.read_mapping(pairs_valid_mapping)

        divided_corpus = {
            'train': train_files,
            'test': test_files,
            'valid': valid_files
        }

        pool = Pool(mp.cpu_count())
        for corpus_type in ['train', 'test', 'valid']:
            files = divided_corpus.get(corpus_type)
            dataset = []
            file_no = 0
            for d in pool.imap_unordered(self.load_pairs, files):
                if d is not None:
                    dataset.append(d)

                    if len(dataset) >= self.args.shard_size:
                        pt_file = os.path.join(
                            self.args.save_path,
                            "{:s}/cd_{:s}_{:d}.json".format(
                                corpus_type, corpus_type, file_no))
                        with open(pt_file, 'w') as save:
                            save.write(json.dumps(dataset))

                        logger.info("{:s} has saved at {:s}/{:s}".format(
                            pt_file.split("/")[-1], self.args.save_path,
                            corpus_type))
                        file_no += 1
                        dataset = []

                else:
                    continue

            if len(dataset) > 0:
                pt_file = os.path.join(
                    self.args.save_path,
                    "{:s}/cd_{:s}_{:d}.json".format(corpus_type, corpus_type,
                                                    file_no))
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                logger.info("{:s} has saved at {:s}/{:s}".format(
                    pt_file.split("/")[-1], self.args.save_path, corpus_type))
                file_no += 1
        pool.close()
        pool.join()

        logger.info("Shard task is finished!")
Example #7
def delete_tgt():
    init_logger()
    root_path = "/sdc/xli/Datasets/cnn_daily/tgts"
    for root, dirs, file_list in os.walk(root_path):
        for file in file_list:
            file_path = os.path.join(root, file)
            os.unlink(file_path)
            logger.info("{:s} deleted from {:s}".format(file, root))
    os.removedirs(root_path)
    logger.info("{:s} dir deleted.".format(root_path))
Example #8
    def check_and_delete(self, path):
        init_logger()
        # file_path = pathlib.Path(path)
        # if file_path.exists():
        #     os.unlink(file_path)
        #     logger.info("{:s} deleted".format(path))
        for f in glob.glob(os.path.join(path, "*.json")):
            file_path = pathlib.Path(f)
            if file_path.exists():
                os.unlink(file_path)
                logger.info("{:s} deleted from {:s}".format(f, path))
Example #9
def save_pair(pairs, coherence, mark, file_id, save_path):
    init_logger()
    if len(pairs) > 0:
        for i, pair in enumerate(pairs):
            pair_dict = {"pair": pair, "coherence": coherence}
            save_file = os.path.join(
                save_path,
                "{:s}_{:s}_{:d}.json".format(file_id, mark, i))
            with open(save_file, 'w') as file:
                json.dump(pair_dict, file)
            logger.info("{:s} saved".format(save_file))
Example #10
    def tgt_samples(self, params):
        """
        construct positive tgt sample and negative tgt sample which is a random version of the positive one

        :param json_file:
        :return:
        """
        def save_json(save_path, file_id, samples):
            init_logger()
            for i, sample in enumerate(samples):
                save_ = os.path.join(save_path,
                                     "{:s}_{:d}.json".format(file_id, i))
                with open(save_, 'w') as file:
                    json.dump(sample, file)
                logger.info("{:s} saved at {:s}".format(save_, save_path))

        json_file, save_path = params
        init_logger()
        _, tgt = self.load_json(json_file)

        file_id = json_file.split("/")[-1].split(".")[0]
        if self.args.min_sents_num <= len(tgt) <= self.args.max_sents_num:
            tgt_ = list(tgt)
            random.seed(66)
            random.shuffle(tgt_)

            # make sentence pair and write in a single file
            positive_sents = tgt
            positive_pairs = [(positive_sents[i], positive_sents[i + 1])
                              for i in range(len(positive_sents) - 1)]

            negative_sents = tgt_
            negative_pairs = [(negative_sents[i], negative_sents[i + 1])
                              for i in range(len(negative_sents) - 1)]

            positive_samples = [{
                "tgt": pair,
                "coherence": 0
            } for pair in positive_pairs]  # 0 represents coherent
            negative_samples = [{
                "tgt": pair,
                "coherence": 1
            } for pair in negative_pairs]  # 1 represents incoherent

            save_json(save_path, file_id, positive_samples)
            save_json(save_path, file_id + "_r", negative_samples)
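
tgt_samples takes a (json_file, save_path) tuple, so it can be dispatched with the same Pool pattern used by the shard methods. A hypothetical driver inside the same class; self.args.tgt_path is an assumed source directory:

        # Hypothetical dispatch, mirroring pool.imap_unordered in shard().
        json_files = glob.glob(os.path.join(self.args.tgt_path, "*.json"))
        params = [(f, self.args.save_path) for f in json_files]

        pool = Pool(mp.cpu_count())
        for _ in pool.imap_unordered(self.tgt_samples, params):
            pass
        pool.close()
        pool.join()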
Example #11
    def _format_to_bert(self, params):
        init_logger(
            "/sdc/xli/Datasets/cnn_daily/data_nsp/logs/_format_to_bert_one_sample.log"
        )
        tokenizer, mapping_file, save_file = params

        logger.info("Processing {:s}".format(mapping_file))
        with open(mapping_file, 'r') as m_file:
            json_paths = [line.strip() for line in m_file]

        samples = []
        for json_file in json_paths:
            with open(json_file, 'r') as j_file:
                sample = json.load(j_file)
            pair = sample['pair']
            label = sample['coherence']

            try:
                encode = tokenizer(pair[0],
                                   pair[1],
                                   return_tensors='pt',
                                   is_pretokenized=True)

                if encode['input_ids'].numel() <= self.args.bert_max_position:
                    sample_dict = {
                        'input_ids': encode['input_ids'].to('cuda'),
                        'token_type_ids': encode['token_type_ids'].to('cuda'),
                        'attention_mask': encode['attention_mask'].to('cuda')
                    }
                    samples.append((sample_dict, label))
                else:
                    logger.info("Sample too long, skipped: {} tokens".format(
                        encode['input_ids'].numel()))
            except ValueError:
                logger.warning("ValueError while tokenizing pair: {}".format(pair))

        torch.save(samples, save_file)
        logger.info("{:s} has converted and saved at {:s}".format(
            mapping_file, save_file))

        del samples
        gc.collect()
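
The shard written by _format_to_bert is a list of (inputs, label) tuples. A hedged sketch of how such a shard might be consumed for next-sentence prediction; the model name and the evaluation loop are assumptions, not part of the original code:

from transformers import BertForNextSentencePrediction

# Hypothetical consumer of a shard produced by _format_to_bert.
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased').to('cuda')
model.eval()

samples = torch.load(save_file)  # same save_file path written above
with torch.no_grad():
    for inputs, label in samples:
        nsp_logits = model(**inputs)[0]  # shape (1, 2): coherent vs. incoherent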
Example #12
def single_train(args, device_id):
    init_logger(args.log_file)

    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        # use the specified GPU
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)

    else:
        checkpoint = None

    def train_iter_method():
        return DataLoaderBert(load_dataset(args, 'train', shuffle=True),
                              args.batch_size,
                              shuffle=True,
                              is_test=False)

    model = NextSentencePrediction(args, device, checkpoint)
    optim = build_optim(args, model, checkpoint)

    logger.info(model)

    trainer = build_trainer(args, device_id, model, optim)
    trainer.train(train_iter_method, args.train_steps)
Example #13
    def parse_args(self):
        """Parses arguments and initializes logger

        Returns:
            dict -- config arguments
        """

        args = self.parser.parse_args()

        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)

        args.best_model_path = os.path.join(args.save_dir, 'best_model')
        args.last_model_path = os.path.join(args.save_dir, 'last_model')
        args.vocabulary_file = os.path.join(args.save_dir, 'vocabulary.pickle')
        args.model_checkpoints_dir = os.path.join(args.save_dir, 'model_ckpts')

        if not os.path.exists(args.model_checkpoints_dir):
            os.makedirs(args.model_checkpoints_dir)
        if args.tensorboard:
            args.tensorboard_dir = os.path.join(args.save_dir, 'tb_output')
            if not os.path.exists(args.tensorboard_dir):
                os.makedirs(args.tensorboard_dir)

        assert os.path.exists(args.data_dir), f"dataset directory {args.data_dir} does not exist"
        for file in ['train_file', 'dev_file', 'test_file']:
            if getattr(args, file, None) is not None:
                setattr(args, file, os.path.join(args.data_dir, getattr(args, file, None)))

        if getattr(args, 'log_file', None) is not None:
            args.log_file = os.path.join(args.save_dir, args.log_file)
            assert not os.path.exists(args.log_file), f"log file {args.log_file} already exists"

        init_logger(root_log_level=getattr(args, 'root_log_level', logging.DEBUG),
                    console_log_level=getattr(args, 'console_log_level', logging.NOTSET),
                    log_file=getattr(args, 'log_file', None),
                    log_file_level=getattr(args, 'log_file_level', logging.NOTSET))

        return args
Example #14
    def parse_args(self):
        """This function parses arguments and initializes logger
        
        Returns:
            dict -- config arguments
        """

        cfg = self.parser.parse_args()
        init_logger(root_log_level=getattr(cfg, 'root_log_level',
                                           logging.DEBUG),
                    console_log_level=getattr(cfg, 'console_log_level',
                                              logging.NOTSET),
                    log_file=getattr(cfg, 'log_file', None),
                    log_file_level=getattr(cfg, 'log_file_level',
                                           logging.NOTSET))

        if not os.path.exists(cfg.save_dir):
            os.makedirs(cfg.save_dir)

        if not os.path.exists(cfg.train_model_dir):
            os.makedirs(cfg.train_model_dir)

        return cfg
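
init_logger itself does not appear anywhere in this listing. The sketch below accepts the positional log-file path used in Examples #3 and #12 as well as the keyword arguments used in Examples #13 and #14; the handler setup is an assumption:

import logging

logger = logging.getLogger()

def init_logger(log_file=None, log_file_level=logging.NOTSET,
                root_log_level=logging.DEBUG, console_log_level=logging.NOTSET):
    # Assumed implementation: configure the shared module-level logger once,
    # with a console handler and an optional file handler.
    log_format = logging.Formatter("[%(asctime)s %(levelname)s] %(message)s")
    logger.setLevel(root_log_level)
    logger.handlers = []  # avoid duplicate handlers on repeated calls

    console_handler = logging.StreamHandler()
    console_handler.setLevel(console_log_level)
    console_handler.setFormatter(log_format)
    logger.addHandler(console_handler)

    if log_file is not None:
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(log_file_level)
        file_handler.setFormatter(log_format)
        logger.addHandler(file_handler)

    return logger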
Example #15
    def _format_to_bert_one_sample(self, params):
        init_logger(
            "/sdc/xli/Datasets/cnn_daily/data_nsp/logs/_format_to_bert_one_sample.log"
        )
        tokenizer, file, save_file, sample_type = params
        with open(file, 'r') as json_file:
            sample = json.load(json_file)
        pair, coherence = sample['pair'], sample['coherence']

        if isinstance(pair, list) and len(pair) > 0 \
                and isinstance(pair[0][0], type(pair[1][0])):

            os.environ["CUDA_VISIBLE_DEVICES"] = "1"
            encode = tokenizer(pair[0],
                               pair[1],
                               return_tensors='pt',
                               is_pretokenized=True)
            if encode['input_ids'].numel() <= self.args.bert_max_position:
                sample_dict = {
                    'input_ids': encode['input_ids'],
                    'token_type_ids': encode['token_type_ids'],
                    'attention_mask': encode['attention_mask']
                }
                sample_tuple = (sample_dict, coherence)
                torch.save(sample_tuple, save_file)
                logger.info("{:s} has converted and saved at {:s}".format(
                    file, save_file))

                file_name = file.split("/")[-1]
                dst_file = os.path.join(
                    "/sdc/xli/Datasets/cnn_daily/data_nsp/pts_and_back/processed",
                    "{:s}/{:s}".format(sample_type, file_name))
                shutil.move(file, dst_file)
                logger.info("{:s} has moved to {:s}".format(
                    file_name, dst_file))

            gc.collect()
Example #16
    parser.add_argument('-report_every', default=1, type=int)

    args = parser.parse_args()
    # gpu_ranks: one rank per visible GPU
    args.gpu_ranks = [int(i) for i in range(len(args.visible_gpus.split(',')))]
    print("args.gpu_ranks: {}".format(args.gpu_ranks))
    args.world_size = len(args.gpu_ranks)
    print("args.world_size: {}".format(args.world_size))
    print("args.visible_gpus: {}".format(args.visible_gpus))
    """
    pytorch 指定gpu训练的方式
     - 1. 直接终端中使用:
        >>> CUDA_VISIBLE_DEVICES=1 python train_scripts.py
     - 2. python代码中设定(官方建议使用该种方法)
        >>> import os
        >>> os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
     - 3. use torcu.cuda.set_device()
        >>> import torch
        >>> torch.cuda.set_device(gpu_id)
    """

    os.environ['CUDA_VISIBLE_DEVICES'] = args.visible_gpus
    print("Current id: {}".format(torch.cuda.current_device()))

    init_logger(args.log_file)
    device = 'cpu' if args.visible_gpus == "-1" else 'cuda'
    device_id = 0 if device == 'cuda' else -1

    if args.mode == 'train':
        train(args, device_id)
Example #17
    def save(self, data):
        init_logger()
        # append the record to the configured save file
        with open(self.args.save_file, 'a+') as file:
            json.dump(data, file)
        logger.info("{:s} updated".format(self.args.save_file))