Example #1
def test_add_file(self):
    fn = os.path.join(self.tmpdir, 'log.txt')
    logger.add_file(fn)
    logger.info(self.msg)
    with open(fn, 'r') as f:
        line = ''.join([l for l in f])
        print(line)
    self.assertTrue(self.msg in line)
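
Below is a minimal, self-contained sketch of the same pattern, not part of the original test: it assumes fastNLP is installed and exposes `logger` with the `add_file` method used above; the directory and message are made up for illustration.

import os
import tempfile

from fastNLP import logger

# Hedged sketch: attach a file handler, emit a message, check it landed.
tmpdir = tempfile.mkdtemp()
log_path = os.path.join(tmpdir, 'log.txt')

logger.add_file(log_path)            # also write log records to log_path
logger.info('hello from add_file')   # goes to the console and the file

with open(log_path, 'r') as f:
    content = f.read()
assert 'hello from add_file' in content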
Example #2
def init_prog(args):
    set_seed(args.seed)

    args.log_path = os.path.join(args.save_dir, "log", args.exp_name)
    args.save_path = os.path.join(args.save_dir, "cp", args.exp_name)
    args.tb_path = os.path.join(args.save_dir, "tb", args.exp_name)
    os.makedirs(args.log_path, exist_ok=True)
    os.makedirs(args.save_path, exist_ok=True)
    os.makedirs(args.tb_path, exist_ok=True)

    if args.evaluate:
        logger.add_file(os.path.join(args.log_path, "eval.log"))
    else:
        logger.add_file(os.path.join(args.log_path, "train.log"))
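
A hedged usage sketch for init_prog (not from the original source): the attribute names come from the function body above, while the concrete values, and the assumption that set_seed and logger are importable as in the example, are illustrative only.

from types import SimpleNamespace

# Illustrative arguments; any object with these attributes would do.
args = SimpleNamespace(seed=42, save_dir='./runs', exp_name='baseline',
                       evaluate=False)
init_prog(args)
# -> creates ./runs/log/baseline, ./runs/cp/baseline, ./runs/tb/baseline
#    and attaches ./runs/log/baseline/train.log to the logger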
Example #3

if args.ff_dropout_2 < 0:
    args.ff_dropout_2 = args.ff_dropout

if over_all_dropout > 0:
    args.embed_dropout = over_all_dropout
    args.output_dropout = over_all_dropout
    args.pre_dropout = over_all_dropout
    args.post_dropout = over_all_dropout
    args.ff_dropout = over_all_dropout
    args.attn_dropout = over_all_dropout

if args.lattice and args.use_rel_pos and args.update_every == 1:
    args.train_clip = True

now_time = get_peking_time()
logger.add_file('log/{}'.format(now_time), level='info')
if args.test_batch == -1:
    args.test_batch = args.batch // 2
fitlog.add_hyper(now_time, 'time')
if args.debug:
    # args.dataset = 'toy'
    pass

if args.device != 'cpu':
    assert args.device.isdigit()
    device = torch.device('cuda:{}'.format(args.device))
else:
    device = torch.device('cpu')

refresh_data = True
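
The device handling above follows a common pattern: a numeric string selects a CUDA device, anything else falls back to CPU. A small stand-alone sketch of the same idea (the helper name is made up):

import torch

def select_device(device_arg):
    # device_arg is either 'cpu' or a GPU index given as a string, e.g. '0'
    if device_arg != 'cpu':
        assert device_arg.isdigit(), "expected 'cpu' or a GPU index string"
        return torch.device('cuda:{}'.format(device_arg))
    return torch.device('cpu')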
Example #4
if over_all_dropout > 0:
    args.embed_dropout = over_all_dropout
    args.output_dropout = over_all_dropout
    args.pre_dropout = over_all_dropout
    args.post_dropout = over_all_dropout
    args.ff_dropout = over_all_dropout
    args.attn_dropout = over_all_dropout

if args.lattice and args.use_rel_pos:
    args.train_clip = True

# fitlog.commit(__file__, fit_msg='absolute position now uses the new version')
# fitlog.set_log_dir('../output/logs')
now_time = get_peking_time()
logger.add_file(f'../output/logs/{args.dataset}_{args.status}/bert{args.use_bert}_scheme{args.new_tag_scheme}'
                f'_ple{args.ple_channel_num}_plstm{int(args.use_ple_lstm)}_trainrate{args.train_dataset_rate}/{now_time}.log', level='info')
logger.info('Arguments')
for arg in vars(args):
    logger.info("{}: {}".format(arg, getattr(args, arg)))

# fitlog.add_hyper(now_time, 'time')
if args.debug:
    # args.dataset = 'toy'
    pass

if args.device != 'cpu':
    assert args.device.isdigit()
    device = torch.device('cuda:{}'.format(args.device))
else:
    device = torch.device('cpu')
Example #5
    logger.addHandler(file_handler)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(log_formatter)
    logger.addHandler(console_handler)
    if options.local_rank is None or options.local_rank == 0:
        logger.setLevel(logging.INFO)
    else:
        logger.setLevel(logging.WARNING)
    return logger


# ===-----------------------------------------------------------------------===
# Set up logging
# ===-----------------------------------------------------------------------===
# logger = init_logger()
logger.add_file("{}/info.log".format(root_dir), "INFO")
logger.setLevel(logging.INFO if dist.get_rank() == 0 else logging.WARNING)

# ===-----------------------------------------------------------------------===
# Log some stuff about this run
# ===-----------------------------------------------------------------------===
logger.info(" ".join(sys.argv))
logger.info("")
logger.info(options)

if options.debug:
    logger.info("DEBUG MODE")
    options.num_epochs = 2
    options.batch_size = 20

random.seed(options.python_seed)
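
The rank check above is the usual way to keep multi-GPU logs readable: only rank 0 logs at INFO, the other workers are raised to WARNING. A minimal sketch of that pattern with the standard logging module (the rank parameter stands in for torch.distributed.get_rank() and the logger name is made up):

import logging
import sys

def init_rank_logger(rank, log_file):
    # Quiet every process except rank 0, using plain `logging`.
    logger = logging.getLogger('train')
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # Only the main process logs at INFO; other ranks only surface warnings.
    logger.setLevel(logging.INFO if rank == 0 else logging.WARNING)
    return logger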
Example #6
def main():

    parser = argparse.ArgumentParser()
    arg_options.add_path_options(parser)
    arg_options.add_para_options(parser)
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'

    n_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1

    print("num gpus: {}".format(n_gpus))
    is_distributed = n_gpus > 1
    if is_distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group('nccl', init_method='env://')
        args.world_size = dist.get_world_size()
        args.local_rank = int(args.local_rank)
        # synchronize()

    # Setup logging
    log_file_path = os.path.join(
        args.log_output_dir, 'log-{}.txt'.format(
            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))
    init_logger_dist()
    logger.add_file(log_file_path, level='INFO')
    # logging.basicConfig(
    #     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    #     datefmt="%m/%d/%Y %H:%M:%S",
    #     level=logging.INFO,
    # )
    # logging_fh = logging.FileHandler(log_file_path)
    # logging_fh.setLevel(logging.DEBUG)
    # logger.addHandler(logging_fh)

    args.test_logging_name = log_file_path.split('/')[-1].split(
        '.')[0].replace('log-', '')
    print(log_file_path.split('/')[-1].split('.')[0].replace('log-', ''))

    # cpt prep data load
    if args.local_rank == 0:
        print("Load prep data...\n")
    cpt2words, cpt2id, et2id, sememes, cpt_tree = load_annotated_concepts_new()
    verb_cpts = load_verb_concepts()
    CptEmb = CptEmbedding(sememes, cpt2words, cpt2id, et2id, cpt_tree,
                          args.cpt_max_num, args.random_cpt_num)

    word2cpt_ids = CptEmb.word2cpt_idx
    verb_cpt_ids = [cpt2id[cc] for cc in verb_cpts]
    sememe2id = CptEmb.sememe2id

    cpt_vec = torch.load(
        CptEmb.cpt_vec_in_bert_file
    )[:34442]  # [:10907]   # [:34442]    # 34443 * 768, padding index = 34442
    logger.info("cpt embedding file: {}".format(CptEmb.cpt_vec_in_bert_file))
    logger.info("cpt vec length: {}".format(len(cpt_vec)))

    et2cpts = CptEmb.et2cpts
    cpt2center_sem = CptEmb.cpt2center_sem
    cpt_id2center_sem_id = {
        cpt2id[cc]: sememe2id[sem]
        for cc, sem in cpt2center_sem.items()
    }

    id2cpt = {idx: cc for cc, idx in cpt2id.items()}
    id2et = {id: et for et, id in et2id.items()}
    anno_cpt2et = defaultdict(list)
    et_id2cpt_ids = defaultdict(list)
    for et, cpts in et2cpts.items():
        # print(self.et_id2cpt_ids)
        for cc in cpts:
            anno_cpt2et[cc].append(et)
            et_id2cpt_ids[et2id[et]].append(cpt2id[cc])
    cpt_id2et_id = {
        cpt2id[cc]: [et2id[et] for et in ets]
        for cc, ets in anno_cpt2et.items()
    }

    args.cpt_num = CptEmb.cpt_num
    logger.info("cpt nums: {}\n".format(args.cpt_num))
    logger.info("HowNet words cnt: {}".format(len(word2cpt_ids)))

    # pred DataSet

    train_samples = MyData(args, 'train', args.world_size, args.local_rank)
    dev_samples = MyData(args, 'dev', args.world_size, args.local_rank)
    dev_ace_samples = MyTestData(
        args,
        os.path.join(config.cached_data_dir, "cached_devACE_fixed_samples"),
        args.local_rank)
    logger.info("rank {} / {} load dataset with length: {}.".format(
        args.local_rank, args.world_size, len(train_samples)))
    test_ace_samples = None
    # ************** train data ************************
    train_sampler = DistributedSampler(train_samples,
                                       rank=args.local_rank,
                                       num_replicas=args.world_size)
    train_loader = DataLoader(train_samples,
                              batch_size=args.per_gpu_train_batch_size,
                              pin_memory=True,
                              sampler=train_sampler,
                              num_workers=args.num_workers,
                              collate_fn=train_samples.collate_fn)
    # ************** dev data ************************
    dev_loader = DataLoader(dev_samples,
                            batch_size=args.per_gpu_dev_batch_size,
                            collate_fn=dev_samples.collate_fn)
    dev_ace_loader = DataLoader(dev_ace_samples,
                                batch_size=args.per_gpu_eval_batch_size,
                                collate_fn=dev_ace_samples.collate_fn)
    # ************** test data ************************
    # self.test_loader = DataLoader(test_ace_samples, batch_size=args.per_gpu_eval_batch_size,
    #                               collate_fn=test_ace_samples.collate_fn)

    # ************** init model ***************************
    tokenizer = BertTokenizer.from_pretrained(args.bert_model_dir)
    bert_config = BertConfig.from_pretrained(args.bert_model_dir)
    bert_config.is_decoder = False
    cpt_model = commonCptODEE(args, bert_config, cpt_vec, len(cpt_vec[0]))

    # pred Trainer
    trainer = Trainer(args=args,
                      train_samples=train_loader,
                      dev_samples=dev_loader,
                      dev_ace_samples=dev_ace_loader,
                      test_ace_samples=None,
                      cpt_model=cpt_model,
                      id2cpt=id2cpt,
                      id2et=id2et,
                      cpt_id2et_id=cpt_id2et_id)
    trainer.train()
Example #7
from fastNLP.core.trainer import Trainer
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.embeddings import StaticEmbedding
from reproduction.text_classification.model.dpcnn import DPCNN
from fastNLP.core.sampler import BucketSampler
from fastNLP.core import LRScheduler
from fastNLP.core.const import Const as C
from fastNLP.core.vocabulary import VocabularyOption
from utils.util_init import set_rng_seeds
from fastNLP import logger
import os
from fastNLP.io import YelpFullPipe, YelpPolarityPipe

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# hyper
logger.add_file('log', 'INFO')


class Config():
    seed = 12345
    model_dir_or_name = "dpcnn-yelp-f"
    embedding_grad = True
    train_epoch = 30
    batch_size = 100
    task = "yelp_f"
    #datadir = 'workdir/datasets/SST'
    # datadir = 'workdir/datasets/yelp_polarity'
    datadir = 'workdir/datasets/yelp_full'
    #datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
    datafile = {"train": "train.csv", "test": "test.csv"}
    lr = 1e-3