def main() -> None:
    parser = get_arg_parser()
    args = parser.parse_args()
    device = "cuda" if torch.cuda.is_available() and args.cuda else "cpu"
    print('using device {}'.format(device))

    print('loading vocabulary...')
    if args.small:
        print('using small training set')
        en_vocab = load_vocab(constants.SMALL_TRAIN_EN_VOCAB_FILE)
        fr_vocab = load_vocab(constants.SMALL_TRAIN_FR_VOCAB_FILE)
    else:
        en_vocab = load_vocab(constants.TRAIN_EN_VOCAB_FILE)
        fr_vocab = load_vocab(constants.TRAIN_FR_VOCAB_FILE)
    print('loaded vocabulary')

    print('loading datasets...')
    if args.small:
        train_dataset = d.ShardedCSVDataset(
            constants.WMT14_EN_FR_SMALL_TRAIN_SHARD)
    else:
        train_dataset = d.ShardedCSVDataset(constants.WMT14_EN_FR_TRAIN_SHARD)

    valid_dataset = d.DualFileDataset(
        constants.WMT14_EN_FR_VALID + ".en",
        constants.WMT14_EN_FR_VALID + ".fr",
    )

    train_loader = d.BatchedIterator(
        args.batch_size,
        train_dataset,
        en_vocab,
        fr_vocab,
        args.max_sequence_length,
    )

    valid_loader = d.BatchedIterator(
        1,
        valid_dataset,
        en_vocab,
        fr_vocab,
        args.max_sequence_length,
    )

    model = build_model(parser, en_vocab, fr_vocab)
    model.load_state_dict(torch.load(args.load_path, map_location=device))  # map checkpoint to the active device so CPU-only runs still load

    model = model.eval()
    eval_bleu(
        train_loader=train_loader,
        valid_loader=valid_loader,
        model=model,
        en_vocab=en_vocab,
        fr_vocab=fr_vocab,
        device=device,
        multi_gpu=args.multi_gpu,
    )
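
Note: every example on this page relies on a project-local load_vocab helper whose definition is not shown. For vocabulary-file examples like the one above, a minimal sketch of what it might look like, assuming the vocabulary object (with word2idx/idx2word mappings and a __len__) was pickled to disk:

import pickle

def load_vocab(vocab_file):
    # hypothetical sketch: unpickle a Vocabulary object saved at build time
    with open(vocab_file, 'rb') as f:
        return pickle.load(f)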
Example #2
def make_pretrain_data(args):
    vocab = load_vocab(args.vocab)
    vocab_list = []
    for wid in range(vocab.get_piece_size()):  # avoid shadowing the builtin id()
        if not vocab.is_unknown(wid):
            vocab_list.append(vocab.id_to_piece(wid))

    line_cnt = 0
    with open(args.input, "r") as in_f:
        for line in in_f:
            line_cnt += 1

    datas = []
    with open(args.input, "r") as f:
        for i, line in enumerate(
                tqdm(f, total=line_cnt, desc="Loading Dataset",
                     unit=" lines")):
            data = json.loads(line)
            if 0 < len(data["doc"]):
                datas.append(data)

    with open(args.output, "w") as out_f:
        for i, data in enumerate(
                tqdm(datas, desc="Make Pretrain Dataset", unit=" lines")):
            instances = create_pretrain_instances(datas, i, data["doc"],
                                                  args.n_seq, args.mask_prob,
                                                  vocab_list)
            for instance in instances:
                out_f.write(json.dumps(instance))
                out_f.write("\n")
Example #3
def get_datasets(config_data):
    images_root_dir = config_data['dataset']['images_root_dir']
    root_train = os.path.join(images_root_dir, 'train')
    root_val = os.path.join(images_root_dir, 'val')
    root_test = os.path.join(images_root_dir, 'test')

    train_ids_file_path = config_data['dataset']['training_ids_file_path']
    val_ids_file_path = config_data['dataset']['validation_ids_file_path']
    test_ids_file_path = config_data['dataset']['test_ids_file_path']

    train_annotation_file = config_data['dataset'][
        'training_annotation_file_path']
    test_annotation_file = config_data['dataset']['test_annotation_file_path']
    coco = COCO(train_annotation_file)
    coco_test = COCO(test_annotation_file)

    vocab_threshold = config_data['dataset']['vocabulary_threshold']
    vocabulary = load_vocab(train_annotation_file, vocab_threshold)

    train_data_loader = get_coco_dataloader(train_ids_file_path, root_train,
                                            train_annotation_file, coco,
                                            vocabulary, config_data)
    val_data_loader = get_coco_dataloader(val_ids_file_path, root_val,
                                          train_annotation_file, coco,
                                          vocabulary, config_data)
    test_data_loader = get_coco_dataloader(test_ids_file_path, root_test,
                                           test_annotation_file, coco_test,
                                           vocabulary, config_data)

    return coco_test, vocabulary, train_data_loader, val_data_loader, test_data_loader
Example #4
def process_caption(cap_dir, vocab_pkl, max_sen_len):
    """
    对caption增加<start>, <end>,截断或填充至固定长度,并将结果保存在{dataset_name}_ids.pkl

    :param max_sen_len:
    :param vocab_pkl:
    :param cap_dir: 划分后的数据集集合根目录,包括train, val, test
    :return:
    """
    vocab = load_vocab(vocab_pkl)

    subsets = ['train', 'val', 'test']
    for subset in subsets:
        ann_file = os.path.join(cap_dir, subset + '_ids.pkl')
        if os.path.exists(ann_file):
            print('*' * 20, os.path.basename(ann_file), 'already exists.',
                  '*' * 20)
            continue

        # {ann_id: {'caption': cap, 'length': cap_len}}
        data_dict = {}
        path = os.path.join(cap_dir, subset + '.json')
        coco = COCO(path)
        ann_ids = list(coco.anns.keys())
        for ann_id in tqdm(ann_ids):
            item_new = collections.OrderedDict()
            cap = coco.anns[ann_id]['caption']
            cap, cap_len = fix_length(cap, vocab, max_sen_len)
            item_new['caption'] = cap
            item_new['length'] = cap_len
            data_dict[ann_id] = item_new

        print('*' * 20, 'save ann_file: ', ann_file, '*' * 20)
        with open(ann_file, 'wb') as f:
            pickle.dump(data_dict, f)
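
Note: fix_length is not included in this excerpt. A minimal sketch of the behavior the docstring describes, assuming the vocabulary is callable (word -> id, as in the Keras example further down) and defines <start>, <end>, and <pad> entries:

def fix_length(cap, vocab, max_sen_len):
    # hypothetical sketch: add <start>/<end>, then truncate or pad the id
    # sequence to exactly max_sen_len entries
    tokens = ['<start>'] + cap.lower().split() + ['<end>']
    ids = [vocab(t) for t in tokens][:max_sen_len]
    cap_len = len(ids)
    ids += [vocab('<pad>')] * (max_sen_len - cap_len)
    return ids, cap_len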
Example #5
def make_pretrain_data(args):
    """ pretrain 데이터 생성 """
    if os.path.isfile(args.pretrain):
        print(f"{args.pretrain} exists")
        return

    vocab = load_vocab(args.vocab)
    vocab_list = [vocab.id_to_piece(wid) for wid in range(vocab.get_piece_size()) if not vocab.is_unknown(wid)]

    docs = []
    with open(args.corpus, "r") as f:
        lines = f.read().splitlines()

        doc = []
        for line in tqdm(lines, desc=f"Loading {os.path.basename(args.corpus)}"):
            line = line.strip()
            if line == "":
                if 0 < len(doc):
                    docs.append(doc)
                    doc = []
            else:
                pieces = vocab.encode_as_pieces(line)
                if 0 < len(pieces):
                    doc.append(pieces)
        if 0 < len(doc):
            docs.append(doc)

    with open(args.pretrain, "w") as out_f:
        config = Config.load(args.config)
        for i, doc in enumerate(tqdm(docs, desc=f"Making {os.path.basename(args.pretrain)}", unit=" lines")):
            instances = create_pretrain_instances(docs, i, doc, config.n_enc_seq, config.mask_prob, vocab_list)
            for instance in instances:
                out_f.write(json.dumps(instance, ensure_ascii=False))
                out_f.write("\n")
Example #6
def main(args):

    vocab = load_vocab()

    encoder = CNNEncoder()
    decoder = DecoderRNN(512, 512, len(vocab))

    encoder_state_dict, decoder_state_dict, optimizer, *meta = utils.load_models(args.checkpoint_file, False)
    encoder.load_state_dict(encoder_state_dict)
    decoder.load_state_dict(decoder_state_dict)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # no RandomHorizontalFlip here: random augmentation is unwanted at inference
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    inp = cv2.imread(args.image_path)
    # cv2 loads BGR; convert to RGB before applying ImageNet normalization
    inp = cv2.cvtColor(inp, cv2.COLOR_BGR2RGB)
    inp = transform(Image.fromarray(inp)).unsqueeze(0)
    inp = utils.to_var(inp, volatile=True)

    features = encoder(inp)
    sampled_ids = decoder.sample(features)

    sampled_ids = sampled_ids.cpu().data.numpy()[0]
    sentence = utils.convert_back_to_text(sampled_ids, vocab)

    print('Caption:', sentence)
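
Note: utils.convert_back_to_text is not shown. A plausible sketch, assuming the vocabulary exposes an idx2word mapping (as in the Keras example below) and the usual <start>/<end> markers:

def convert_back_to_text(word_ids, vocab):
    # hypothetical sketch: map sampled ids back to words, stop at <end>
    words = []
    for wid in word_ids:
        word = vocab.idx2word[wid]
        if word == '<end>':
            break
        if word != '<start>':
            words.append(word)
    return ' '.join(words)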
Example #7
def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)
    if master: wandb.init(project="transformer-evolution")

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss, best_score = 0, 0, 0
    model = transformer.QA(config)
    if os.path.isfile(args.save):
        best_epoch, best_loss, best_score = model.load(args.save)
        print(f"rank: {rank} load state dict from: {args.save}")
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)
    else:
        model.to(config.device)
    if master: wandb.watch(model)

    criterion = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_data_loader(vocab, "KorQuAD_v1.0_train.json", args, shuffle=True)
    test_loader, _ = data.build_data_loader(vocab, "KorQuAD_v1.0_dev.json", args, shuffle=False)  # evaluate on the dev split, not the training file

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    offset = best_epoch
    for step in trange(args.epoch, desc="Epoch"):
        if train_sampler:
            train_sampler.set_epoch(step)
        epoch = step + offset

        loss = train_epoch(config, rank, epoch, model, criterion, optimizer, scheduler, train_loader)
        score = eval_epoch(config, rank, model, test_loader)
        if master: wandb.log({"loss": loss, "accuracy": score})

        if master and best_score < score:
            best_epoch, best_loss, best_score = epoch, loss, score
            if isinstance(model, DistributedDataParallel):
                model.module.save(best_epoch, best_loss, best_score, args.save)
            else:
                model.save(best_epoch, best_loss, best_score, args.save)
            print(f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}, socre={best_score:.3f}")

    if 1 < args.n_gpu:
        destroy_process_group()
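
Note: init_process_group and destroy_process_group are project-local wrappers in these examples. A minimal sketch of what they presumably do with torch.distributed (the master address/port values are assumptions):

import os
import torch.distributed as dist

def init_process_group(rank, world_size):
    # hedged sketch: single-node multi-GPU setup with the NCCL backend
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def destroy_process_group():
    dist.destroy_process_group()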
Example #8
def main():
    infer_img_path = "C:/Users/Crypto/PycharmProjects/segmented_style_transfer/data"
    infer_img_name = 'two_boys.jpg'
    # Read and Process Raw data
    data = CaptioningData()
    # Finding image files as data
    data.set_all_images(cfg.images_path)
    captions_dict = data.get_captions(cfg.token_file)
    caption_maxlen = data.get_caption_maxlen()

    vocab = load_vocab(vocab_path=cfg.data_path, vocab_name=cfg.vocab_name)
    # print(vocab.word2idx)
    inception_encoding = Encoder()

    # Decoder model
    decoder = Decoder(vocab_size=len(vocab),
                      embedding_size=300,
                      input_shape=2048,
                      caption_max_len=caption_maxlen)
    decoder_model = decoder.get_model()
    decoder_model.load_weights('model/best_weights.97-0.95.hdf5')

    img_ids = data.get_val_images(cfg.val_image_files)
    img_name = img_ids[19]

    enc_img = inception_encoding.encode_single_img(file_path=cfg.images_path,
                                                   img_name=img_name)
    # enc_img = inception_encoding.encode_single_img(file_path=infer_img_path, img_name=infer_img_name)

    caption = ["<start>"]
    while True:
        par_caps = [vocab(i) for i in caption]
        par_caps = sequence.pad_sequences([par_caps],
                                          maxlen=40,
                                          padding='post')
        preds = decoder_model.predict(
            [np.array([enc_img]), np.array(par_caps)])
        word_pred = vocab.idx2word[np.argmax(preds[0])]
        caption.append(word_pred)

        if word_pred == "<end>" or len(caption) > 40:
            break

    full_img_path = os.path.join(cfg.images_path, img_name)
    print(captions_dict[img_name])
    print(full_img_path)
    print(' '.join(caption[1:-1]))

    for beam_size in [3, 5, 7]:
        caption = beam_search_predictions(vocab,
                                          enc_img,
                                          decoder_model,
                                          caption_maxlen,
                                          beam_index=beam_size)
        print(beam_size, caption)
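
Note: beam_search_predictions is defined elsewhere. A compact sketch of the idea against the same decoder_model.predict interface: keep the beam_index best partial captions, expand each by its beam_index most probable next words, and sum log-probabilities; for brevity it does not stop a beam early at <end>:

import numpy as np
from keras.preprocessing import sequence  # or tensorflow.keras.preprocessing

def beam_search_predictions(vocab, enc_img, decoder_model, max_len, beam_index=3):
    beams = [([vocab('<start>')], 0.0)]  # (token ids, cumulative log-prob)
    while len(beams[0][0]) < max_len:
        candidates = []
        for seq, score in beams:
            padded = sequence.pad_sequences([seq], maxlen=max_len, padding='post')
            preds = decoder_model.predict([np.array([enc_img]), padded])[0]
            for w in np.argsort(preds)[-beam_index:]:
                candidates.append((seq + [int(w)], score + np.log(preds[w] + 1e-12)))
        beams = sorted(candidates, key=lambda c: c[1])[-beam_index:]
    words = [vocab.idx2word[i] for i in beams[-1][0]]
    return ' '.join(w for w in words if w not in ('<start>', '<end>'))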
Example #9
def __init__(self, _hparams):
    self.hparams = _hparams
    set_seed(_hparams.fixed_seed)
    self.train_loader = get_dataloader(_hparams.train_src_path, _hparams.train_dst_path,
                                       _hparams.batch_size, _hparams.num_workers)
    self.src_vocab, self.dst_vocab = load_vocab(_hparams.train_src_pkl, _hparams.train_dst_pkl)
    self.device = torch.device(_hparams.device)
    self.model = NMT(_hparams.embed_size, _hparams.hidden_size,
                     self.src_vocab, self.dst_vocab, self.device,
                     _hparams.dropout_rate).to(self.device)
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=_hparams.lr)
Example #10
def main():
    # import ipdb; ipdb.set_trace()
    config = get_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(config.gpu_id)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    logger = get_logger(config)
    voc = load_vocab(config)
    config.n_voc = len(voc.voc)
    logger.info(config)
    config.embeddings = voc.embeddings

    config.use_product_info = False
    config.use_user_info = False
    if config.use_user_info and config.use_product_info:
        usrdict = UserTable('../data/' + config.dataname + '/usrlist.txt')
        prddict = ProductTable('../data/' + config.dataname + '/prdlist.txt')
        config.n_users = usrdict.size + 1
        config.n_products = prddict.size + 1
    else:
        usrdict = None
        prddict = None

    logger.info("build model...")
    with tf.device("/device:{}:{}".format(config.device_type, config.gpu_id)):
        model = Model(config)

    logger.info("creating session...")
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=config.gpu_allocate_rate)
    gpu_options.allow_growth = True
    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    gpu_options=gpu_options)
    sess = tf.Session(config=session_config)
    model.init_variable(sess)

    trainset, devset, testset = load_dataset(config, voc, usrdict, prddict)

    if config.load_model:
        logger.info("restoring model...")
        model.restore(sess)

    if not config.test_only:

        logger.info("starting training...")
        model.train(sess, trainset, devset, testset)
        logger.info("training done.")

    logger.info("starting testing...")
    test_acc, test_mae, test_rmse = model.evaluate(sess, testset)
    logger.info(
        "final result of testset: acc = {:.4f}, mae = {:.4f}, rmse = {:.4f}".
        format(test_acc, test_mae, test_rmse))
    logger.info("testing done.")
Example #11
def save_dataset(dataset, vocab, output,
                 max_length=20):
    sentences = []
    labels = []
    with open(dataset) as questions_file:

        csv_reader = csv.reader(questions_file, delimiter='\t')

        line_count = 0
        column_names = []
        for row in csv_reader:

            if line_count == 0:
                column_names.append(row[0])
                column_names.append(row[1])
                line_count += 1
            else:
                sentences.append(row[1])
                labels.append(row[2])

    total_sentences = len(sentences)

    vocab = load_vocab(vocab)

    print('Number of sentences to be written: %d' % total_sentences)

    h5file = h5py.File(output, "w")
    d_questions = h5file.create_dataset(
        "sentences", (total_sentences, max_length), dtype='i')

    d_labels = h5file.create_dataset(
        "labels", (total_sentences,), dtype='i')

    bar = progressbar.ProgressBar(maxval=total_sentences)

    q_index = 0
    for sentence, label in zip(sentences, labels):

        # honor the max_length parameter instead of a hard-coded 20
        q, length = process_text(sentence, vocab,
                                 max_length=max_length)
        d_questions[q_index, :length] = q
        if label not in ('yesno', 'factoid', 'list', 'summary'):
            print('error: unexpected label %s' % label)
        l = process_label(label)
        d_labels[q_index] = int(l)
        q_index += 1
        bar.update(q_index)
    h5file.close()
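
Note: process_text (also used in the VQA example below) is not included. A minimal sketch, assuming a callable vocabulary (word -> id) and whitespace tokenization:

import numpy as np

def process_text(text, vocab, max_length=20):
    # hypothetical sketch: lowercase, split, map to ids, truncate
    tokens = text.lower().strip().split()
    ids = [vocab(t) for t in tokens][:max_length]
    return np.array(ids, dtype='int32'), len(ids)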
Example #12
def make_pretrain_data(args):
    vocab = load_vocab(args.vocab)
    vocab_list = []
    for wid in range(vocab.get_piece_size()):  # avoid shadowing the builtin id()
        if not vocab.is_unknown(wid):
            vocab_list.append(vocab.id_to_piece(wid))

    line_cnt = 0
    with open(args.input, "r") as in_f:
        for line in in_f:
            line_cnt += 1

    docs = []
    with open(args.input, "r") as f:
        doc = []
        for i, line in enumerate(
                tqdm(f,
                     total=line_cnt,
                     desc=f"Loading {args.input}",
                     unit=" lines")):
            line = line.strip()
            if line == "":
                if 0 < len(doc):
                    docs.append(doc)
                    doc = []
            else:
                pieces = vocab.encode_as_pieces(line)
                if 0 < len(pieces):
                    doc.append(pieces)
        if doc:
            docs.append(doc)

    for index in range(args.count):
        output = args.output.format(index)
        if os.path.isfile(output): continue

        with open(output, "w") as out_f:
            for i, doc in enumerate(
                    tqdm(docs, desc=f"Making {output}", unit=" lines")):
                instances = create_pretrain_instances(docs, i, doc, args.n_seq,
                                                      args.mask_prob,
                                                      vocab_list)
                for instance in instances:
                    out_f.write(json.dumps(instance))
                    out_f.write("\n")
Example #13
    def __init__(self, fea_dim, embed_dim, hid_dim, max_sen_len, vocab_pkl):
        super(RNN, self).__init__()
        self.fea_dim = fea_dim
        self.embed_dim = embed_dim
        self.hid_dim = hid_dim
        self.max_sen_len = max_sen_len
        self.vocab = load_vocab(vocab_pkl)

        self.vocab_size = len(self.vocab)
        self.lstm_cell = nn.LSTMCell(self.embed_dim, self.hid_dim)
        self.embed = nn.Embedding(
            self.vocab_size, self.embed_dim)  # num_embeddings, embedding_dim
        self.fc = weight_norm(nn.Linear(self.hid_dim, self.vocab_size))
        self.dropout = nn.Dropout(0.5)
        self.init_h = weight_norm(nn.Linear(self.fea_dim, self.hid_dim))
        self.init_c = weight_norm(nn.Linear(self.fea_dim, self.hid_dim))

        self.init_weight()
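
Note: the init_h/init_c layers above are the usual way to seed an LSTM's state from image features. A standalone illustration of the pattern (the dimensions are assumptions):

import torch
import torch.nn as nn

fea_dim, hid_dim = 2048, 512
init_h = nn.Linear(fea_dim, hid_dim)  # feature -> initial hidden state
init_c = nn.Linear(fea_dim, hid_dim)  # feature -> initial cell state

features = torch.randn(4, fea_dim)    # a batch of image feature vectors
h, c = init_h(features), init_c(features)
print(h.shape, c.shape)               # torch.Size([4, 512]) twice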
Example #14
def get_datasets(config_data):

    gen_flag = (config_data['experiment']['num_epochs'] == -1)
    images_root_dir = config_data['dataset']['images_root_dir']
    if not gen_flag:
        root_train = os.path.join(images_root_dir, 'train')
        root_val = os.path.join(images_root_dir, 'val')
        root_test = os.path.join(images_root_dir, 'test')
    else:
        root_img = images_root_dir

    if not gen_flag:
        train_ids_file_path = config_data['dataset']['training_ids_file_path']
        val_ids_file_path = config_data['dataset']['validation_ids_file_path']
        test_ids_file_path = config_data['dataset']['test_ids_file_path']

    train_annotation_file = config_data['dataset']['training_annotation_file_path']
    test_annotation_file = config_data['dataset']['test_annotation_file_path']
    coco = COCO(train_annotation_file)
    coco_test = COCO(test_annotation_file)

    vocab_threshold = config_data['dataset']['vocabulary_threshold']
    vocabulary = load_vocab(train_annotation_file, vocab_threshold)

    if not gen_flag:
        train_data_loader = get_coco_dataloader(train_ids_file_path, root_train, train_annotation_file, coco, vocabulary,
                                                config_data)
        val_data_loader = get_coco_dataloader(val_ids_file_path, root_val, train_annotation_file, coco, vocabulary,
                                              config_data, train=False)
        test_data_loader = get_coco_dataloader(test_ids_file_path, root_test, test_annotation_file, coco_test, vocabulary,
                                               config_data, train=False)
    else:
        train_data_loader, val_data_loader, test_data_loader, coco_test = 0, 0, 0, 0


    return coco_test, vocabulary, train_data_loader, val_data_loader, test_data_loader
Example #15
# Get dataset

# Image Preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

# load COCOs dataset
IMAGES_PATH = 'data/train2014'
CAPTION_FILE_PATH = 'data/annotations/captions_train2014.json'

vocab = load_vocab()

train_loader = get_coco_data_loader(path=IMAGES_PATH,
                                    json=CAPTION_FILE_PATH,
                                    vocab=vocab,
                                    transform=transform,
                                    batch_size=batch_size,
                                    shuffle=True)

IMAGES_PATH = 'data/val2014'
CAPTION_FILE_PATH = 'data/annotations/captions_val2014.json'
val_loader = get_coco_data_loader(path=IMAGES_PATH,
                                  json=CAPTION_FILE_PATH,
                                  vocab=vocab,
                                  transform=transform,
                                  batch_size=batch_size,
                                  shuffle=True)
Example #16
def save_dataset(image_dir, questions, annotations, vocab, ans2cat, output,
                 im_size=224, max_q_length=20, max_a_length=4,
                 with_answers=False):
    """Saves the Visual Genome images and the questions in a hdf5 file.

    Args:
        image_dir: Directory with all the images.
        questions: Location of the questions.
        annotations: Location of all the annotations.
        vocab: Location of the vocab file.
        ans2cat: Mapping from answers to category.
        output: Location of the hdf5 file to save to.
        im_size: Size of image.
        max_q_length: Maximum length of the questions.
        max_a_length: Maximum length of the answers.
        with_answers: Whether to also save the answers.
    """
    # Load the data.
    vocab = load_vocab(vocab)
    with open(annotations) as f:
        annos = json.load(f)
    with open(questions) as f:
        questions = json.load(f)
    with open(ans2cat) as f:
        ans2cat = json.load(f)

    # Get the mappings from qid to answers.
    qid2ans, image_ids = create_answer_mapping(annos, ans2cat)
    total_questions = len(qid2ans.keys())
    total_images = len(image_ids)
    print "Number of images to be written: %d" % total_images
    print "Number of QAs to be written: %d" % total_questions

    h5file = h5py.File(output, "w")
    d_questions = h5file.create_dataset(
        "questions", (total_questions, max_q_length), dtype='i')
    d_indices = h5file.create_dataset(
        "image_indices", (total_questions,), dtype='i')
    d_images = h5file.create_dataset(
        "images", (total_images, im_size, im_size, 3), dtype='f')
    d_answers = h5file.create_dataset(
        "answers", (total_questions, max_a_length), dtype='i')
    d_answer_types = h5file.create_dataset(
        "answer_types", (total_questions,), dtype='i')

    # Create the transforms we want to apply to every image.
    transform = transforms.Compose([
        transforms.Resize((im_size, im_size))])

    # Iterate and save all the questions and images.
    bar = progressbar.ProgressBar(maxval=total_questions)
    i_index = 0
    q_index = 0
    done_img2idx = {}
    for entry in questions['questions']:
        image_id = entry['image_id']
        question_id = entry['question_id']
        if image_id not in image_ids:
            continue
        if question_id not in qid2ans:
            continue
        if image_id not in done_img2idx:
            try:
                path = "%d.jpg" % (image_id)
                image = Image.open(os.path.join(image_dir, path)).convert('RGB')
            except IOError:
                path = "%012d.jpg" % (image_id)
                image = Image.open(os.path.join(image_dir, path)).convert('RGB')
            image = transform(image)
            d_images[i_index, :, :, :] = np.array(image)
            done_img2idx[image_id] = i_index
            i_index += 1
        q, length = process_text(entry['question'], vocab,
                                 max_length=max_q_length)
        d_questions[q_index, :length] = q
        answer = qid2ans[question_id]
        a, length = process_text(answer, vocab,
                                 max_length=max_a_length)
        d_answers[q_index, :length] = a
        d_answer_types[q_index] = ans2cat[answer]
        d_indices[q_index] = done_img2idx[image_id]
        q_index += 1
        bar.update(q_index)
    h5file.close()
    print "Number of images written: %d" % i_index
    print "Number of QAs written: %d" % q_index
Example #17
def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab = len(vocab)
    # check whether a GPU is available
    config.device = torch.device(
        f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss = 0, 0
    """학습 실행"""
    # BERTPretrain을 생성합니다.
    model = bert.BERTPretrain(config)
    # 기존에 학습된 pretrain 값이 있다면 이를 로드 합니다.
    if os.path.isfile(args.save):
        best_epoch, best_loss = model.bert.load(args.save)
        print(
            f"rank: {rank} load pretrain from: {args.save}, epoch={best_epoch}, loss={best_loss}"
        )
        best_epoch += 1
    # let BERTPretrain run on GPU or CPU
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model,
                                        device_ids=[rank],
                                        find_unused_parameters=True)
    else:
        model.to(config.device)

    # declare the MLM loss (criterion_lm) and NSP loss (criterion_cls)
    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader = data.build_pretrain_loader(vocab,
                                              args,
                                              epoch=best_epoch,
                                              shuffle=True)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    # declare the optimizer
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.learning_rate,
                            eps=args.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    offset = best_epoch
    losses = []
    for step in trange(args.epoch, desc="Epoch"):
        epoch = step + offset
        # a fresh train_loader is built for each epoch;
        # step 0 is skipped because its loader was already built above
        if 0 < step:
            del train_loader
            train_loader = data.build_pretrain_loader(vocab,
                                                      args,
                                                      epoch=epoch,
                                                      shuffle=True)

        # train for one epoch
        loss = train_epoch(config, rank, epoch, model, criterion_lm,
                           criterion_cls, optimizer, scheduler, train_loader)
        losses.append(loss)

        if master:
            best_epoch, best_loss = epoch, loss
            if isinstance(model, DistributedDataParallel):
                model.module.bert.save(best_epoch, best_loss, args.save)
            else:
                model.bert.save(best_epoch, best_loss, args.save)
            print(
                f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}"
            )

    print(f">>>> rank: {rank} losses: {losses}")
    if 1 < args.n_gpu:
        destroy_process_group()
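
Note: several examples call optim.get_linear_schedule_with_warmup, mirroring the schedule of the same name in the transformers library. In case optim is a local module, a self-contained sketch of that schedule with torch.optim.lr_scheduler.LambdaLR:

from torch.optim.lr_scheduler import LambdaLR

def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    # hedged sketch: lr ramps linearly from 0 over the warmup steps, then
    # decays linearly back to 0 at num_training_steps
    def lr_lambda(step):
        if step < num_warmup_steps:
            return step / max(1, num_warmup_steps)
        return max(0.0, (num_training_steps - step) /
                   max(1, num_training_steps - num_warmup_steps))
    return LambdaLR(optimizer, lr_lambda)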
Example #18
def train_model(rank, world_size, args):
    """ 모델 학습 """
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)
    if master and args.wandb:
        wandb.init(project=args.project)

    vocab = load_vocab(args.vocab)

    config = Config.load(args.config)
    config.n_enc_vocab = len(vocab)
    config.device = f"cuda:{rank}" if torch.cuda.is_available() else "cpu"
    print(config)

    best_epoch, best_loss = 0, 0
    train_model = ALBERTPretrain(config)
    if os.path.isfile(args.pretrain_save):
        try:
            best_epoch, best_loss = train_model.albert.load(args.pretrain_save)
            print(
                f"load pretrain from: {os.path.basename(args.pretrain_save)}, epoch={best_epoch}, loss={best_loss:.4f}"
            )
        except Exception:
            print(f'load {os.path.basename(args.pretrain_save)} failed.')

    if 1 < args.n_gpu:
        train_model.to(config.device)
        # noinspection PyArgumentList
        train_model = DistributedDataParallel(train_model,
                                              device_ids=[rank],
                                              find_unused_parameters=True)
    else:
        train_model.to(config.device)

    if master and args.wandb:
        wandb.watch(train_model)

    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader: DataLoader = data.build_pretrain_loader(vocab,
                                                          args,
                                                          shuffle=True)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in train_model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        config.weight_decay
    }, {
        'params': [
            p for n, p in train_model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=config.learning_rate,
                            eps=config.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.warmup_steps,
        num_training_steps=t_total)

    start_epoch = best_epoch + 1
    losses = []
    with trange(args.epoch, desc="Epoch", position=0) as pbar:
        pbar.set_postfix_str(
            f"best epoch: {best_epoch}, loss: {best_loss:.4f}")
        for step in pbar:
            epoch = step + start_epoch

            loss = train_epoch(config, rank, train_model, criterion_lm,
                               criterion_cls, optimizer, scheduler,
                               train_loader)
            losses.append(loss)
            if master and args.wandb:
                wandb.log({"loss": loss})

            if master:
                best_epoch, best_loss = epoch, loss
                if isinstance(train_model, DistributedDataParallel):
                    train_model.module.albert.save(best_epoch, best_loss,
                                                   args.pretrain_save)
                else:
                    train_model.albert.save(best_epoch, best_loss,
                                            args.pretrain_save)

                pbar.set_postfix_str(
                    f"best epoch: {best_epoch}, loss: {best_loss:.4f}")

    if 1 < args.n_gpu:
        destroy_process_group()
Example #19
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--mode",
        default="prepare",
        type=str,
        required=False,
        help="동작모드 입니다. download: 학습할 데이터 다운로드, prepare: 학습할 데이터셋 생성")
    args = parser.parse_args()

    if not os.path.exists("data"):
        os.makedirs("data")

    if args.mode == "download":
        download_data(args)
    elif args.mode == "prepare":
        vocab = load_vocab("kowiki.model")
        args.corpus = "data/kowiki.txt"
        if not os.path.isfile(args.corpus):
            build_corpus("data/kowiki.csv", args.corpus)
        if not os.path.isfile("data/kowiki.json"):
            prepare_pretrain(args, vocab, "data/kowiki.json")
        if not os.path.isfile("data/ratings_train.json"):
            prepare_train(args, vocab, "data/ratings_train.txt",
                          "data/ratings_train.json")
        if not os.path.isfile("data/ratings_test.json"):
            prepare_train(args, vocab, "data/ratings_test.txt",
                          "data/ratings_test.json")
    else:
        print(
            f"Unsupported mode: {args.mode}\n- download: download the training data\n- prepare: build the training datasets"
        )
Example #20
def __init__(self, _hparams):
    self.hparams = _hparams
    self.src_vocab, self.dst_vocab = load_vocab(_hparams.train_src_pkl,
                                                _hparams.train_dst_pkl)
    self.device = torch.device(_hparams.device)
Example #21
def test() -> None:
    en_vocab = load_vocab(TRAIN_EN_VOCAB_FILE)
    fr_vocab = load_vocab(TRAIN_FR_VOCAB_FILE)

    print("English Vocab Size: {} French Vocab Size: {}".format(
        len(en_vocab), len(fr_vocab)))
Example #22
def train_model(rank, world_size, args):
    """ 모델 학습 """
    master = (world_size == 0 or rank % world_size == 0)
    if master and args.wandb:
        wandb.init(project=args.project, resume=args.name, tags=args.tags)

    if 1 < args.n_gpu:
        init_process_group(rank, world_size)

    vocab = load_vocab(args.vocab)

    config = Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss, best_score = 0, 0, 0
    model: MovieClassification = transformer.MovieClassification(config)
    if args.resume and os.path.isfile(args.save):
        best_epoch, best_loss, best_score = model.load(args.save)
        print(f"rank: {rank}, last epoch: {best_epoch} load state dict from: {os.path.basename(args.save)}")
    model.to(config.device)

    if master and args.wandb:
        wandb.watch(model)

    if 1 < args.n_gpu:
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)

    criterion = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_data_loader(rank, vocab, os.path.abspath(os.path.join(os.getcwd(), args.data_dir, "ratings_train.json")), args, shuffle=True)
    test_loader, test_sampler = data.build_data_loader(rank, vocab, os.path.abspath(os.path.join(os.getcwd(), args.data_dir, "ratings_test.json")), args, shuffle=False)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optimization.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = optimization.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, last_epoch=best_epoch)

    print(f'total_memory: {torch.cuda.get_device_properties(rank).total_memory / (1024 * 1024):.3f} MB')
    with tqdm(initial=best_epoch + 1, total=args.epoch, position=0) as pbar:
        for epoch in range(best_epoch + 1, args.epoch + 1):
            if train_sampler:
                train_sampler.set_epoch(epoch)

            train_loss = train_epoch(args, config, rank, epoch, model, criterion, optimizer, scheduler, train_loader)
            test_loss, test_accuracy = eval_epoch(config, rank, model, test_loader, test_sampler)
            if master and args.wandb:
                wandb.config.update(args)
                wandb.log({"train loss": train_loss, "accuracy": test_accuracy}, step=epoch)

            if master:
                if best_score < test_accuracy:
                    best_epoch, best_loss, best_score = epoch, train_loss, test_accuracy
                    pbar.set_description(f'Best (score={best_score:.3f}, epoch={best_epoch})')
                    if isinstance(model, DistributedDataParallel):
                        model.module.save(best_epoch, best_loss, best_score, args.save)
                    else:
                        model.save(best_epoch, best_loss, best_score, args.save)
                else:
                    if best_epoch + 5 < epoch:  # early stop
                        break

            pbar.update()
        print(f'total_memory: {torch.cuda.get_device_properties(rank).total_memory / (1024 * 1024):.3f} MB')

    if master and args.wandb:
        wandb.save(args.name)
    if 1 < args.n_gpu:
        destroy_process_group()
Example #23
    parser.add_argument("--optimize_batch_size", type=int, default=256)
    parser.add_argument("--generate_size", type=int, default=8196)
    parser.add_argument("--generate_batch_size", type=int, default=256)
    parser.add_argument("--learning_rate", type=float, default=1e-3)
    parser.add_argument("--kl_div_coef", type=float, default=0.0)

    parser.add_argument("--disable_neptune", action="store_true")

    args = parser.parse_args()

    #args.disable_neptune = True

    device = torch.device(0)
    random.seed(0)

    vocab = load_vocab(name=args.vocab_name)
    obj_func = load_score_func(name=args.objective)

    model = RecurrentNetwork(vocab_size=len(vocab),
                             hidden_size=args.hidden_size,
                             num_layers=args.num_layers)
    model.load_state_dict(torch.load(args.pretrained_model_path))
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    scheme = HillClimbScheme()

    if not args.disable_neptune:
        neptune.init(project_qualified_name="sungsoo.ahn/real-mol-opt")
        experiment = neptune.create_experiment(name="hill_climb",
                                               params=vars(args))
        result_dir = f"./result/tmp/{experiment.id}"
    else:
Example #24
import utils, pickle, vocab
# glove_path = '/home/ouzj01/zhangyc/project/glove/glove.840B.300d.txt'
glove_path = 'data/glove/glove_ori.6B.50d.txt'
# save_path = 'data/glove/glove_multiwoz.840B.300d.txt'
# save_path = 'data/glove/glove_multiwoz.6B.50d.txt'
save_path = 'data/glove/glove_kvret.6B.50d.txt'

vocab = vocab.Vocab(1100)
vocab.load_vocab('data/kvret/vocab')
# vocab.load_vocab('data/MultiWOZ/processed/vocab')
vec_array = []
with open(glove_path, 'r', encoding='UTF-8') as ef:
    with open(save_path, 'w') as f:

        for line in ef:
            line_sep = line.strip().split(' ')
            word, vec = line_sep[0], line_sep[1:]
            if vocab.has_word(word):
                f.write(line)
Example #25
def main() -> None:
    parser = get_arg_parser()
    args = parser.parse_args()
    device = "cuda" if torch.cuda.is_available() and args.cuda else "cpu"
    print('using device {}'.format(device))

    print('loading vocabulary...')
    if args.small:
        print('using small training set')
        en_vocab = load_vocab(constants.SMALL_TRAIN_EN_VOCAB_FILE)
        fr_vocab = load_vocab(constants.SMALL_TRAIN_FR_VOCAB_FILE)
    else:
        en_vocab = load_vocab(constants.TRAIN_EN_VOCAB_FILE)
        fr_vocab = load_vocab(constants.TRAIN_FR_VOCAB_FILE)
    print('loaded vocabulary')

    print('loading datasets...')
    if args.small:
        train_dataset = d.ShardedCSVDataset(
            constants.WMT14_EN_FR_SMALL_TRAIN_SHARD)
    else:
        train_dataset = d.ShardedCSVDataset(constants.WMT14_EN_FR_TRAIN_SHARD)

    # valid_dataset = d.DualFileDataset(
    #     constants.WMT14_EN_FR_VALID + ".en",
    #     constants.WMT14_EN_FR_VALID + ".fr",
    # )

    train_loader = d.BatchedIterator(
        args.batch_size,
        train_dataset,
        en_vocab,
        fr_vocab,
        args.max_sequence_length,
    )

    # valid_loader = d.BatchedIterator(
    #     1,
    #     valid_dataset,
    #     en_vocab,
    #     fr_vocab,
    #     args.max_sequence_length,
    # )

    model = build_model(parser, en_vocab, fr_vocab)

    print('using model...')
    print(model)

    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    if not os.path.exists(os.path.join(args.save_dir, args.model_name)):
        os.makedirs(os.path.join(args.save_dir, args.model_name))

    # model.load_state_dict(torch.load('delete/model_1543183590.2138884/unk_problem.pt'))

    train(
        train_loader=train_loader,
        valid_loader=None,  # valid_loader,
        model=model,
        epochs=args.num_epochs,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        log_dir=args.log_dir,
        save_dir=args.save_dir,
        en_vocab=en_vocab,
        fr_vocab=fr_vocab,
        device=device,
        multi_gpu=args.multi_gpu,
        save_step=args.save_step,
        model_name=args.model_name,
        optimizer=args.optimizer,
    )
Example #26
def main(args):
    # hyperparameters
    batch_size = args.batch_size
    num_workers = 1

    # Image Preprocessing
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # load COCOs dataset
    IMAGES_PATH = 'data/train2014'
    CAPTION_FILE_PATH = 'data/annotations/captions_train2014.json'

    vocab = load_vocab()
    train_loader = get_coco_data_loader(path=IMAGES_PATH,
                                        json=CAPTION_FILE_PATH,
                                        vocab=vocab,
                                        transform=transform,
                                        batch_size=batch_size,
                                        shuffle=True,
                                        num_workers=num_workers)

    IMAGES_PATH = 'data/val2014'
    CAPTION_FILE_PATH = 'data/annotations/captions_val2014.json'
    val_loader = get_coco_data_loader(path=IMAGES_PATH,
                                      json=CAPTION_FILE_PATH,
                                      vocab=vocab,
                                      transform=transform,
                                      batch_size=batch_size,
                                      shuffle=True,
                                      num_workers=num_workers)

    losses_val = []
    losses_train = []

    # Build the models
    ngpu = 1
    initial_step = initial_epoch = 0
    embed_size = args.embed_size
    num_hiddens = args.num_hidden
    learning_rate = 5e-4
    num_epochs = 2
    log_step = args.log_step
    save_step = 500
    checkpoint_dir = args.checkpoint_dir

    encoder = CNNEncoder()
    decoder = DecoderRNN(embed_size, num_hiddens, len(vocab))

    # Loss
    criterion = nn.CrossEntropyLoss()

    if args.checkpoint_file:
        encoder_state_dict, decoder_state_dict, optimizer, *meta = utils.load_models(
            args.checkpoint_file, args.sample)
        initial_step, initial_epoch, losses_train, losses_val = meta
        encoder.load_state_dict(encoder_state_dict)
        decoder.load_state_dict(decoder_state_dict)
    else:
        params = list(decoder.parameters()) + list(
            encoder.batchnorm.parameters())
        optimizer = torch.optim.Adam(params, lr=learning_rate)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    if args.sample:
        return utils.sample(encoder, decoder, vocab, val_loader)

    # Train the Models
    total_step = len(train_loader)
    try:
        for epoch in range(initial_epoch, num_epochs):

            for step, (images, captions,
                       lengths) in enumerate(train_loader, start=initial_step):

                # Set mini-batch dataset; training inputs must not be
                # volatile, or backward() would fail
                images = utils.to_var(images)
                captions = utils.to_var(captions)
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]
                # Forward, Backward and Optimize
                decoder.zero_grad()
                encoder.zero_grad()
                if ngpu > 1:
                    # run on multiple GPU
                    features = nn.parallel.data_parallel(
                        encoder, images, range(ngpu))
                    outputs, alphas = nn.parallel.data_parallel(
                        decoder, features, range(ngpu))
                else:
                    # run on single GPU
                    features = encoder(images)
                    outputs, alphas = decoder(features, captions, lengths)

                train_loss = criterion(outputs, targets)  # outputs and targets must share a device
                train_loss += ((1. - alphas.sum(dim=1))**2).mean()
                losses_train.append(train_loss.data)
                train_loss.backward()
                optimizer.step()

                print('Epoch: {} - Step: {} - Train Loss: {}'.format(
                    epoch, step, losses_train[-1]))
                # Run validation set and predict
                if step % log_step == 0:
                    encoder.batchnorm.eval()
                    # run validation set
                    batch_loss_val = []
                    for val_step, (images, captions,
                                   lengths) in enumerate(val_loader):
                        images = utils.to_var(images, volatile=True)
                        captions = utils.to_var(captions, volatile=True)

                        targets = pack_padded_sequence(captions,
                                                       lengths,
                                                       batch_first=True)[0]
                        features = encoder(images)
                        outputs, alphas = decoder(features, captions, lengths)
                        val_loss = criterion(outputs, targets)
                        val_loss += ((1. - alphas.sum(dim=1))**2).mean()
                        batch_loss_val.append(val_loss.data)
                        if val_step % 50 == 0:
                            print('Epoch: {} - Step: {} - Mini Eval Loss: {}'.
                                  format(epoch, val_step, val_loss))
                            sampled_ids = decoder.sample(features)
                            sampled_ids = sampled_ids.cpu().data.numpy()[0]
                            sentence = utils.convert_back_to_text(
                                sampled_ids, vocab)
                            print('Sample:', sentence)

                            true_ids = captions.cpu().data.numpy()[0]
                            sentence = utils.convert_back_to_text(
                                true_ids, vocab)
                            print('Target:', sentence)

                    losses_val.append(np.mean(batch_loss_val))
                    # predict

                    print('Epoch: {} - Step: {} - Eval Loss: {}'.format(
                        epoch, step, losses_val[-1]))
                    encoder.batchnorm.train()

                # Save the models
                if (step + 1) % save_step == 0:
                    utils.save_models(encoder, decoder, optimizer, step, epoch,
                                      losses_train, losses_val, checkpoint_dir)
                    utils.dump_losses(
                        losses_train, losses_val,
                        os.path.join(checkpoint_dir, 'losses.pkl'))

    except KeyboardInterrupt:
        pass
    finally:
        # Do final save
        utils.save_models(encoder, decoder, optimizer, step, epoch,
                          losses_train, losses_val, checkpoint_dir)
        utils.dump_losses(losses_train, losses_val,
                          os.path.join(checkpoint_dir, 'losses.pkl'))
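
Note: the extra ((1. - alphas.sum(dim=1))**2).mean() term above is the doubly stochastic attention regularizer from Show, Attend and Tell: it pushes each image region's attention weights to sum to roughly 1 across decoding steps. A standalone illustration (the shapes are assumptions):

import torch

# alphas: attention weights, shape (batch, timesteps, regions),
# softmax-normalized over regions at every step
alphas = torch.softmax(torch.randn(4, 10, 49), dim=2)

# summing over timesteps gives each region's total attention; penalize
# its squared deviation from 1.0
reg = ((1.0 - alphas.sum(dim=1)) ** 2).mean()
print(reg)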
Example #27
def main():
    construct_vocab = False
    encode_images = False
    train = True

    # Read and Process Raw data
    data = CaptioningData()
    # Finding image files as data
    data.set_all_images(cfg.images_path)
    captions_dict = data.get_captions(cfg.token_file)
    caption_maxlen = data.get_caption_maxlen()

    # Construct vocabulary
    if construct_vocab:
        # get all caption to construct Vocab
        all_captions = data.get_all_captions()
        vocab = build_vocab(vocab_path=cfg.data_path,
                            vocab_name=cfg.vocab_name,
                            captions=all_captions,
                            threshold=2)
    else:
        vocab = load_vocab(vocab_path=cfg.data_path, vocab_name=cfg.vocab_name)
    # print(vocab.word2idx)
    inception_encoding = Encoder()

    # train data
    if train:
        train_images = data.get_train_images(cfg.train_image_files)
        train_pairs = [
            ImgCaptionPair(img_id, captions_dict[img_id])
            for img_id in train_images
        ]

        # Image Encoding

        if encode_images:
            train_img_encoding = inception_encoding.encode_images(
                file_path=cfg.images_path,
                image_list=train_images,
                encoding_file=cfg.train_img_encoding_file)
        else:
            train_img_encoding = inception_encoding.load_image_encoding(
                encoding_file=cfg.train_img_encoding_file)

        train_data_generator = data_generator(vocab,
                                              train_pairs,
                                              train_img_encoding,
                                              batch_size=1800,
                                              max_len=caption_maxlen)
        # next(g)

    # Decoder model
    decoder = Decoder(vocab_size=len(vocab),
                      embedding_size=300,
                      input_shape=2048,
                      caption_max_len=caption_maxlen)
    decoder_model = decoder.get_model()
    decoder_model.load_weights('best_weights.97-0.95.hdf5')

    if train:
        decoder_model.compile(loss='categorical_crossentropy',
                              optimizer=RMSprop(),
                              metrics=['accuracy'])
        ckpt = ModelCheckpoint('weights.{epoch:02d}-{loss:.2f}.hdf5',
                               monitor='loss',
                               verbose=0,
                               save_best_only=False,
                               save_weights_only=False,
                               mode='auto',
                               period=30)
        best_ckpt = ModelCheckpoint('best_weights.{epoch:02d}-{loss:.2f}.hdf5',
                                    monitor='loss',
                                    verbose=0,
                                    save_best_only=True,
                                    save_weights_only=False,
                                    mode='auto',
                                    period=1)
        decoder_model.fit_generator(train_data_generator,
                                    steps_per_epoch=30,
                                    epochs=100,
                                    callbacks=[ckpt, best_ckpt])

    decoder_model.save('decoder_model.h5')

    img_ids = data.get_val_images(cfg.val_image_files)
    img_name = img_ids[9]

    enc_img = inception_encoding.encode_single_img(file_path=cfg.images_path,
                                                   img_name=img_name)

    caption = ["<start>"]
    while True:
        par_caps = [vocab(i) for i in caption]
        par_caps = sequence.pad_sequences([par_caps],
                                          maxlen=40,
                                          padding='post')
        preds = decoder_model.predict(
            [np.array([enc_img]), np.array(par_caps)])
        word_pred = vocab.idx2word[np.argmax(preds[0])]
        caption.append(word_pred)

        if word_pred == "<end>" or len(caption) > 40:
            break

    full_img_path = os.path.join(cfg.images_path, img_name)
    print(captions_dict[img_name])
    print(full_img_path)
    print(' '.join(caption[1:-1]))
Example #28
def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(
        f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss = 0, 0
    model = albert.ALBERTPretrain(config)
    if os.path.isfile(args.save):
        model.albert.load(args.save)
        print(f"rank: {rank} load pretrain from: {args.save}")
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model,
                                        device_ids=[rank],
                                        find_unused_parameters=True)
    else:
        model.to(config.device)

    criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_pretrain_loader(vocab,
                                                             args,
                                                             shuffle=True)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        config.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=config.learning_rate,
                            eps=config.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.warmup_steps,
        num_training_steps=t_total)

    offset = best_epoch
    for step in trange(args.epoch, desc="Epoch"):
        if train_sampler:
            train_sampler.set_epoch(step)
        epoch = step + offset

        loss = train_epoch(config, rank, epoch, model, criterion_lm,
                           criterion_cls, optimizer, scheduler, train_loader)

        if master:
            best_epoch, best_loss = epoch, loss
            if isinstance(model, DistributedDataParallel):
                model.module.albert.save(best_epoch, best_loss, args.save)
            else:
                model.albert.save(best_epoch, best_loss, args.save)
            print(
                f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}"
            )

    if 1 < args.n_gpu:
        destroy_process_group()
Example #29

def build_data_loader(vocab, infile, args, shuffle=True):
    """ 데이터 로더 """
    dataset = MovieDataSet(vocab, infile)
    if 1 < args.n_gpu and shuffle:
        sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        loader = torch.utils.data.DataLoader(dataset, batch_size=args.batch, sampler=sampler, collate_fn=movie_collate_fn)
    else:
        sampler = None
        loader = torch.utils.data.DataLoader(dataset, batch_size=args.batch, sampler=sampler, shuffle=shuffle, collate_fn=movie_collate_fn)
    return loader, sampler


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default="../data/kowiki.txt", type=str, required=False,
                        help="input text file")
    parser.add_argument("--output", default="../data/kowiki_gpt.json", type=str, required=False,
                        help="output json file")
    parser.add_argument("--n_seq", default=256, type=int, required=False,
                        help="sequence length")
    args = parser.parse_args()

    if not os.path.isfile(args.output):
        vocab = load_vocab("../kowiki.model")
        make_pretrain_data(args, vocab)
    else:
        print(f"{args.output} exists")

Example #30
def main(args):
    # hyperparameters
    batch_size = args.batch_size
    num_workers = 2

    # Image Preprocessing
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    vocab = load_vocab()

    loader = get_basic_loader(dir_path=os.path.join(args.image_path),
                              transform=transform,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=num_workers)

    # Build the models
    embed_size = args.embed_size
    num_hiddens = args.num_hidden
    checkpoint_path = 'checkpoints'

    encoder = CNN(embed_size)
    decoder = RNN(embed_size,
                  num_hiddens,
                  len(vocab),
                  1,
                  rec_unit=args.rec_unit)

    encoder_state_dict, decoder_state_dict, optimizer, *meta = utils.load_models(
        args.checkpoint_file)
    encoder.load_state_dict(encoder_state_dict)
    decoder.load_state_dict(decoder_state_dict)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Train the Models
    try:
        results = []
        with torch.no_grad():
            for step, (images, image_ids) in enumerate(tqdm(loader)):
                images = utils.to_var(images)

                features = encoder(images)
                captions = beam_sample(decoder, features)
                # captions = decoder.sample(features)
                captions = captions.cpu().data.numpy()
                captions = [
                    utils.convert_back_to_text(cap, vocab) for cap in captions
                ]
                captions_formatted = [{
                    'image_id': int(img_id),
                    'caption': cap
                } for img_id, cap in zip(image_ids, captions)]
                results.extend(captions_formatted)
                print('Sample:', captions_formatted[0])
    except KeyboardInterrupt:
        print('Ok bye!')
    finally:
        import json
        file_name = 'captions_model.json'
        with open(file_name, 'w') as f:
            json.dump(results, f)