Example #1
    def __init__(self, vocab_size, data_dir, mode):
        self.sp = spm.SentencePieceProcessor()
        self.vocab_size = vocab_size

        if mode == 'train':
            self.data_file_path = os.path.join(data_dir,
                                               'train/train_data/train_data')

            def save_and_train_tokenizer(path, *args, **kwargs):
                with open(self.data_file_path) as f:
                    data = f.read().splitlines()
                    corpus = list()
                    for line in data:
                        corpus += line.split("\t")[1:]
                    with open('corpus', 'w') as wf:
                        for text in corpus:
                            wf.write("%s\n" % text)
                templates = '--input={} --model_prefix={} --vocab_size={} --hard_vocab_limit=false'
                spm.SentencePieceTrainer.Train(
                    templates.format('corpus', os.path.join(path, 'SPM'),
                                     self.vocab_size))
                self.sp.Load(os.path.join(path, 'SPM.model'))

            nsml.save('tokenizer', save_fn=save_and_train_tokenizer)

        elif mode == 'test':

            def load_tokenizer(path, *args, **kwargs):
                self.sp.Load(os.path.join(path, 'SPM.model'))

            nsml.load(checkpoint='tokenizer',
                      load_fn=load_tokenizer,
                      session=NSML_SESSION)
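For reference, the flag template in Example #1 expands to a plain SentencePiece training command. A standalone sketch under the same flags (corpus.txt and the vocab size of 8000 are placeholders, not values from the example):

# Standalone sketch of the SentencePiece call in Example #1; 'corpus.txt'
# is a placeholder file with one sentence per line.
import sentencepiece as spm

spm.SentencePieceTrainer.Train(
    '--input=corpus.txt --model_prefix=SPM --vocab_size=8000 '
    '--hard_vocab_limit=false')

sp = spm.SentencePieceProcessor()
sp.Load('SPM.model')
print(sp.EncodeAsPieces('nsml makes checkpointing easy'))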
Example #2
def test(config):
    NSML_SESSION = 'team_6/19_tcls_qa/80'  # NOTE: needs to be hard-coded
    NSML_CHECKPOINT = '13800'  # NOTE: needs to be hard-coded

    assert NSML_CHECKPOINT is not None, "You must insert the NSML session's checkpoint for submission"
    assert NSML_SESSION is not None, "You must insert the NSML session's name for submission"

    set_global_seed(config.seed_num)

    token_makers = create_by_factory(TokenMakersFactory, config.token)
    tokenizers = token_makers["tokenizers"]
    del token_makers["tokenizers"]

    config.data_reader.tokenizers = tokenizers
    data_reader = create_by_factory(DataReaderFactory, config.data_reader)

    def bind_load_vocabs(config, token_makers):
        CHECKPOINT_FNAME = "checkpoint.bin"

        def load(dir_path):
            checkpoint_path = os.path.join(dir_path, CHECKPOINT_FNAME)
            checkpoint = torch.load(checkpoint_path)

            vocabs = {}
            token_config = config.token
            for token_name in token_config.names:
                token = getattr(token_config, token_name, {})
                vocab_config = getattr(token, "vocab", {})

                texts = checkpoint["vocab_texts"][token_name]
                if not isinstance(vocab_config, dict):
                    vocab_config = vars(vocab_config)
                vocabs[token_name] = Vocab(token_name,
                                           **vocab_config).from_texts(texts)

            for token_name, token_maker in token_makers.items():
                token_maker.set_vocab(vocabs[token_name])
            return token_makers

        nsml.bind(load=load)

    bind_load_vocabs(config, token_makers)
    nsml.load(checkpoint=NSML_CHECKPOINT, session=NSML_SESSION)

    # Raw to Tensor Function
    text_handler = TextHandler(token_makers, lazy_indexing=False)
    raw_to_tensor_fn = text_handler.raw_to_tensor_fn(
        data_reader,
        cuda_device=device,
    )

    # Model & Optimizer
    model = create_model(token_makers, ModelFactory, config.model, device)
    trainer = Trainer(model, metric_key="f1")

    if nsml.IS_ON_NSML:
        bind_nsml(model, trainer=trainer, raw_to_tensor_fn=raw_to_tensor_fn)
        if config.nsml.pause:
            nsml.paused(scope=locals())
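The load hook in Example #2 rebuilds each vocabulary from texts stored inside the torch checkpoint. A minimal sketch of just that extraction step (the 'vocab_texts' key and checkpoint file name follow the example; the project-specific Vocab class is omitted):

# Sketch: pull the per-token vocab texts back out of checkpoint.bin,
# matching the layout assumed in Example #2.
import os
import torch

def read_vocab_texts(dir_path, token_names):
    checkpoint = torch.load(os.path.join(dir_path, 'checkpoint.bin'),
                            map_location='cpu')
    return {name: checkpoint['vocab_texts'][name] for name in token_names}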
Example #3
def _infer(model, root_path, test_loader=None):
    if test_loader is None:
        test_loader = data_loader(root=os.path.join(root_path, 'test_data'),
                                  phase='test')

    ensembles_xy = []
    ensembles_w = []
    for sess, chkp, w in archives:
        nsml.load(checkpoint=chkp, session=sess)

        model.eval()
        outputs = []
        outputs_w = []
        num_data = 0
        for idx, (image, _) in enumerate(test_loader):
            with torch.no_grad():
                locs, scores = model(image.cuda())
                all_images_boxes, all_scores = model.detect_objects(
                    locs, scores)

            for box in all_images_boxes:
                box = box.detach().cpu().numpy()
                box_xy = np.array(
                    [box[0], box[1], box[0] + box[2], box[1] + box[3]],
                    dtype=np.float32)
                outputs.append(box_xy)
            outputs_w.extend(all_scores)
            num_data += len(all_images_boxes)
        ensembles_xy.append(np.array(outputs))
        ensembles_w.append(outputs_w)

    # ensembles_xy = np.mean(ensembles_xy, axis=0)
    ensemble_result = [None] * len(ensembles_xy[0])
    best_w = defaultdict(lambda: 0)
    for xys, ws in zip(ensembles_xy, ensembles_w):
        for i, (xy, w) in enumerate(zip(xys, ws)):
            if best_w[i] > w:
                continue
            ensemble_result[i] = xy
            best_w[i] = w
    ensembles_xy = np.array(ensemble_result)

    print(ensembles_xy.shape)
    assert ensembles_xy.shape[0] == num_data
    assert ensembles_xy.shape[1] == 4

    ensembles = []
    for xy in ensembles_xy:
        box = np.array([xy[0], xy[1], xy[2] - xy[0], xy[3] - xy[1]])
        ensembles.append(box)

    outputs = np.stack(ensembles, axis=0)
    assert outputs.shape[0] == num_data
    assert outputs.shape[1] == 4
    print(outputs.shape)
    return outputs
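Example #3 converts detector boxes from [x, y, w, h] to corner form for the ensemble and back again at the end. Factored out as a small helper sketch:

# Box conversion helpers matching the arithmetic in Example #3:
# [x, y, w, h] <-> [x1, y1, x2, y2].
import numpy as np

def xywh_to_xyxy(box):
    x, y, w, h = box
    return np.array([x, y, x + w, y + h], dtype=np.float32)

def xyxy_to_xywh(box):
    x1, y1, x2, y2 = box
    return np.array([x1, y1, x2 - x1, y2 - y1], dtype=np.float32)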
Example #4
def _infer(model, data):
    start = time.time()
    ################################################################################
    print('test preprocessing start!')
    # # data: [a, b, c,...]
    data_bc = []
    bc_func, _ = preprocess_dict["ben_clahe"]

    for d in data:
        d = cv2.resize(d, (704, 544))
        data_bc.append(bc_func(d))
        # del d

    # del data
    elapsed = time.time() - start
    print('test preprocessing time: %d hours %d minutes %d seconds' %
          (elapsed // 3600, (elapsed % 3600) // 60, (elapsed % 3600) % 60))
    print('test preprocessing ended!')
    del data

    ################################################################################

    # n_ensemble = len(ensemble_checkpoints)
    final = []

    for sess, ckpt, config_path in ensemble_checkpoints:
        config = utils.config.load(config_path)

        model = get_model(config).cuda()
        bind_model(model)

        nsml.load(checkpoint=ckpt, session=sess)

        # data_processed = []
        # _func, _ = preprocess_dict[config.DATA.PREPROCESS]
        # for d in data:
        #     d = cv2.resize(d, (config.DATA.IMG_W, config.DATA.IMG_H))
        #     data_processed.append(_func(d))

        out = run(model, data_bc, config)
        final.append(out)

        del model

    # final = sum(final) / float(n_ensemble)
    final = sum(final)

    final = np.argmax(final, axis=1)
    print(final.shape)
    print(final)

    elapsed = time.time() - start
    print('Total inference time: %d hours %d minutes %d seconds' %
          (elapsed // 3600, (elapsed % 3600) // 60, (elapsed % 3600) % 60))

    return final
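The hours/minutes/seconds formatting appears twice in Example #4; a small helper sketch computing the same three fields:

# Helper matching the elapsed-time formatting in Example #4.
def format_elapsed(seconds):
    return '%d hours %d minutes %d seconds' % (
        seconds // 3600, (seconds % 3600) // 60, (seconds % 3600) % 60)

print(format_elapsed(3725))  # -> 1 hours 2 minutes 5 seconds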
Example #5
def model_infer(image_path, args):

    # fix seed for train reproduction
    seed_everything(args.SEED)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    default_transforms = transforms.Compose([transforms.Resize(args.input_size)])
    test_transforms = get_transform(target_size=(args.input_size, args.input_size),
                                    transform_list=args.valid_augments,
                                    augment_ratio=args.augment_ratio,
                                    is_train=False)

    test_dataset = PathDataset(image_paths=image_path,
                               labels=None,
                               default_transforms=default_transforms,
                               transforms=test_transforms,
                               is_test=True)

    test_loader = DataLoader(dataset=test_dataset, batch_size=args.batch_size,
                             shuffle=False, num_workers=args.num_workers,
                             pin_memory=True)

    total_pred = []

    # ensemble models
    for i, (load_session, load_checkpoint) in enumerate(model_list):
        try:
            nsml.load(checkpoint=load_checkpoint, session=load_session)
            print(f'{i}th model loaded {load_session} {load_checkpoint}')
        except Exception:
            print(f'{i}th model load cancelled')
        
        model.to(device)
        model.eval()

        fold_pred = []
        
        # test time augmentation
        for _ in range(args.tta):
            tta_pred = []
            with torch.no_grad():
                for images in test_loader:
                    output = torch.sigmoid(model(images.to(device))).cpu().detach().numpy()
                    tta_pred.append(output)
            tta_pred = np.concatenate(tta_pred).flatten()
            fold_pred.append(tta_pred) 
        total_pred.append(np.array(fold_pred)) 

    total_pred = np.concatenate(total_pred) 
    total_pred = np.mean(total_pred**args.power, axis=0)

    threshold = 0.5
    total_pred = np.where(np.array(total_pred) >= threshold, 1, 0)
    total_pred = total_pred.astype(np.int64)

    return total_pred
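The final averaging in Example #5 raises each prediction to args.power before taking the mean (a power-mean ensemble), then thresholds at 0.5. The same step on stand-in data:

# Power-mean + threshold from Example #5 on random stand-in predictions;
# `power` stands in for args.power.
import numpy as np

fold_pred = [np.random.rand(10) for _ in range(4)]  # 4 TTA/ensemble passes
power = 0.5

mean_pred = np.mean(np.stack(fold_pred) ** power, axis=0)
labels = (mean_pred >= 0.5).astype(np.int64)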
Example #6
def _infer(root, phase, model, task):
    # root : csv file path
    # change soon
    print('_infer root - : ', root)
    #  - 14s - loss: 0.1703 - acc: 0.7645
    #Epoch 11/100
    #epoch: 10, train_acc: 0.764603060570993
    #model, fixlen_feature_names_global, item = get_xDeepFM()
    #global fixlen_feature_names_global
    model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic = get_item(
        root)
    #bind_nsml(model)
    #bind_nsml(model, [], args.task)
    print('--get item finished---')
    checkpoint_session = ['3', 'team_62/airush2/258']
    nsml.load(checkpoint=str(checkpoint_session[0]),
              session=str(checkpoint_session[1]))
    print('-- model_load completed --')

    s = time.time()
    data_1_article_idxs = item['article_id'].tolist()
    li = []

    for i in range(len(data_1_article_idxs)):
        image_feature = image_feature_dict[id_to_artic[data_1_article_idxs[i]]]
        li.append(image_feature)
    print('----------- sanity check: is this the same image feature? -----------')
    print('article_id : ', '757518f4a3da')
    print('image_feature : ', image_feature_dict['757518f4a3da'])
    print('----------------------------------------------------------------------')

    item['image_feature'] = li
    li = []
    print(f'finished data_1_image_feature : {time.time() - s} sec')
    test_generator = data_generator_test(item)

    # predict
    predicts = model.predict_generator(test_generator,
                                       steps=len(item),
                                       workers=4)
    print(f'y_pred shape : {predicts.shape}')
    print(f'y_pred type : {type(predicts)}')
    print(predicts)
    predicts = predicts.reshape((len(item), ))
    pl = predicts.tolist()
    print(pl[:50])
    print(pl[-50:])
    #print(predicts)
    return predicts
Example #7
    def self_training(self):
        self.data.prepare()
        dataset = 'train'
        img_list, src_dir_list = self.data.get_unlabeled_data(dataset)
        nsml.load(checkpoint='best', session='Ye-Ji-Kim/spam-3/87')

        for idx, img in enumerate(img_list):
            img = np.array([img])
            predict = self.network.predict(img)
            predict = predict[0]
            if np.max(predict) > 0.90:
                if np.argmax(predict) != 0:
                    self.data.save_unlabeled_data(dataset, src_dir_list[idx], np.argmax(predict))
                    print(f'save in {np.argmax(predict)}')
        self.data.show_data_size(dataset)
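The self-training filter in Example #7 keeps only confident, non-background predictions. Isolated as a function (the 0.90 threshold and background class 0 follow the example):

# Pseudo-labeling rule from Example #7: accept a prediction only if its
# max probability exceeds the threshold and the winner is not class 0.
import numpy as np

def select_pseudo_label(probs, threshold=0.90, background=0):
    cls = int(np.argmax(probs))
    if np.max(probs) > threshold and cls != background:
        return cls
    return None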
Example #8
def model_infer(data):

    batch_size = 1
    num_workers = 4
    target_size = (384, 384)

    test_transforms = transforms.Compose([
        transforms.CenterCrop(2000),
        transforms.Resize(target_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    test_dataset = TestDataset(data, test_transforms)
    test_loader = DataLoader(test_dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=num_workers,
                             pin_memory=True)

    load_list = (('team012/KHD2019_FUNDUS/307', 'best_acc_4'),
                 ('team012/KHD2019_FUNDUS/321', 'best_acc_4'))

    last_pred = []
    for load_session, load_checkpoint in load_list:
        try:
            nsml.load(checkpoint=load_checkpoint, session=load_session)
        except Exception:
            print('load cancelled')
        model.to(device)
        model.eval()

        preds = np.zeros((len(test_loader.dataset), args.num_classes))
        with torch.no_grad():
            for i, image in enumerate(test_loader):
                image = image.to(device)
                output = model(image)  # output shape (batch_num, num_classes)

                preds[i * batch_size:(i + 1) *
                      batch_size] = output.detach().cpu().numpy()

        last_pred.append(preds)
    last_pred = np.mean(last_pred, axis=0)
    print(last_pred.shape)
    predictions = np.argmax(last_pred, axis=1)
    print(predictions)
    return predictions
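The core of Example #8 is plain probability averaging across checkpoints. The same step on stand-in arrays of shape (num_samples, num_classes):

# Checkpoint averaging as in Example #8, on random stand-in outputs.
import numpy as np

preds = [np.random.rand(5, 4) for _ in range(2)]  # one array per checkpoint
mean_pred = np.mean(preds, axis=0)
predictions = np.argmax(mean_pred, axis=1)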
Example #9
def train(experiment_name: str = 'v1', pause: bool = False, mode: str = 'train'):
    config = import_module(f'spam.training.experiments.{experiment_name}').config
    model = config['model'](**config['model_kwargs'])
    bind_model(model)
    if pause:
        nsml.paused(scope=locals())    
    if mode == 'train':
        # nsml.load(checkpoint='last_layer_tuning', session='hi-space/spam-2/14')
        # nsml.load(checkpoint='best', session='hi-space/spam-1/147')
        nsml.load(checkpoint='full_tuning_21', session='hi-space/spam-3/3')
        nsml.save('best')
        print('best model saved')
        # exit()
        
        print('-----------')
        print(config)
        print('-----------')
        model.fit(**config['fit_kwargs'])
Example #10
    def infer(test_image_data_path, test_meta_data_path):
        # DONOTCHANGE This Line
        test_meta_data = pd.read_csv(test_meta_data_path, delimiter=',', header=0)
        
        input_size = 224  # you can change this according to your model.
        batch_size = 200  # you can change this; 'nsml submit --test' runs on only 200 samples.
        device = 0
        
        we = 0.25
        ensemble = [['team_62/airush1/320', '02'], ['team_62/airush1/320', '12'],
                    ['team_62/airush1/320', '22'], ['team_62/airush1/320', '32']]
        #ensemble = [['team_62/airush1/415', '03'],['team_62/airush1/415','13'],['team_62/airush1/415','23'],['team_62/airush1/415','33']]
        predict_list = []
        for i in range(len(ensemble)):

            dataloader = DataLoader(
                AIRushDataset(
                    test_image_data_path, test_meta_data, label_path=None,
                    transform=transforms.Compose([
                        transforms.Resize((input_size, input_size)),
                        transforms.RandomRotation(20),
                        transforms.ToTensor(),
                        transforms.Normalize([0.485, 0.456, 0.406],
                                             [0.229, 0.224, 0.225])
                    ])),
                batch_size=batch_size,
                shuffle=False,
                num_workers=4,
                pin_memory=True)

            # Let's do ensemble!!!
            nsml.load(checkpoint=str(ensemble[i][1]), session=str(ensemble[i][0]))

            # model load
            model_nsml.to(device)
            model_nsml.eval()
            predict_output_list = [] 
            with torch.no_grad():
                for batch_idx, image in enumerate(dataloader):
                    image = image.to(device)
                    output = model_nsml(image).double()
                    output_prob = to_np(F.softmax(output, dim=1))
                    predict_output_list.append(output_prob * we)
            predict_output_list = np.concatenate(predict_output_list, axis=0)
            predict_list.append(predict_output_list)

        predict_vector = np.argmax(np.sum(predict_list, axis=0), axis=1)

        return predict_vector  # this return type should be a numpy array with shape (138343,)
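A side note on Example #10: because every member uses the same weight we = 0.25, the scaling cannot change the argmax; the weight only matters once members get distinct values. A quick check on stand-in data:

# Uniform weighting (we = 0.25 in Example #10) leaves the argmax unchanged.
import numpy as np

probs = [np.random.rand(6, 3) for _ in range(4)]
weighted = np.argmax(np.sum([p * 0.25 for p in probs], axis=0), axis=1)
unweighted = np.argmax(np.sum(probs, axis=0), axis=1)
assert (weighted == unweighted).all()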
Example #11
def main():
    seed_everything()

    config = utils.config.load(ensemble_checkpoints[0][2])
    model = get_model(config).cuda()
    bind_model(model)

    args = get_args()
    if args.pause:  ## when in test mode
        print('Inferring Start...')
        nsml.paused(scope=locals())

    if args.mode == 'train':  ### when in training mode
        print('Training Start...')

        nsml.load(session=ensemble_checkpoints[0][0],
                  checkpoint=ensemble_checkpoints[0][1])
        nsml.save(0)
        exit()
Example #12
def main():
    seed_everything()

    pprint.pprint(config, indent=2)

    model = get_model(config).cuda()
    bind_model(model)

    args = get_args()
    if args.pause:  ## when in test mode
        print('Inferring Start...')
        nsml.paused(scope=locals())

    if args.mode == 'train':  ### when in training mode
        print('Training Start...')

        nsml.load(checkpoint='18', session='team146/KHD2019_FUNDUS/20')
        nsml.save(0)
        exit()
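Examples #9, #11, and #12 all use the same trick: load a checkpoint from an earlier session, then immediately re-save it into the current session so it can be submitted. As a minimal sketch (session/checkpoint names are placeholders; nsml.load and nsml.save follow the usage in those examples):

# Re-export a checkpoint from another session into the current one.
import nsml

def reexport(src_session, src_checkpoint, new_name='best'):
    nsml.load(checkpoint=src_checkpoint, session=src_session)
    nsml.save(new_name)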
Example #13
    def fit(self, epochs_finetune, epochs_full, batch_size, debug=False):
        sessionName = 'qkek984/spam-3/59'
        nsml.load(checkpoint='best', session=sessionName)
        print(sessionName, "model loaded!")
        #nsml.save(checkpoint='saved')
        #exit()
        self.debug = debug
        self.data.prepare(unlabeledset=True)
        print("lenunlabeled : ", self.data.lenUnlabeled('unlabeled'))  # check unlabeldata

        self.network.compile(
            loss=self.loss(),
            optimizer=self.optimizer('full'),
            metrics=self.fit_metrics()
        )

        val_gen = self.data.ST_val_gen(batch_size)

        self.myMetrics(val_gen=val_gen, batch_size=batch_size)  # do self training

        return self.data.base_dir
Example #14
def _infer(model, root_path, test_loader=None):
    if test_loader is None:
        test_loader = data_loader(root=os.path.join(root_path, 'test_data'),
                                  phase='test')

    res_fcs = []
    for sess, chkp, w in archives:
        nsml.load(checkpoint=chkp, session=sess)

        model.eval()
        res_fc = None
        res_id = None
        for idx, (data_id, image, _) in enumerate(tqdm(test_loader)):
            image = image.cuda()
            with torch.no_grad():
                fc = model(image)
            fc = fc.detach().cpu().numpy()
            fc = np_softmax(fc)

            # with torch.no_grad():
            #     fc2 = model(torch.flip(image, (3, )))       # TTA : horizontal flip
            # fc2 = fc2.detach().cpu().numpy()
            # fc2 = np_softmax(fc2)
            # fc = fc + fc2

            if C.get()['infer_mode'] == 'face':
                fc[:, range(60)] = -1
                # target_lb = list(range(60, 100))

            if idx == 0:
                res_fc = fc
                res_id = data_id
            else:
                res_fc = np.concatenate((res_fc, fc), axis=0)
                res_id = res_id + data_id
        res_fcs.append(res_fc * w)

    res_cls = np.argmax(np.sum(res_fcs, axis=0), axis=1)

    return [res_id, res_cls]
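np_softmax in Example #14 is not shown; a standard NumPy implementation it plausibly corresponds to (an assumption: row-wise softmax with max-subtraction for numerical stability):

# Plausible stand-in for the np_softmax used in Example #14.
import numpy as np

def np_softmax(x, axis=1):
    x = x - np.max(x, axis=axis, keepdims=True)
    e = np.exp(x)
    return e / np.sum(e, axis=axis, keepdims=True)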
Example #15
    def load_finetuned(self):
        for model, sess_ in zip(self.models, self.session):
            print(sess_)
            if model:
                bind_model(model)
                nsml.load(checkpoint='best', session=sess_)
                model = model.cuda()

        if self.mode == 'xgb':
            for model in self.models:
                if model:
                    for name, param in model.named_parameters():
                        param.requires_grad = False
        else:
            for model in self.models:
                for name, param in model.named_parameters():
                    if 'fc' in name and self.mode == 'soft':
                        param.requires_grad = True
                    else:
                        param.requires_grad = False

        print("Pretrained weight loaded! ")
        bind_ensemble_model(self)
Example #16
    # opt = optimizers.SGD(lr=learning_rate, momentum=0.9, nesterov=True)
    # opt = optimizers.adamax(lr=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['categorical_accuracy'])

    bind_model(model)
    if config.pause:  ## when in test mode
        print('Inferring Start...')
        nsml.paused(scope=locals())

    if config.mode == 'train':  ### when in training mode
        print('Training Start...')

        img_path = DATASET_PATH + '/train/'

        if config.load_model:
            nsml.load(checkpoint=config.load_model_ckpt, session=config.load_model)

        if nb_epoch == 0:
            nsml.save("zero")
            exit()

        if config.load_from:
            # Load From Saved Session
            data = {}
            def nsml_load(dir_path, **kwargs):
                images = np.load(os.path.join(dir_path, 'data_x.npy'))
                labels = np.load(os.path.join(dir_path, 'data_y.npy'))
                data['x'] = images
                data['y'] = labels
                print("Data Loaded!!!")
            nsml.load(checkpoint='data', load_fn=nsml_load, session=config.load_from)
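The nsml_load hook in Example #16 reads arrays that a matching save hook must have written earlier. A sketch of what that save side could look like (a hypothetical counterpart, not shown in the source):

# Hypothetical save counterpart to the nsml_load hook in Example #16,
# writing the same data_x.npy / data_y.npy files it reads back.
import os
import numpy as np

def nsml_save(dir_path, **kwargs):
    np.save(os.path.join(dir_path, 'data_x.npy'), data['x'])
    np.save(os.path.join(dir_path, 'data_y.npy'), data['y'])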
Example #17
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .json files for the task. "
             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help="The input training file. If a data dir is specified, will look for the file there. "
             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="The input evaluation file. If a data dir is specified, will look for the file there. "
             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )

    parser.add_argument(
        "--version_2_with_negative",
        action="store_true",
        help="If true, the SQuAD examples contain some that do not have an answer.",
    )
    parser.add_argument(
        "--null_score_diff_threshold",
        type=float,
        default=0.0,
        help="If null_score - best_non_null is greater than the threshold predict null.",
    )

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
             "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum number of tokens for the question. Questions longer than this will "
             "be truncated to this length.",
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", default=True,
        action="store_true", help="Run evaluation during training at each logging step."
    )
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=24, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help="The maximum length of an answer that can be generated. This is needed because the start "
             "and end predictions are not conditioned on one another.",
    )
    parser.add_argument(
        "--verbose_logging",
        action="store_true",
        help="If true, all of the warnings related to data processing will be printed. "
             "A number of warnings are expected for a normal SQuAD evaluation.",
    )

    parser.add_argument("--logging_steps", type=int, default=100, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=1000, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
             "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")

    parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    if (
            os.path.exists(args.output_dir)
            and os.listdir(args.output_dir)
            and args.do_train
            and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log'
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v3-discriminator")
    model_SC = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v3-discriminator")
    model_QA = ElectraForQuestionAnswering.from_pretrained("monologg/koelectra-small-v3-discriminator")


    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model_SC.to(args.device)
    model_QA.to(args.device)

    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model_SC, model_QA, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################
    
    # Before loading, save models using 'run_save_model.py' to gather models from separate sessions.
    nsml.load(checkpoint='saved', session="kaist006/korquad-open-ldbd3/160")
    nsml.save('best_model')

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is
    # set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    if args.do_eval:
        result = evaluate(args, model_SC, model_QA, tokenizer)
        _f1, _exact = result["f1"], result["exact"]
        print('f1: {}, exact: {}'.format(_f1, _exact))
Example #18
    # model setting  ## must be loaded at exactly this point

    model1 = build_xception()
    model2 = build_xception()
    model3 = build_xception2()

    # Loss and optimizer
    '''
    model.compile(tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=['accuracy', recall, precision, f1, sp, ntv, custom])
    '''

    ############ DONOTCHANGE ###############
    bind_model(model1)
    nsml.load(checkpoint='19', session='KHD032/Breast_Pathology/392')
    bind_model(model2)
    nsml.load(checkpoint='29', session='KHD032/Breast_Pathology/336')
    bind_model(model3)
    nsml.load(checkpoint='62', session='KHD032/Breast_Pathology/223')

    alpha = 1.2

    input_ = tf.keras.Input(shape=(299, 299, 3))
    m1 = model1(input_)
    m2 = model2(input_)
    m3 = model3(input_)
    m3_out = tf.keras.layers.concatenate([1 - m3, m3 + alpha])
    #m3_out = tf.keras.layers.concatenate([1 - m3, m3])
    out = tf.keras.layers.add([m1, m2, m3_out])
    #out = tf.keras.layers.add([m1, m2, m3])
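Example #18 turns model3's single sigmoid output into a two-class vector with concatenate([1 - m3, m3 + alpha]) so it can be added to the two-way outputs of model1 and model2, with alpha biasing the positive class. The expansion in isolation:

# The binary-to-2-class expansion from Example #18 on stand-in outputs.
import tensorflow as tf

alpha = 1.2
m3 = tf.constant([[0.9], [0.2]])  # stand-in sigmoid outputs, shape (2, 1)
m3_out = tf.keras.layers.concatenate([1 - m3, m3 + alpha])
print(m3_out.numpy())  # shape (2, 2): [[0.1, 2.1], [0.8, 1.4]]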
Example #19
        img_list = pickle.load(img_f)
    with open(output_path[1], 'rb') as label_f:
        label_list = pickle.load(label_f)

    mean_arr = None  # np.zeros(input_shape)
    #for img in img_list:
    #    mean_arr += img.astype('float32')
    #mean_arr /= len(img_list)
    #print('mean shape:',mean_arr.shape, 'mean mean:',mean_arr.mean(), 'mean max:',mean_arr.max())
    #mean_arr /= 255
    #np.save('./mean.npy', mean_arr)

    if config.pause:
        nsml.paused(scope=locals())

    bTrainmode = False
    if config.mode == 'train':
        bTrainmode = True
        #nsml.load(checkpoint='86', session='Zonber/ir_ph1_v2/204') #Nasnet Large 222
        nsml.load(checkpoint='0',
                  session='Zonber/ir_ph2/222')  #InceptionResnetV2 222
        print('start converting model')
        intermediate_layer_model = Model(
            inputs=model.input[0],
            outputs=model.get_layer('triplet_loss_layer').input[0])
        model_r = reduce_keras_model(intermediate_layer_model)
        model_r.summary()
        print('convert complete: reduced model')
        bind_model(model_r)
        print('binding reduced model complete')
        nsml.save(0)  # this is the model name displayed on the leaderboard
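Example #19 cuts the trained network down to an embedding model by taking the first input of the triplet-loss layer as the new output. The extraction step as a sketch (the layer name follows the example; reduce_keras_model is the external optimization utility and is not redefined here):

# Extract the sub-model feeding the triplet loss layer, as in Example #19.
from tensorflow.keras.models import Model

def embedding_submodel(model, layer_name='triplet_loss_layer'):
    return Model(inputs=model.input[0],
                 outputs=model.get_layer(layer_name).input[0])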
Example #20
    def infer(test_image_data_path, test_meta_data_path):
        # DONOTCHANGE This Line
        test_meta_data = pd.read_csv(test_meta_data_path,
                                     delimiter=',',
                                     header=0)

        device = 0

        models = args.models.split(",")
        model_weights = [float(w) for w in args.model_weights.split(",")]
        nsml_sessions = args.nsml_sessionss.split(",")
        nsml_checkpoints = args.nsml_checkpoints.split(",")
        loss_types = args.loss_types.split(",")

        transform_random_crop = args.transform_random_crop.split(",")
        transform_random_sized_crop = args.transform_random_sized_crop.split(
            ",")
        transform_norm = args.transform_norm.split(",")
        infer_transform_center_crop = args.infer_transform_center_crop.split(
            ",")

        total_output_probs = None
        for i, model_name in enumerate(models):
            batch_size = batch_size_map[model_name] // 2

            infer_transform_list = []

            if infer_transform_center_crop[i] == "True":
                infer_transform_list.append(transforms.Resize((248, 248)))
                infer_transform_list.append(
                    transforms.CenterCrop((args.input_size, args.input_size)))
                infer_transform_list.append(transforms.ToTensor())
                if transform_norm[i] == "True":
                    infer_transform_list.append(
                        transforms.Normalize(
                            [0.44097832, 0.44847423, 0.42528335],
                            [0.25748107, 0.26744914, 0.30532702]))
            else:
                if transform_random_crop[i] == "True":
                    infer_transform_list.append(transforms.Resize((256, 256)))
                    infer_transform_list.append(
                        transforms.CenterCrop(
                            (args.input_size, args.input_size)))
                elif transform_random_sized_crop[i] == "True":
                    infer_transform_list.append(transforms.Resize((256, 256)))
                    infer_transform_list.append(
                        transforms.CenterCrop(
                            (args.input_size, args.input_size)))
                else:
                    infer_transform_list.append(
                        transforms.Resize((args.input_size, args.input_size)))
                infer_transform_list.append(transforms.ToTensor())
                if transform_norm[i] == "True":
                    infer_transform_list.append(
                        transforms.Normalize(
                            [0.44097832, 0.44847423, 0.42528335],
                            [0.25748107, 0.26744914, 0.30532702]))

            print("transform", infer_transform_list)

            dataloader = DataLoader(
                AIRushDataset(
                    test_image_data_path,
                    test_meta_data,
                    label_path=None,
                    transform=transforms.Compose(infer_transform_list)
                ),  #[transforms.Resize((args.input_size, args.input_size)), transforms.ToTensor()])),
                batch_size=batch_size,
                shuffle=False,
                num_workers=0,
                pin_memory=True)

            if model_name == "Resnet18":
                model = Resnet18(args.output_size)
            elif model_name == "Resnet152":
                model = Resnet152(args.output_size)
            elif model_name == "baseline":
                model = Baseline(args.hidden_size, args.output_size)
            elif model_name.split("-")[0] == "efficientnet":
                model = EfficientNet.from_pretrained(model_name,
                                                     args.output_size)
            else:
                raise Exception("model type is invalid : " + model_name)

            model.to(device)

            def load_fn(dir_name):
                save_state_path = os.path.join(dir_name, 'state_dict.pkl')
                state = torch.load(save_state_path)
                model.load_state_dict(state['model'])
                print("model loaded", dir_name)

            model.eval()

            nsml.load(checkpoint=nsml_checkpoints[i],
                      load_fn=load_fn,
                      session="team_13/airush1/" + nsml_sessions[i])

            output_probs = None
            for batch_idx, image in enumerate(dataloader):
                image = image.to(device)
                output = model(image).double()

                if loss_types[i] == "cross_entropy":
                    output_prob = F.softmax(output, dim=1)
                else:
                    output_prob = torch.sigmoid(output)

                if output_probs is None:
                    output_probs = to_np(output_prob)
                else:
                    output_probs = np.concatenate(
                        [output_probs, to_np(output_prob)], axis=0)
            if total_output_probs is None:
                total_output_probs = output_probs * model_weights[i]
            else:
                total_output_probs += (output_probs * model_weights[i])

        predict = np.argmax(total_output_probs, axis=1)

        return predict  # this return type should be a numpy array with shape (138343,)
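Example #20 picks the output activation per model according to the loss it was trained with. Isolated as a helper sketch:

# Loss-dependent activation from Example #20: softmax for cross-entropy
# heads, element-wise sigmoid otherwise.
import torch
import torch.nn.functional as F

def to_probs(logits, loss_type):
    if loss_type == 'cross_entropy':
        return F.softmax(logits, dim=1)
    return torch.sigmoid(logits)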
Example #21
        default='0',
        help=
        'Set to the checkpoint given to the fork command; if no checkpoint option is given, the model from the last wall time is used.'
    )
    args.add_argument('--pause',
                      type=int,
                      default=0,
                      help='Set to 1 when loading the model.')
    config = args.parse_args()

    # base model architecture
    base_model = "vgg16"
    model = util.select_base_model(base_model)
    # new architecture code here
    model.summary()

    # bind model
    bind_model(model)

    if config.pause:
        nsml.paused(scope=locals())

    if config.mode == 'train':
        bTrainmode = True

        # load weights
        nsml.load(checkpoint=base_model,
                  session=util.model_name2session(base_model))
        nsml.save('saved')
        exit()
Example #22
def main():

    parser = argparse.ArgumentParser()
    # Required parameters, we defined additional arguments for experiment
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name",
    )
    parser.add_argument(
        "--load_cache",
        action="store_true",
        help="load data from cached session",
    )
    parser.add_argument(
        "--save_cache",
        action="store_true",
        help="save loaded dataset into cache"
    )
    parser.add_argument(
        "--cached_session_pretrain",
        default="",
        type=str,
        help="Path to cache where 'Span-Pretraining' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_pretrain_qa",
        default="",
        type=str,
        help="Path to cache where 'QA-Pretraining' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_train",
        default="",
        type=str,
        help="Path to cache where given 'training' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_dev",
        default="",
        type=str,
        help="Path to cache where given 'development set' is stored",
    )
    parser.add_argument(
        "--load_model",
        action="store_true",
        help="use pretrained model from previous sessions",
    )   
    parser.add_argument(
        "--load_model_session",
        default="",
        type=str,
        help="Path to pre-trained model",
    )
    parser.add_argument(
        "--load_model_checkpoint",
        default="",
        type=str,
        help="Path to pre-trained model",
    )    
    parser.add_argument(
        "--just_for_save",
        action="store_true",
        help="save checkpoint and terminate immediately",
    )
    parser.add_argument(
        "--freeze_embedding",
        action="store_true",
        help="finetuning just classification layer",
    ) 
    parser.add_argument(
        "--mix_qa",
        action="store_true",
        help="mix qa set for variance",
    )
    parser.add_argument(
        "--mix_portion",
        type=float,
        default=0.5,
        help="defines portion of qa pairs to be reconstructed"
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .json files for the task. "
             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help="The input training file. If a data dir is specified, will look for the file there. "
             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="The input evaluation file. If a data dir is specified, will look for the file there. "
             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )

    parser.add_argument(
        "--version_2_with_negative",
        action="store_true",
        help="If true, the SQuAD examples contain some that do not have an answer.",
    )
    parser.add_argument(
        "--null_score_diff_threshold",
        type=float,
        default=0.0,
        help="If null_score - best_non_null is greater than the threshold predict null.",
    )

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
             "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum number of tokens for the question. Questions longer than this will "
             "be truncated to this length.",
    )
    parser.add_argument("--do_pretrain_span", action="store_true", help="Whether to run span-pretraining.")
    parser.add_argument("--do_pretrain_qa", action="store_true", help="Whether to run qa-pretraining.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", default=True,
        action="store_true", help="Run evaluation during training at each logging step."
    )
    parser.add_argument("--do_initial_validation", action="store_true", help="Whether to run initial validation")
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help="The maximum length of an answer that can be generated. This is needed because the start "
             "and end predictions are not conditioned on one another.",
    )
    parser.add_argument(
        "--verbose_logging",
        action="store_true",
        help="If true, all of the warnings related to data processing will be printed. "
             "A number of warnings are expected for a normal SQuAD evaluation.",
    )

    parser.add_argument("--logging_steps", type=int, default=100, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=1000, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
             "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")

    parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    if (
            os.path.exists(args.output_dir)
            and os.listdir(args.output_dir)
            and args.do_train
            and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log'
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()

    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")
    # tokenizer.add_special_tokens({"additional_special_tokens" : ["[QUES]"]})
    # print("vocabsize: {}".format(tokenizer.vocab_size))
    # print("example")
    # print(tokenizer.tokenize("[CLS] 한국어 ELECTRA를 공유합니다. [SEP]"))
    model = ElectraForQuestionAnswering.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is
    # set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")


    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        # NOTE: `optimizer` must already exist in this scope; in the original
        # HF script, amp.initialize runs only after the optimizer is created.
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 0:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    model.to(args.device)


    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################

    logger.info("Training/evaluation parameters %s", args) 




    # bind_nsml(model, tokenizer, args)

    if args.load_model:
        tmp_args = parser.parse_args()
        nsml.copy(args, tmp_args)
        nsml.load(checkpoint=args.load_model_checkpoint, session=args.load_model_session)
        nsml.copy(tmp_args, args)
    
    if args.just_for_save:
        nsml.save("test")
        return

    # initial validation
    if args.do_initial_validation:
        logger.info("Initinal Validation start")
        result = evaluate(args, model, tokenizer, prefix="")
        _f1, _exact = result["f1"], result["exact"]

        logger.info("f1_val = {}, exact_val = {}".format(_f1, _exact))
        if IS_ON_NSML:
            nsml.report(summary=True, step=0, f1=_f1, exact=_exact)

    # 'Span' Pretraining
    if args.do_pretrain_span:
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False, is_pretrain=True, qa_style=False)
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span")

    # 'QA' Pretraining
    if args.do_pretrain_qa:
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False, is_pretrain=True, qa_style=True)
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span+qa")

    # Training
    if args.do_train:
        if args.freeze_embedding:
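            # despite the flag name, this freezes the whole ELECTRA encoder
            # (model.module assumes the DataParallel wrapper applied above)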
            for param in model.module.electra.parameters():
                param.requires_grad = False
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False)
        t = time.time() - t
        logger.info("loading train data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
Example #23
0
def main():

    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(
        description='Speech hackathon lilililill model')
    parser.add_argument(
        '--max_epochs',
        type=int,
        default=1000,
        help='number of max epochs in training (default: 1000)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')

    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-03,
                        help='learning rate (default: 0.001)')
    parser.add_argument('--num_mels',
                        type=int,
                        default=80,
                        help='number of the mel bands (default: 80)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='batch size in training (default: 128)')
    parser.add_argument("--num_thread",
                        type=int,
                        default=4,
                        help='number of the loading thread (default: 4)')
    parser.add_argument('--num_hidden_enc',
                        type=int,
                        default=1024,
                        help='hidden size of model (default: 1024)')
    parser.add_argument('--num_hidden_dec',
                        type=int,
                        default=512,
                        help='hidden size of model decoder (default: 512)')
    parser.add_argument(
        '--nsc_in_ms',
        type=int,
        default=50,
        help='Number of sample size per time segment in ms (default: 50)')

    parser.add_argument(
        '--ref_repeat',
        type=int,
        default=1,
        help='Number of repetition of reference seq2seq (default: 1)')
    parser.add_argument('--loss_lim',
                        type=float,
                        default=0.05,
                        help='Minimum loss threshold (default: 0.05)')

    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument('--memo',
                        type=str,
                        default='',
                        help='Comment you wish to leave')
    parser.add_argument('--debug',
                        type=str,
                        default='False',
                        help='debug mode')

    parser.add_argument('--load', type=str, default=None)

    args = parser.parse_args()

    batch_size = args.batch_size
    num_thread = args.num_thread
    num_mels = args.num_mels

    char2index, index2char = load_label('./hackathon.labels')
    SOS_token = char2index['<s>']  # '<sos>' or '<s>'
    EOS_token = char2index['</s>']  # '<eos>' or '</s>'
    PAD_token = char2index['_']  # '-' or '_'

    unicode_jamo_list = My_Unicode_Jamo_v2()
    # logger.info(''.join(unicode_jamo_list))

    # logger.info('This is a new main2.py')

    tokenizer = Tokenizer(unicode_jamo_list)
    jamo_tokens = tokenizer.word2num(unicode_jamo_list)
    # logger.info('Tokens: {}'.format(jamo_tokens))

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    net = Mel2SeqNet_v2(num_mels, args.num_hidden_enc, args.num_hidden_dec,
                        len(unicode_jamo_list), device)
    net_optimizer = optim.Adam(net.parameters(), lr=args.lr)
    ctc_loss = nn.CTCLoss().to(device)

    # net_B = Seq2SeqNet(512, jamo_tokens, char2index, device) #########
    net_B = Seq2SeqNet_v2(1024, jamo_tokens, char2index, device)  #########
    net_B_optimizer = optim.Adam(net_B.parameters(), lr=args.lr)  #########
    net_B_criterion = nn.NLLLoss(reduction='none').to(device)  #########

    bind_model(net, net_B, net_optimizer, net_B_optimizer, index2char,
               tokenizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    if args.load is not None:
        # nsml.load(checkpoint='saved', session='team47/sr-hack-2019-dataset/' + args.load)
        nsml.load(checkpoint='model',
                  session='team47/sr-hack-2019-dataset/' + args.load)
        nsml.save('saved')

    for g in net_optimizer.param_groups:
        g['lr'] = 1e-06

    for g in net_B_optimizer.param_groups:
        g['lr'] = 1e-06

    for g in net_optimizer.param_groups:
        logger.info(g['lr'])

    for g in net_B_optimizer.param_groups:
        logger.info(g['lr'])

    wav_paths, script_paths, korean_script_paths = get_paths(DATASET_PATH)
    logger.info('Korean script path 0: {}'.format(korean_script_paths[0]))

    logger.info('wav_paths len: {}'.format(len(wav_paths)))
    logger.info('script_paths len: {}'.format(len(script_paths)))
    logger.info('korean_script_paths len: {}'.format(len(korean_script_paths)))

    # Load Korean Scripts

    korean_script_list, jamo_script_list = get_korean_and_jamo_list_v2(
        korean_script_paths)

    logger.info('Korean script 0: {}'.format(korean_script_list[0]))
    logger.info('Korean script 0 length: {}'.format(len(
        korean_script_list[0])))
    logger.info('Jamo script 0: {}'.format(jamo_script_list[0]))
    logger.info('Jamo script 0 length: {}'.format(len(jamo_script_list[0])))

    script_path_list = get_script_list(script_paths, SOS_token, EOS_token)

    ground_truth_list = [
        (tokenizer.word2num(['<s>'] + list(jamo_script_list[i]) + ['</s>']))
        for i in range(len(jamo_script_list))
    ]
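    # each target is the jamo sequence wrapped in <s> ... </s> sentinels and
    # mapped to token ids via the tokenizer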

    # 95% of the data will be used for training
    split_index = int(0.95 * len(wav_paths))

    wav_path_list_train = wav_paths[:split_index]
    ground_truth_list_train = ground_truth_list[:split_index]
    korean_script_list_train = korean_script_list[:split_index]
    script_path_list_train = script_path_list[:split_index]

    wav_path_list_eval = wav_paths[split_index:]
    ground_truth_list_eval = ground_truth_list[split_index:]
    korean_script_list_eval = korean_script_list[split_index:]
    script_path_list_eval = script_path_list[split_index:]

    logger.info('Total:Train:Eval = {}:{}:{}'.format(len(wav_paths),
                                                     len(wav_path_list_train),
                                                     len(wav_path_list_eval)))

    preloader_eval = Threading_Batched_Preloader_v2(wav_path_list_eval,
                                                    ground_truth_list_eval,
                                                    script_path_list_eval,
                                                    korean_script_list_eval,
                                                    batch_size,
                                                    num_mels,
                                                    args.nsc_in_ms,
                                                    is_train=False)
    preloader_train = Threading_Batched_Preloader_v2(wav_path_list_train,
                                                     ground_truth_list_train,
                                                     script_path_list_train,
                                                     korean_script_list_train,
                                                     batch_size,
                                                     num_mels,
                                                     args.nsc_in_ms,
                                                     is_train=True)

    best_loss = 1e10
    best_eval_cer = 1e10

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    logger.info('start')

    train_begin = time.time()

    for epoch in range(args.max_epochs):

        logger.info((datetime.now().strftime('%m-%d %H:%M:%S')))

        net.train()
        net_B.train()

        preloader_train.initialize_batch(num_thread)
        loss_list_train = list()
        seq2seq_loss_list_train = list()
        seq2seq_loss_list_train_ref = list()

        logger.info("Initialized Training Preloader")
        count = 0

        total_dist = 0
        total_length = 1
        total_dist_ref = 0
        total_length_ref = 1

        while not preloader_train.end_flag:
            batch = preloader_train.get_batch()
            # logger.info(psutil.virtual_memory())
            # logger.info("Got Batch")
            if batch is not None:
                # logger.info("Training Batch is not None")
                tensor_input, ground_truth, loss_mask, length_list, batched_num_script, batched_num_script_loss_mask = batch
                pred_tensor, loss = train(net, net_optimizer, ctc_loss,
                                          tensor_input.to(device),
                                          ground_truth.to(device),
                                          length_list.to(device), device)
                loss_list_train.append(loss)

                ####################################################

                jamo_result = Decode_Prediction_No_Filtering(
                    pred_tensor, tokenizer)

                true_string_list = Decode_Num_Script(
                    batched_num_script.detach().cpu().numpy(), index2char)

                for i in range(args.ref_repeat):
                    lev_input_ref = ground_truth

                    lev_pred_ref, attentions_ref, seq2seq_loss_ref = net_B.net_train(
                        lev_input_ref.to(device),
                        batched_num_script.to(device),
                        batched_num_script_loss_mask.to(device),
                        net_B_optimizer, net_B_criterion)

                pred_string_list_ref = Decode_Lev_Prediction(
                    lev_pred_ref, index2char)
                seq2seq_loss_list_train_ref.append(seq2seq_loss_ref)
                dist_ref, length_ref = char_distance_list(
                    true_string_list, pred_string_list_ref)

                pred_string_list = [None]

                dist = 0
                length = 0

                if (loss < args.loss_lim):
                    lev_input = Decode_CTC_Prediction_And_Batch(pred_tensor)
                    lev_pred, attentions, seq2seq_loss = net_B.net_train(
                        lev_input.to(device), batched_num_script.to(device),
                        batched_num_script_loss_mask.to(device),
                        net_B_optimizer, net_B_criterion)
                    pred_string_list = Decode_Lev_Prediction(
                        lev_pred, index2char)
                    seq2seq_loss_list_train.append(seq2seq_loss)
                    dist, length = char_distance_list(true_string_list,
                                                      pred_string_list)

                total_dist_ref += dist_ref
                total_length_ref += length_ref

                total_dist += dist
                total_length += length

                count += 1

                if count % 25 == 0:
                    logger.info("Train: Count {} | {} => {}".format(
                        count, true_string_list[0], pred_string_list_ref[0]))

                    logger.info("Train: Count {} | {} => {} => {}".format(
                        count, true_string_list[0], jamo_result[0],
                        pred_string_list[0]))

            else:
                logger.info("Training Batch is None")

        # del preloader_train

        # logger.info(loss_list_train)
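        # CER is accumulated over the epoch: total edit distance divided by the
        # total reference length (total_length starts at 1 to avoid divide-by-zero)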
        train_loss = np.mean(np.asarray(loss_list_train))
        train_cer = total_dist / total_length
        train_cer_ref = total_dist_ref / total_length_ref

        logger.info("Mean Train Loss: {}".format(train_loss))
        logger.info("Total Train CER: {}".format(train_cer))
        logger.info("Total Train Reference CER: {}".format(train_cer_ref))

        preloader_eval.initialize_batch(num_thread)
        loss_list_eval = list()
        seq2seq_loss_list_eval = list()
        seq2seq_loss_list_eval_ref = list()

        logger.info("Initialized Evaluation Preloader")

        count = 0
        total_dist = 0
        total_length = 1
        total_dist_ref = 0
        total_length_ref = 1

        net.eval()
        net_B.eval()

        while not preloader_eval.end_flag:
            batch = preloader_eval.get_batch()
            if batch is not None:
                tensor_input, ground_truth, loss_mask, length_list, batched_num_script, batched_num_script_loss_mask = batch
                pred_tensor, loss = evaluate(net, ctc_loss,
                                             tensor_input.to(device),
                                             ground_truth.to(device),
                                             length_list.to(device), device)
                loss_list_eval.append(loss)

                ####################

                jamo_result = Decode_Prediction_No_Filtering(
                    pred_tensor, tokenizer)

                true_string_list = Decode_Num_Script(
                    batched_num_script.detach().cpu().numpy(), index2char)

                lev_input_ref = ground_truth
                lev_pred_ref, attentions_ref, seq2seq_loss_ref = net_B.net_eval(
                    lev_input_ref.to(device), batched_num_script.to(device),
                    batched_num_script_loss_mask.to(device), net_B_criterion)

                pred_string_list_ref = Decode_Lev_Prediction(
                    lev_pred_ref, index2char)
                seq2seq_loss_list_eval_ref.append(seq2seq_loss_ref)
                dist_ref, length_ref = char_distance_list(
                    true_string_list, pred_string_list_ref)

                lev_input = Decode_CTC_Prediction_And_Batch(pred_tensor)
                lev_pred, attentions, seq2seq_loss = net_B.net_eval(
                    lev_input.to(device), batched_num_script.to(device),
                    batched_num_script_loss_mask.to(device), net_B_criterion)
                pred_string_list = Decode_Lev_Prediction(lev_pred, index2char)
                seq2seq_loss_list_eval.append(seq2seq_loss)
                dist, length = char_distance_list(true_string_list,
                                                  pred_string_list)

                total_dist_ref += dist_ref
                total_length_ref += length_ref

                total_dist += dist
                total_length += length

                count += 1

                ####################

                if count % 10 == 0:
                    logger.info("Eval: Count {} | {} => {}".format(
                        count, true_string_list[0], pred_string_list_ref[0]))

                    logger.info("Eval: Count {} | {} => {} => {}".format(
                        count, true_string_list[0], jamo_result[0],
                        pred_string_list[0]))

            else:
                logger.info("Training Batch is None")

        eval_cer = total_dist / total_length
        eval_cer_ref = total_dist_ref / total_length_ref
        eval_loss = np.mean(np.asarray(loss_list_eval))

        logger.info("Mean Evaluation Loss: {}".format(eval_loss))
        logger.info("Total Evaluation CER: {}".format(eval_cer))
        logger.info("Total Evaluation Reference CER: {}".format(eval_cer_ref))

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    train_epoch__cer_ref=train_cer_ref,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer,
                    eval__cer_ref=eval_cer_ref)

        nsml.save(args.save_name)
        best_model = (eval_cer < best_eval_cer)
        if best_model:
            nsml.save('best')
            best_eval_cer = eval_cer

        logger.info("Inference Check")
def load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False, val_or_test="val", is_pretrain=False, qa_style=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset,
        # and the others will use the cache.
        torch.distributed.barrier()

    cached_features_file = "cache_{}".format("dev" if evaluate else "train")
    # decide whether QA-pair mixing augmentation should be applied
    do_mix = (args.mix_qa and not evaluate) and (is_pretrain and val_or_test == "val")
    # load from cache if it is possible
    if val_or_test=="val" and args.load_cache:
        cached_session = args.cached_session_dev if evaluate else args.cached_session_train
        if is_pretrain:
            cached_session = args.cached_session_pretrain
            if qa_style:
                cached_session = args.cached_session_pretrain_qa
        logger.info("Loading features from cached file %s in %s", cached_features_file, cached_session)

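        # NSML-as-cache trick: temporarily bind a custom load function so that
        # nsml.load pulls the cached features dict from the checkpoint instead
        # of model weights; bind_nsml below restores the normal model binding.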
        features_and_datasets = {}
        def load_data(dir_name):
            tmp = torch.load(os.path.join(dir_name, '{}.pt'.format(cached_features_file)))
            print(tmp.keys())
            nsml.copy(tmp, features_and_datasets)
        
        nsml.bind(load=load_data)
        nsml.load(checkpoint=cached_features_file, session=cached_session)
        bind_nsml(model, tokenizer, args)
        print(features_and_datasets.keys())
        features, dataset, examples = (
            features_and_datasets["features"],
            features_and_datasets["dataset"],
            features_and_datasets["examples"],
        )          

    else:
        logger.info("Creating features from dataset file at %s", cached_features_file)

        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
                logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                filename = args.predict_file if val_or_test == "val" else "test_data/korquad_open_test.json"
                examples = processor.get_eval_examples(args.data_dir, filename=filename)
            else:
                if is_pretrain:
                    examples = processor.get_pretrain_examples(args.data_dir, filename=args.train_file, qa_style=qa_style)
                else:
                    examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
    # apply mixing
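    # (augmentation: random example pairs swap contexts; if their answer texts
    # differ, both become unanswerable, otherwise the answer position is carried
    # over to the swapped context)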
    if do_mix:
        num_qa = len(examples)
        mix_batch_size = int(args.mix_portion * num_qa)
        if mix_batch_size % 2 == 1:
            mix_batch_size -= 1
        mix_batch = np.array(random.sample(range(num_qa), mix_batch_size)).reshape(-1, 2)
        for k, v in mix_batch:
            example_k, example_v = examples[k], examples[v]
            ans_k, ans_v = example_k.answer_text, example_v.answer_text
            example_k.context_text, example_v.context_text = example_v.context_text, example_k.context_text
            assert not (example_k.is_impossible or example_v.is_impossible)
            if ans_k != ans_v:
                example_k.is_impossible, example_v.is_impossible = True, True
                example_k.start_position_character, example_v.start_position_character = None, None
            else:
                example_k.start_position, example_v.start_position = example_v.start_position, example_k.start_position
                example_k.end_position, example_v.end_position = example_v.end_position, example_k.end_position
    if do_mix or not (val_or_test=="val" and args.load_cache):
        print("Starting squad_convert_examples_to_features")
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        print("Complete squad_convert_examples_to_features")

    # make cache in the session if it is required
    if val_or_test=="val" and args.save_cache:
        features_and_datasets = {"dataset": dataset, "examples": examples, "features": features}

        def save_data(dir_name):
            os.makedirs(dir_name, exist_ok=True)
            torch.save(features_and_datasets, os.path.join(dir_name, '{}.pt'.format(cached_features_file)))
            logger.info("Save data at {}".format(dir_name))

        nsml.bind(save=save_data)
        nsml.save(cached_features_file)
        bind_nsml(model, tokenizer, args)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset,
        # and the others will use the cache.
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
Example #25
0
def main(args, local):
    
    if args.arch == 'xDeepFM' and args.mode == 'train':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                    dtype={
                        'article_id': str,
                        'hh': int, 'gender': str,
                        'age_range': str,
                        'read_article_ids': str
                    }, sep='\t')
        label_data_path = os.path.join(DATASET_PATH, 'train',
                                os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path,
                    dtype={'label': int},
                    sep='\t')
        item['label'] = label

        sparse_features = ['article_id', 'hh', 'gender', 'age_range', 'len_bin']
        dense_features = ['image_feature']
        target = ['label']

        # length of each user's read-article history (missing history counts as 0)
        len_lis = []
        for li in item['read_article_ids'].tolist():
            if isinstance(li, float):  # missing history comes through as NaN
                len_lis.append(0)
            else:
                len_lis.append(len(li.split(',')))

        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
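        # pd.qcut buckets the history length into (up to) 6 quantile bins so the
        # length can be used as a categorical (sparse) feature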
    
        id_to_artic = dict()
        artics = item['article_id'].tolist()
        
        with open(os.path.join(DATASET_PATH, 'train', 'train_data', 'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique()) for feat in sparse_features]
        fixlen_feature_columns += [DenseFeat(feat,len(image_feature_dict[artics[0]])) for feat in dense_features]
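        # DeepCTR-style columns: SparseFeat for categorical ids (with their
        # cardinality), DenseFeat sized to the image feature vector dimension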
        
        
        
        idx_artics_all = item['article_id'].tolist()
        
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
        
        # we can fetch features later via image_feature_dict[article_id], so skip building them here
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns  
        fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task= 'binary')
        print('---model defined---')
        # TODO: also save the intermediate files built here, since this prep cannot be rerun every time
        print(time.time() - s, 'seconds')


    if use_nsml and args.mode == 'train':
        bind_nsml(model, [], args.task)
    
    if args.mode == 'test':
        print('_infer root - : ', DATASET_PATH)
        print('test')
        model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic = get_item(DATASET_PATH)
        bind_nsml(model, [], args.task)
        checkpoint_session = ['401', 'team_62/airush2/176']
        nsml.load(checkpoint=str(checkpoint_session[0]), session=str(checkpoint_session[1]))
        print('successfully loaded')

    if (args.mode == 'train'):
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # build everything up front; no real need for a generator here

        nsml.save('infer')
        print('end')
    print('end_main')

    if args.pause:
        nsml.paused(scope=local)
Example #26
0
    if mode != 'train':
        sys.exit(0)

    # if C.get()['infer_mode'] == 'face':
    #     targets_only = []
    #     lbs = CustomDataset(TRAIN_DATASET_PATH).targets
    #     for lb_id in range(num_classes):
    #         if lbs.count(lb_id) > 150:
    #             continue
    #         targets_only.append(lb_id)
    #     print(targets_only)

    if config.transfer:
        # nsml.load(checkpoint='transfer', session='team_286/4_cls_food/89')
        nsml.load(checkpoint='100', session='team_286/4_cls_food/103')  # cv=1 cutmix 0.5
        # nsml.load(checkpoint='55', session='team_286/7_icls_face/2')
        # nsml.load(checkpoint='transfer', session='team_286/8_iret_food/12')
        # nsml.load(checkpoint='20', session='team_286/9_iret_car/16')
        nsml.save('resave')
        sys.exit(0)

    tr_loader, val_loader, val_label = data_loader_with_split(root=TRAIN_DATASET_PATH, cv_ratio=config.ratio, cv=config.cv, batch_size=C.get()['batch'])
    time_ = datetime.datetime.now()
    best_val_top1 = 0

    dataiter = iter(tr_loader)
    num_steps = 100000 // C.get()['batch']

    from pystopwatch2 import PyStopwatch
Example #27
0
                max_to_keep=cf.keep_checkpoint_max)
            saver_for_restore.restore(sess, checkpoint_path)
    saver = tf.train.Saver(tf.global_variables(),
                           max_to_keep=cf.keep_checkpoint_max)
    num_trained_images = 0

    bind_model(saver, sess, images_ph, embeddings_op, cf)

    if cf.pause:
        nsml.paused(scope=locals())

    bTrainmode = False
    if cf.mode == 'train':
        bTrainmode = True
        if cf.nsml_checkpoint is not None and cf.nsml_session is not None:
            nsml.load(checkpoint=cf.nsml_checkpoint, session=cf.nsml_session)
        while True:
            try:
                start = time.time()

                if cf.use_pair_sampling:
                    print("pair sampling")
                    tmp_images, tmp_labels = sess.run([images, labels])
                    pair_indices = set()
                    single_index_map = {}
                    label_buffer = {}
                    for i, tmp_label in enumerate(tmp_labels):
                        if tmp_label in label_buffer:
                            pair_indices.add(i)
                            pair_indices.add(label_buffer[tmp_label])
                            if tmp_label in single_index_map:
Example #28
0
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')

    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0001,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=WORD_MAXLEN,
                        help='maximum characters of sentence (default: WORD_MAXLEN)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument(
        '--word',
        action='store_true',
        help='Train/Predict model using word based label (default: False)')
    parser.add_argument('--gen_label_index',
                        action='store_true',
                        help='Generate word label index map(default: False)')
    parser.add_argument('--iteration', type=str, help='Iteration')
    parser.add_argument('--premodel_session',
                        type=str,
                        help='Session name of premodel')

    # transformer model parameter
    parser.add_argument('--d_model',
                        type=int,
                        default=128,
                        help='transformer_d_model')
    parser.add_argument('--n_head',
                        type=int,
                        default=8,
                        help='transformer_n_head')
    parser.add_argument('--num_encoder_layers',
                        type=int,
                        default=4,
                        help='num_encoder_layers')
    parser.add_argument('--num_decoder_layers',
                        type=int,
                        default=4,
                        help='transformer_num_decoder_layers')
    parser.add_argument('--dim_feedforward',
                        type=int,
                        default=2048,
                        help='transformer_dim_feedforward')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.1,
                        help='transformer_dropout')

    # transformer warmup parameter
    parser.add_argument('--warmup_multiplier',
                        type=int,
                        default=3,
                        help='transformer_warmup_multiplier')
    parser.add_argument('--warmup_epoch',
                        type=int,
                        default=10,
                        help='transformer_warmup_epoch')

    args = parser.parse_args()
    char_loader = CharLabelLoader()
    char_loader.load_char2index('./hackathon.labels')
    label_loader = char_loader
    if args.word:
        if args.gen_label_index:
            generate_word_label_index_file(char_loader, TRAIN_LABEL_CHAR_PATH)
            from subprocess import call
            call(f'cat {TRAIN_LABEL_CHAR_PATH}', shell=True)
        # load the word-level label index map
        word_loader = CharLabelLoader()
        word_loader.load_char2index('./hackathon.pos.labels')
        label_loader = word_loader
        if os.path.exists(TRAIN_LABEL_CHAR_PATH):
            generate_word_label_file(char_loader, word_loader,
                                     TRAIN_LABEL_POS_PATH,
                                     TRAIN_LABEL_CHAR_PATH)
    char2index = label_loader.char2index
    index2char = label_loader.index2char
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    ############ model
    print("model: transformer")
    # model = Transformer(d_model= args.d_model, n_head= args.n_head, num_encoder_layers= args.num_encoder_layers, num_decoder_layers= args.num_decoder_layers,
    #                     dim_feedforward= args.dim_feedforward, dropout= args.dropout, vocab_size= len(char2index), sound_maxlen= SOUND_MAXLEN, word_maxlen= WORD_MAXLEN)

    encoder = Encoder(d_input=128,
                      n_layers=6,
                      n_head=4,
                      d_k=128,
                      d_v=128,
                      d_model=128,
                      d_inner=2048,
                      dropout=0.1,
                      pe_maxlen=SOUND_MAXLEN)
    decoder = Decoder(sos_id=SOS_token,
                      eos_id=EOS_token,
                      n_tgt_vocab=len(char2index),
                      d_word_vec=128,
                      n_layers=6,
                      n_head=4,
                      d_k=128,
                      d_v=128,
                      d_model=128,
                      d_inner=2048,
                      dropout=0.1,
                      tgt_emb_prj_weight_sharing=True,
                      pe_maxlen=SOUND_MAXLEN)
    model = Transformer(encoder, decoder)

    optimizer = TransformerOptimizer(
        torch.optim.Adam(model.parameters(),
                         lr=0.0004,
                         betas=(0.9, 0.98),
                         eps=1e-09))
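    # TransformerOptimizer presumably applies the warmup / inverse-square-root
    # learning-rate schedule from "Attention Is All You Need" on top of Adam
    # (its definition is not shown in this snippet)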

    ############/

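    # classic seq2seq-style initialization: every weight uniform in [-0.08, 0.08]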
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)
    """
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)

    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.max_epochs)
    scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=args.warmup_multiplier, total_epoch=args.warmup_epoch, after_scheduler=scheduler_cosine)
    
    
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)
    """

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"

            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    # target_path = os.path.join(DATASET_PATH, 'train_label')
    target_path = TRAIN_LABEL_CHAR_PATH
    if args.word:
        target_path = TRAIN_LABEL_POS_PATH
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    if args.iteration:
        if args.premodel_session:
            nsml.load(args.iteration, session=args.premodel_session)
            logger.info(f'Load {args.premodel_session} {args.iteration}')
        else:
            nsml.load(args.iteration)
            logger.info(f'Load {args.iteration}')
    logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        # learning rate scheduler

        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      optimizer, device, train_begin,
                                      args.workers, 10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        print("~~~~~~~~~~~~")

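        # validate only at epoch 10 and then every 10th epoch after epoch 48,
        # to keep evaluation overhead low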
        if epoch == 10 or (epoch > 48 and epoch % 10 == 9):
            valid_queue = queue.Queue(args.workers * 2)
            valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                          args.batch_size, 0)
            valid_loader.start()

            eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                           device, args.max_len,
                                           args.batch_size)
            logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                        (epoch, eval_loss, eval_cer))

            valid_loader.join()

            nsml.report(False,
                        step=epoch,
                        train_epoch__loss=train_loss,
                        train_epoch__cer=train_cer,
                        eval__loss=eval_loss,
                        eval__cer=eval_cer)

            best_model = (eval_loss < best_loss)
            nsml.save(args.save_name)

            if best_model:
                nsml.save('best')
                best_loss = eval_loss
Example #29
0
        #summary(model,input_size=(3,224,224))
    else:
        model = Baseline(args.hidden_size, args.output_size)
    optimizer = optim.Adam(model.parameters(), args.learning_rate)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                        patience=1,
                                                        verbose=True)
    criterion = nn.CrossEntropyLoss()  #multi-class classification task

    model = model.to(device)
    model.train()

    # DONOTCHANGE: They are reserved for nsml
    bind_model(model)
    # load a pretrained checkpoint from an earlier NSML session, then re-save it under this session
    nsml.load(checkpoint='15', session='team_62/airush1/40')
    nsml.save('stillgoing')

    if args.pause:
        nsml.paused(scope=locals())
    if args.mode == "train":
        # Warning: Do not load data before this line
        dataloader = train_dataloader(args.input_size, args.batch_size,
                                      args.num_workers)
        for epoch_idx in range(1, args.epochs + 1):
            total_loss = 0
            total_correct = 0
            for batch_idx, (image, tags) in enumerate(dataloader):
                optimizer.zero_grad()
                image = image.to(device)  #torch.Size([64, 3, 224, 224])
Example #30
0
    def infer(test_image_data_path, test_meta_data_path):
        # DONOTCHANGE This Line
        test_meta_data = pd.read_csv(test_meta_data_path,
                                     delimiter=',',
                                     header=0)
        # dropout ratio
        ensemble0 = [['team_62/airush1/320', '02'],
                     ['team_62/airush1/320', '12'],
                     ['team_62/airush1/320', '22'],
                     ['team_62/airush1/98', '4']]  # effi
        ensemble1 = [['team_62/airush1/415', '03'],
                     ['team_62/airush1/415', '13'],
                     ['team_62/airush1/415', '23'],
                     ['team_62/airush1/415', '33']]  # effi
        ensemble2 = [['team_62/airush1/678', '02'],
                     ['team_62/airush1/678', '12'],
                     ['team_62/airush1/185',
                      '17']]  #[['team_62/airush1/185','17']] # resnet50
        ensemble3 = [['team_62/airush1/683', '02'],
                     ['team_62/airush1/683',
                      '12']]  # oct ['team_62/airush1/409','18']
        #ensemble4 = [['team_62/airush1/605','8']]  # SKNet; if used, be sure to drop Normalize from the transforms
        input_size = 224  # you can change this according to your model.
        batch_size = 512  # you can change this. But when you use 'nsml submit --test' for test infer, there are only 200 number of data.
        device = 0

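        # soft-voting weights: each ensemble group contributes about equally
        # (4 * 0.125 = 0.5, 3 * 0.166 ~= 0.5, 2 * 0.25 = 0.5)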
        w0 = 0.125
        w2 = 0.166
        w3 = 0.25

        predict_list = []
        for i in range(4):  # number of ensemble groups (the i == 3 iteration matches no branch below)
            #print('i th inference')

            dataloader = DataLoader(AIRushDataset(
                test_image_data_path,
                test_meta_data,
                label_path=None,
                transform=transforms.Compose([
                    transforms.Resize((input_size, input_size)),
                    transforms.RandomRotation(20),
                    transforms.ToTensor(),
                    transforms.Normalize([0.485, 0.456, 0.406],
                                         [0.229, 0.224, 0.225])
                ])),
                                    batch_size=batch_size,
                                    shuffle=False,
                                    num_workers=4,
                                    pin_memory=True)
            # decide whether to drop this after checking the 9:10 results
            # Let's do ensemble!!!
            if (i == 0):
                # 'efficientNet_b0 : ensemble 4 - fold'
                for j in range(4):
                    model_name = 'efficientnet-b0'
                    model = EfficientNet.from_name(model_name)
                    bind_model(model)
                    nsml.load(checkpoint=str(ensemble0[j][1]),
                              session=str(ensemble0[j][0]))
                    model.to(device)
                    model.eval()
                    predict_output_list = []
                    with torch.no_grad():
                        for batch_idx, image in enumerate(dataloader):
                            image = image.to(device)
                            output = model(image).double()
                            output_prob = to_np(F.softmax(output, dim=1))
                            predict_output_list.append(output_prob * w0)
                    predict_output_list = np.concatenate(predict_output_list,
                                                         axis=0)
                    predict_list.append(predict_output_list)
            elif (i == 1):
                # resnext50: 3 models
                for j in range(3):
                    model = resnext50(
                        num_classes=args.output_size)  # adjust to match the model
                    bind_model(model)
                    nsml.load(checkpoint=str(ensemble2[j][1]),
                              session=str(ensemble2[j][0]))  # adjust to match the model
                    model.to(device)
                    model.eval()
                    predict_output_list = []
                    with torch.no_grad():
                        for batch_idx, image in enumerate(dataloader):
                            image = image.to(device)
                            output = model(image).double()
                            output_prob = to_np(F.softmax(output, dim=1))
                            #print(output_prob)
                            predict_output_list.append(output_prob * w2)
                    predict_output_list = np.concatenate(predict_output_list,
                                                         axis=0)
                    predict_list.append(predict_output_list)
                    #print('resnet model')
            elif (i == 2):
                # OctResNet: 2 models
                for j in range(2):
                    model = OctResNet(
                        Bottleneck, [3, 4, 6, 3],
                        num_classes=args.output_size)  # adjust to match the model
                    bind_model(model)
                    nsml.load(checkpoint=str(ensemble3[j][1]),
                              session=str(ensemble3[j][0]))  # adjust to match the model
                    model.to(device)
                    model.eval()
                    predict_output_list = []
                    with torch.no_grad():
                        for batch_idx, image in enumerate(dataloader):
                            image = image.to(device)
                            output = model(image).double()
                            output_prob = to_np(F.softmax(output, dim=1))
                            #print(output_prob)
                            predict_output_list.append(output_prob * w3)  # weight for this group
                    predict_output_list = np.concatenate(predict_output_list,
                                                         axis=0)
                    predict_list.append(predict_output_list)
                    #print('resnet model')

            # add further ensembles here

            # finally, add the SENet model

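        # weighted soft voting: sum the weighted class probabilities across all
        # models, then take the argmax per example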
        predict_vector = np.argmax(np.sum(predict_list, axis=0), axis=1)

        return predict_vector  # the return value must be a numpy array of shape (138343,)