Example #1
    mean_square_error = tf.reduce_mean(tf.square(output_score - y_))
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(
        mean_square_error)
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    # DONOTCHANGE: Reserved for nsml use
    bind_model(sess=sess, config=config)

    # DONOTCHANGE: They are reserved for nsml
    if config.pause:
        nsml.paused(scope=locals())

    if config.mode == 'train':
        # Load the data.
        dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)
        dataset_len = len(dataset)

        if config.validate > 0:

            validate_len = int(dataset_len * config.validate)

            ds_review = np.array([np.array(review) for review, _ in dataset])
            ds_label = np.array([int(label) for _, label in dataset])

            global_perm = np.random.RandomState(
                seed=777).permutation(dataset_len)
            ds_review, ds_label = ds_review[global_perm], ds_label[global_perm]

            dataset = MovieReviewDataset(remake=True,
                                         new_review=ds_review[validate_len:],
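Example #1 builds its validation split by hand: it shuffles the whole dataset once with a fixed seed and then slices off a validation head. A minimal standalone sketch of that pattern follows; `seeded_split` and its arguments are illustrative names, not part of the original code.

import numpy as np

def seeded_split(reviews, labels, validate_ratio, seed=777):
    """Shuffle once with a fixed seed, then slice off a validation head."""
    perm = np.random.RandomState(seed).permutation(len(reviews))
    reviews, labels = reviews[perm], labels[perm]
    n_val = int(len(reviews) * validate_ratio)
    # Head of the permutation becomes validation data, the tail training data.
    return (reviews[n_val:], labels[n_val:]), (reviews[:n_val], labels[:n_val])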
Example #2
    model = get_model(config)
    model.summary()

    # DONOTCHANGE: Reserved for nsml use
    print("nsml binding...")
    bind_model(model, config)

    # DONOTCHANGE: They are reserved for nsml
    if config.pause:
        nsml.paused(scope=locals())

    # Used in training mode. (default)
    if config.mode == 'train':
        # Load the data.
        print("data loading...")
        dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)

        x_pre = np.array(dataset.reviews_pre)
        x_post = np.array(dataset.reviews_post)
        y = np.array(dataset.labels)

        # Train for each epoch.
        nsml_callback = Nsml_Callback()
        #         checkpoint = ModelCheckpoint('./best.hdf5', monitor='val_loss', save_best_only=True, mode='min', period=1)
        #         dataset_val = MovieReviewDataset_val(DATASET_PATH, config.strmaxlen)
        #         x_val = np.array(dataset_val.reviews)
        #         y_val = np.array(dataset_val.labels)
        print("model training...")
        hist = model.fit(
            [x_pre, x_post],
            [y, y, y, y, y, y, y, y, y, y, y],  # the same target, repeated once per model output
Example #3
    args.add_argument('--strmaxlen', type=int, default=200)
    args.add_argument('--embedding', type=int, default=8)
    config = args.parse_args()

    learning_rate = 1e-3
    grad_clip = True
    dropout = 0.2
    n_char = 4500
    n_embed = 256
    n_hidden = 256

    if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
        DATASET_PATH = '../sample_data/movie_review/'

    if config.mode == 'train':
        dataset = MovieReviewDataset(DATASET_PATH, flip=True)

    else:
        dataset = MovieReviewDataset('', build=False, flip=False)

    model = Movie(n_char, n_embed, n_hidden, dropout)
    model_run = Movie(n_char, n_embed, n_hidden, dropout)
    if GPU_NUM:
        model = model.cuda()
        model_run.cuda()
    accumulate(model_run, model, 0)

    # DONOTCHANGE: Reserved for nsml use
    bind_model(model_run, dataset, config)

    criterion = nn.MSELoss()
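The `accumulate(model_run, model, 0)` call above keeps `model_run` as a weight-averaged copy of `model`, but the helper itself is not shown in this fragment. A common implementation of that pattern is sketched below; this is an assumption, not necessarily the example's own code.

def accumulate(model_ema, model, decay=0.999):
    # Blend model's parameters into model_ema as an exponential moving average.
    # With decay=0, as in the call above, this simply copies the weights.
    ema_params = dict(model_ema.named_parameters())
    for name, param in model.named_parameters():
        ema_params[name].data.mul_(decay).add_(param.data, alpha=1 - decay)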
Example #4
    # DONOTCHANGE: Reserved for nsml use
    bind_model(model, config)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)

    # DONOTCHANGE: They are reserved for nsml
    if config.pause:
        nsml.paused(scope=locals())


    # Used in training mode. (default)
    if config.mode == 'train':
        # Load the data.
        t0 = time.time()
        dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen, max_size=config.max_dataset)
        print("dataset loaded %.2f s" % (time.time() - t0))
        pin_memory = USE_GPU > 0
        if config.no_eval:
            train_loader = DataLoader(dataset=dataset, batch_size=config.batch,
                                      shuffle=True, collate_fn=collate_fn,
                                      num_workers=2, pin_memory=pin_memory)
            eval_loader = []
        else:
            train_sampler, eval_sampler = dataset.get_sampler()
            train_loader = DataLoader(dataset=dataset, batch_size=config.batch,
                                      sampler=train_sampler, collate_fn=collate_fn,
                                      num_workers=2, pin_memory=pin_memory)
            eval_loader = DataLoader(dataset=dataset, batch_size=config.batch,
                                      sampler=eval_sampler, collate_fn=collate_fn,
                                      num_workers=2, pin_memory=pin_memory)
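The fragment relies on `dataset.get_sampler()` to produce disjoint train and eval samplers, but that method is not shown. One plausible implementation, assuming a simple random index split and written here as a free function, looks like this:

import numpy as np
from torch.utils.data import SubsetRandomSampler

def get_sampler(dataset, eval_ratio=0.1, seed=0):
    # Split the dataset indices once, then hand each half to its own sampler.
    indices = np.random.RandomState(seed).permutation(len(dataset))
    n_eval = int(len(dataset) * eval_ratio)
    return SubsetRandomSampler(indices[n_eval:]), SubsetRandomSampler(indices[:n_eval])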
Example #5
    lgb_model = object  # placeholder; presumably replaced by a trained LightGBM booster later
    
    models = (model, lgb_model, vect_word, vect_char)
    # DONOTCHANGE: Reserved for nsml use
    print("nsml binding...")
    bind_model(models, config)

    # DONOTCHANGE: They are reserved for nsml
    if config.pause:
        nsml.paused(scope=locals())

    # Used in training mode. (default)
    if config.mode == 'train':
        # Load the data.
        print("data loading...")
        dataset = MovieReviewDataset(DATASET_PATH)
#         X_trn, X_val, Y_trn, Y_val= trn_val_seperation(dataset, 144570)
        X_trn, X_val, Y_trn, Y_val = trn_val_seperation(dataset, 3)
        
        # Fit the vectorizers
        vect_word, vect_char = vect_fit(X_trn, vect_word, vect_char)
        
        # Vectorize the text
        X_trn = vect_transform(X_trn, vect_word, vect_char)
        X_val = vect_transform(X_val, vect_word, vect_char)
        
        # Construct the datasets
        train_data = lgb.Dataset(X_trn, Y_trn)
        valid_data = lgb.Dataset(X_val, Y_val, reference=train_data)
        gc.collect()
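`vect_fit` and `vect_transform` are not defined in the fragment; given the `vect_word`/`vect_char` names, a plausible reading is a word-level plus character-level TF-IDF pipeline whose outputs are concatenated, roughly as follows (an assumption, not the original helpers):

from scipy.sparse import hstack

def vect_fit(texts, vect_word, vect_char):
    # Fit both vectorizers (e.g. sklearn TfidfVectorizer instances) on the training texts.
    vect_word.fit(texts)
    vect_char.fit(texts)
    return vect_word, vect_char

def vect_transform(texts, vect_word, vect_char):
    # Stack word-level and character-level features side by side.
    return hstack([vect_word.transform(texts), vect_char.transform(texts)]).tocsr()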
        
Example #6
            word_embed: we,
            sent_len: sl,
            chars: cs,
            word_len: wl,
            sylls: ss,
            y_: label,
            is_training: train
        }

    # DONOTCHANGE: Reserved for nsml
    if config.pause:
        nsml.paused(scope=locals())

    if config.mode == 'train':
        # Load the data.
        dataset = MovieReviewDataset(DATASET_PATH, max_sent_len, max_word_len,
                                     max_syll_num)
        dataset_len = len(dataset)
        one_batch_size = dataset_len // config.batch
        if dataset_len % config.batch != 0:
            one_batch_size += 1

        if config.debug:
            debugset = MovieReviewDataset(DEBUG3_PATH, max_sent_len,
                                          max_word_len, max_syll_num)
            debugset_len = len(debugset)
            one_debug_size = debugset_len // config.batch
            if debugset_len % config.batch != 0:
                one_debug_size += 1

        train_step = 0
        best_ema = 99999.0
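The batch-count arithmetic above (floor division plus a bump for the remainder) is plain ceiling division; using the snippet's own names, the same value, presumably the number of batches per epoch, can be computed in one line:

import math
one_batch_size = math.ceil(dataset_len / config.batch)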
Example #7
    # DONOTCHANGE: They are reserved for nsml
    if config.pause:
        nsml.paused(scope=locals())

    ### Training mode
    # Used in training mode. (default)
    if config.mode == 'train':
        # Load the data.
        if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
            DATASET_PATH = '../sample_data/movie_review/'
        corpus = DP.Corpus(DATASET_PATH, total_train)
        print('[*]', 'Load corpus')

        # Load training data
        train_dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen,
                                           True, corpus)
        print('[*]', 'Load train dataset')
        train_loader = DataLoader(dataset=train_dataset,
                                  batch_size=config.batch,
                                  shuffle=True,
                                  collate_fn=collate_fn,
                                  num_workers=1)
        total_train = len(train_loader)

        # Load validation data
        test_dataset = MovieReviewDataset(DATASET_PATH,
                                          config.strmaxlen,
                                          True,
                                          corpus,
                                          test=True)
        print('[*]', 'Load test dataset')
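Both loaders above pass a `collate_fn` that is not shown in the fragment. For variable-length review sequences, a typical collate function pads each batch to its longest sequence, along these lines (illustrative only; the example's actual `collate_fn` may differ):

import numpy as np
import torch

def collate_fn(batch):
    # batch is a list of (review, label) pairs with reviews of varying length.
    reviews, labels = zip(*batch)
    max_len = max(len(r) for r in reviews)
    padded = np.zeros((len(reviews), max_len), dtype=np.int64)
    for i, r in enumerate(reviews):
        padded[i, :len(r)] = r
    return torch.from_numpy(padded), torch.tensor(labels, dtype=torch.float32)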
Example #8
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    # DONOTCHANGE: Reserved for nsml
    bind_model(sess=sess, config=config)

    def get_feed_dict(w, c, s, y, train=False):
        return {wx: w, cx_: c, sx_: s, y_: y, is_training: train}

    # DONOTCHANGE: Reserved for nsml
    if config.pause:
        nsml.paused(scope=locals())

    if config.mode == 'train':
        # Load the data.
        dataset = MovieReviewDataset(DATASET_PATH, max_word_num, max_char_num,
                                     max_syll_num)
        dataset_len = len(dataset)
        one_batch_size = dataset_len // config.batch
        if dataset_len % config.batch != 0:
            one_batch_size += 1

        if config.debug:
            debugset = MovieReviewDataset(DEBUG3_PATH, max_word_num,
                                          max_char_num, max_syll_num)
            debugset_len = len(debugset)
            one_debug_size = debugset_len // config.batch
            if debugset_len % config.batch != 0:
                one_debug_size += 1

        train_step = 0
        best_ema = 99999.0
Example #9
    # Select model
    args.add_argument('--model', type=str, default='SentpieceModel', choices=['SentpieceModel', 'regression', 'classification', 'bilstmwithattn', 'cnntext'])
    config = args.parse_args()

    print('HAS_DATASET :', HAS_DATASET)
    print('IS_ON_NSML :', IS_ON_NSML)
    print('DATASET_PATH :', DATASET_PATH)
    print(config)

    sp = []
    wp_vocab = []
    preprcess_infer = {}
    if config.mode == 'train':
        sp, wp_vocab = build_vocab(config.mode, DATASET_PATH, vocab_size=config.vocab_size)
        dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen,
                                     max_word_len=config.max_words_len, max_wp_len=config.max_wp_len, n_class=11)
        vocab_size = len(dataset.i2wp)
    else:
        vocab_size = 19488 #19475

    model_type = {
        'SentpieceModel': SentpieceModel(vocab_size,
                                           char_emb_size=config.char_embedding,
                                           word_emb_size=config.word_embedding,
                                           hidden_size=config.hidden_dim,
                                           max_wp_len=config.max_wp_len,
                                           max_words_len=config.max_words_len)
    }

    models = [
        SentpieceModel(vocab_size,
Example #10
    for m in range(num_models):
        models.append(Model(sess, "model" + str(m), config))

    tf.global_variables_initializer().run()

    # DONOTCHANGE: Reserved for nsml
    bind_model(sess=sess, config=config, model=models)

    # DONOTCHANGE: Reserved for nsml
    if config.pause:
        nsml.paused(scope=locals())

    if config.mode == 'train':
        # Load the data.
        dataset = MovieReviewDataset(DATASET_PATH,
                                     config.strmaxlen,
                                     is_train=True)
        dataset_len = len(dataset)
        one_batch_size = dataset_len // config.batch
        if dataset_len % config.batch != 0:
            one_batch_size += 1

        # Train for each epoch.
        for epoch in range(config.epochs):
            #avg_loss = 0.0
            avg_cost_list = np.zeros(len(models))
            for i, (data,
                    labels) in enumerate(_batch_loader(dataset, config.batch)):
                labels = np.reshape(labels, (-1, 1))
                onehot_label = sess.run(
                    tf.reshape(tf.one_hot(labels, depth=11, dtype=tf.float32),
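The loop above re-creates a `tf.one_hot` op and runs it through `sess.run` on every batch, which keeps adding nodes to the graph. A NumPy equivalent avoids that; this is a suggested alternative (assuming integer labels in 0..10), not part of the original code:

import numpy as np

# labels has shape (batch, 1) after the reshape above; this produces (batch, 11).
onehot_label = np.eye(11, dtype=np.float32)[labels.reshape(-1)]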
Example #11
    model = CNNReg(config.vocasize, config.embedding, config.maxlen, GPU_NUM)
    if GPU_NUM: model = model.cuda()

    # DONOTCHANGE: Reserved for nsml use
    bind_model(model, config)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=config.lr)

    # DONOTCHANGE: They are reserved for nsml
    if config.pause: nsml.paused(scope=locals())

    # Used in training mode. (default)
    if config.mode == 'train':
        dataset = MovieReviewDataset(DATASET_PATH, config.vocasize,
                                     config.minlen, config.maxlen)
        train_loader = DataLoader(dataset=dataset,
                                  batch_size=config.batch,
                                  shuffle=True,
                                  collate_fn=lambda data: zip(*data),
                                  num_workers=2)
        total_batch = len(train_loader)
        # Train for each epoch.
        for epoch in range(config.epochs):
            avg_loss = 0.0
            for i, (data, labels) in enumerate(train_loader):
                predictions = model(data)

                labels = Variable(torch.from_numpy(np.array(labels)))
                if GPU_NUM: labels = labels.cuda()
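                # A generic continuation of such a training step (illustrative sketch,
                # not shown in this fragment): compute the MSE loss, backpropagate,
                # and update the parameters.
                loss = criterion(predictions.squeeze(-1), labels.float())
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item()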