Beispiel #1
0
def init_test_data():
    """Initialize the database and populate it with the test dataset.

    Uses the module-level ``data_loader`` helpers; returns nothing.
    """
    connection = data_loader.get_connection()
    data_loader.init_database(connection)
    data_loader.load_test_data(connection)
Beispiel #2
0
    def test(self):
        """Restore the latest checkpoint and run the image-saving test pass."""
        print("Testing the results")

        self.inputs = data_loader.load_test_data(
            self._dataset_name, False, self._do_flipping)

        self.model_test_setup()
        checkpoint_saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()

        t0 = time.time()

        with tf.Session() as sess:
            sess.run(init_op)

            # Restore the most recent checkpoint for this run.
            latest_checkpoint = tf.train.latest_checkpoint(self._checkpoint_dir)
            checkpoint_saver.restore(sess, latest_checkpoint)

            # TF1 input-queue plumbing: start the queue runner threads.
            coordinator = tf.train.Coordinator()
            runner_threads = tf.train.start_queue_runners(coord=coordinator)

            self._num_imgs_to_save = cyclegan_datasets.DATASET_TO_SIZES[
                self._dataset_name]
            self.save_images(sess, 0)

            coordinator.request_stop()
            coordinator.join(runner_threads)
            print("rate", (time.time() - t0))
Beispiel #3
0
    def _read(self, file_path: str) -> Iterator[Instance]:
        """Yield ``Instance`` objects for the requested split.

        ``file_path`` is used as a split selector ('train' or 'test'),
        not as an actual filesystem path.
        """
        # Idiom fix: compare to None with `is`, not `==`.
        if self.augmentation is None:
            train_loader_fn = data_loader.load_train_data
        elif self.augmentation == 'EDA':
            train_loader_fn = data_loader.load_train_data_with_EDA
        elif self.augmentation == 'CDA':
            # NOTE(review): CDA is unimplemented — train_loader_fn stays
            # unbound and the 'train' branch below would raise NameError.
            pass
        else:
            print("Invalid augmentation:", self.augmentation)

        if file_path == 'train':
            X, Y = train_loader_fn(self.scenario)
            X = X.tolist()
            Y = Y.tolist()
            for text, label in zip(X, Y):
                yield self.text_to_instance(
                    tokens=[Token(x) for x in self.tokenizer(text)],
                    ID='__',  # placeholder ID for every training row
                    label=label)

        elif file_path == 'test':
            X, Y, IDs = data_loader.load_test_data()
            X = X.tolist()
            Y = Y.tolist()
            for text, label, ID in zip(X, Y, IDs):
                yield self.text_to_instance(
                    tokens=[Token(x) for x in self.tokenizer(text)],
                    ID=ID,  # unique ID for every test row
                    label=label)

        else:
            # Bug fix: the original printed the undefined name `data`,
            # raising NameError instead of reporting the bad split value.
            print("Invalid split parameter:", file_path)
Beispiel #4
0
def q1():
    """Question 1: load the saved model and print a P300 target per person."""
    model.load_state_dict(torch.load(f'{data_path}/{model_name}'))
    model.eval()
    detector = P300(model=model)

    for subject in selected_persons:
        brain, event = load_test_data(sorted_path, subject)
        print(detector.get_target(brain, event))
def load_data(train_data_path='./aclImdb/train/', test_data_path='./aclImdb/test/'):
    """Load the IMDB splits and merge validation back into training.

    Returns (train_text, train_labels, test_text, test_labels).
    """
    print("Load Data...")
    train_text, train_labels, valid_text, valid_labels = load_train_data(
        train_data_path, 0.1)
    test_text, test_labels = load_test_data(test_data_path)

    # Fold the validation split back into the training set.
    train_text = np.append(train_text, valid_text)
    train_labels = np.append(train_labels, valid_labels)
    print("Done loading data!\n")

    return train_text, train_labels, test_text, test_labels
Beispiel #6
0
def test(fake_user):
    """Print NDCG/P/MAP at cutoffs 10 and 20 for a generated user embedding."""
    user_emb, ui_matrix = data_loader.load_user_info()
    test_item, test_attribute = data_loader.load_test_data()
    # similar_uid = get_similar_user(fake_user, user_emb, 20)
    metric_args = (test_item, fake_user, user_emb, ui_matrix)
    ndcg_10 = ndcg_at_k(*metric_args, 10)
    ndcg_20 = ndcg_at_k(*metric_args, 20)
    p_10 = p_at_k(*metric_args, 10)
    p_20 = p_at_k(*metric_args, 20)
    map_10 = map_at_k(*metric_args, 10)
    map_20 = map_at_k(*metric_args, 20)
    print(
        'test:ndcg@10:{:.3f}, ndcg@20:{:.3f}, p@10:{:.3f}, p@20:{:.3f}, map@10:{:.3f}, map@20:{:.3f}'
        .format(ndcg_10, ndcg_20, p_10, p_20, map_10, map_20))
Beispiel #7
0
def test(model, tar = True):
    """Return classification accuracy (%) on the test loader for `args.person`."""
    alpha = 0  # no domain-adaptation scaling at evaluation time
    dataloader = data_loader.load_test_data(tar=tar, person=args.person)
    model.eval()
    correct = 0
    with torch.no_grad():
        for batch, labels in dataloader:
            batch, labels = batch.to(DEVICE), labels.to(DEVICE)
            labels = labels.squeeze()
            class_output, _ = model(input_data=batch, alpha=alpha)
            _, predictions = torch.max(class_output.data, 1)
            correct += (predictions == labels.long()).sum().item()
    return float(correct) / len(dataloader.dataset) * 100
Beispiel #8
0
def test(model, dataset_name, epoch):
    """Return classification accuracy (%) on dataset_name's test split.

    ``epoch`` is accepted for caller compatibility but unused here.
    """
    alpha = 0  # no domain-adaptation scaling at evaluation time
    dataloader = data_loader.load_test_data(dataset_name)
    model.eval()
    correct = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs, _ = model(input_data=images, alpha=alpha)
            _, predictions = torch.max(outputs.data, 1)
            correct += (predictions == labels.long()).sum().item()

    return float(correct) / len(dataloader.dataset) * 100
def train_and_predict():
    """Train the U-Net, predict masks for the test set, and save results.

    Side effects: writes 'weights.h5', 'imgs_mask_test.npy', and one PNG
    per predicted mask under './preds'.
    """
    DataCreator()
    print('-' * 30)
    print('Loading and preprocessing train data...')
    print('-' * 30)
    imgs_train, imgs_mask_train = DataLoader()

    imgs_train = preprocess(imgs_train)
    imgs_mask_train = preprocess(imgs_mask_train)

    imgs_train = imgs_train.astype('float32')
    mean = np.mean(imgs_train)  # mean for data centering
    std = np.std(imgs_train)  # std for data normalization

    # NOTE(review): train-set centering/scaling is disabled here, yet the
    # test set below is still divided by `std` — confirm this asymmetry
    # is intentional.
    #imgs_train -= mean
    #imgs_train /= std

    imgs_mask_train = imgs_mask_train.astype('float32')
    imgs_mask_train /= 255.  # scale masks to [0, 1]

    print('-' * 30)
    print('Creating and compiling model...')
    print('-' * 30)
    model = get_unet()
    model_checkpoint = ModelCheckpoint('weights.h5',
                                       monitor='val_loss',
                                       save_best_only=True)

    print('-' * 30)
    print('Fitting model...')
    print('-' * 30)
    model.fit(imgs_train,
              imgs_mask_train,
              batch_size=32,
              nb_epoch=numberOfEpochs,
              verbose=1,
              shuffle=True,
              validation_split=0.2,
              callbacks=[model_checkpoint])

    #TESTS DATA --------------
    print('-' * 30)
    print('Loading and preprocessing test data...')
    print('-' * 30)
    # imgs_id_test is currently unused (see the commented zip loop below).
    imgs_test, imgs_id_test = load_test_data()
    imgs_test = preprocess(imgs_test)

    imgs_test = imgs_test.astype('float32')
    #    imgs_test -= mean
    imgs_test /= std

    print('-' * 30)
    print('Loading saved weights...')
    print('-' * 30)
    model.load_weights('weights.h5')

    print('-' * 30)
    print('Predicting masks on test data...')
    print('-' * 30)
    imgs_mask_test = model.predict(imgs_test, verbose=1)
    np.save('imgs_mask_test.npy', imgs_mask_test)

    print('-' * 30)
    print('Saving predicted masks to files...')
    print('-' * 30)
    pred_dir = 'preds'
    if not os.path.exists(pred_dir):
        os.mkdir(pred_dir)
    #for image, image_id in zip(imgs_mask_test, imgs_id_test):
    # Idiom fix: enumerate instead of a manually incremented counter.
    for i, image in enumerate(imgs_mask_test):
        image = (image[:, :, 0] * 255.).astype(np.uint8)
        imsave(os.path.join(pred_dir, str(i) + '_pred.png'), image)
Beispiel #10
0
def main():
    """Grid-search a TF-IDF + (SVM|MLP) pipeline and save predictions.

    Expects exactly one CLI argument: 'svm' or 'nn'. Side effects: writes
    a submission CSV plus pickled CV results and the best estimator.
    """
    if len(sys.argv) != 2 or sys.argv[1] not in ['svm', 'nn']:
        print("Invalid command. Expected 'svm' or 'nn'.")
        return

    c_name = sys.argv[1]
    print('Running job: TF-IDF vectorization and ' + c_name.upper() +
          ' classifier.')

    # Deterministically shuffle the training data.
    train_data = data_loader.load_train_data().sample(
        frac=1, random_state=42).reset_index(drop=True)
    test_data = data_loader.load_test_data()

    if c_name == 'svm':
        classifier = LinearSVC(random_state=42)
        param_grid = {
            'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
            'classifier__C': [0.1, 1]
        }
    else:
        classifier = MLPClassifier((50, ),
                                   solver='lbfgs',
                                   learning_rate_init=1e-4,
                                   tol=1e-6,
                                   max_iter=200,
                                   random_state=42)
        param_grid = {'vectorizer__ngram_range': [(1, 1)]}

    pipe = Pipeline([('vectorizer', TfidfVectorizer()),
                     ('classifier', classifier)])

    cv_grid = GridSearchCV(pipe,
                           n_jobs=2,
                           cv=5,
                           verbose=3,
                           param_grid=param_grid)

    start_time = time.time()
    cv_grid.fit(train_data.text, train_data.sentiment)
    end_time = time.time()
    print('Total fit time: {}'.format(end_time - start_time))

    # Classification report (computed on the training data itself)
    pred = cv_grid.predict(train_data.text)
    cr = classification_report(train_data.sentiment, pred)
    print(cr)

    # Test predictions
    pred = cv_grid.predict(test_data.text)
    print('Predictions finished.')

    # Save predictions
    results = pd.DataFrame({'Id': test_data.index, 'Prediction': pred})
    results = results.set_index('Id')
    data_loader.save_submission(results,
                                'tfidf_' + c_name.upper() + '_submission.csv')
    print('Predictions saved.')

    # Save classification results
    cvr_path = path.join(
        'pickles', 'tfidf_' + c_name.upper() +
        '_cross_validation_results')  # Cross validation results
    be_path = path.join('pickles', 'tfidf_' + c_name.upper() +
                        '_best_estimator')  # Best estimator

    # Resource fix: the original passed bare open(...) handles to dump()
    # and never closed them; use context managers instead.
    with open(cvr_path, 'wb') as cvr_file:
        dump(cv_grid.cv_results_, cvr_file)
    with open(be_path, 'wb') as be_file:
        dump(cv_grid.best_estimator_, be_file)
    print('Classification results saved.')
    # NOTE(review): fragment of a larger setup routine — `args`,
    # `data_loader`, `labels_str_to_int`, `run_bert` and `run_xlnet`
    # are defined outside this view.
    if args.eda:  # with EDA
        X_train, Y_train = data_loader.load_train_data_with_EDA(
            scenario=args.scenario)

    elif args.cda:  # with CDA
        pass

    else:  # without any Data Augmentation
        X_train, Y_train = data_loader.load_train_data(args.scenario)

    X_train = X_train.tolist()
    Y_train = Y_train.tolist()  # convert to list
    labels_train = labels_str_to_int(Y_train)  # convert labels to integers

    # Test data:
    X_test, Y_test, test_IDs = data_loader.load_test_data()
    X_test = X_test.tolist()
    Y_test = Y_test.tolist()
    test_IDs = test_IDs.tolist()  # convert to list
    labels_test = labels_str_to_int(Y_test)  # convert labels to integers
    testIDs_idx = np.linspace(
        0, len(test_IDs), len(test_IDs), False
    )  # can't create a tensor of strings, so create a corresponding list of indexes; we use that to index into test_IDs
    print("testIDs indexes:", len(testIDs_idx))

    # Dispatch to the selected model runner.
    if args.model == 'bert':
        run_bert()

    elif args.model == 'xlnet':
        run_xlnet()
Beispiel #12
0
def load_test_data_from_file(test_file_path):
    """Load the test set, treating ' ?' cells as missing, and drop those rows."""
    frame = load_test_data(test_file_path)
    frame = frame.replace(' ?', np.nan).dropna()
    return frame
Beispiel #13
0
        # NOTE(review): fragment — the opening `if`/`elif` branches and the
        # enclosing function/loop are outside this view.
        elif args.eda and args.cda:  # both EDA and CDA
            train_passages, Y_train = data_loader.load_train_data_with_EDA_and_CDA(
                scenario=args.scenario)

        else:  # without any Data Augmentation
            train_passages, Y_train = data_loader.load_train_data(
                scenario=args.scenario,
                N_WORDS=DOCUMENT_LENGTH,
                exp=experiment)

        print("\nTrain Set ---- X: {} | Y: {} | Distribution: {}".format(
            len(train_passages), len(Y_train), Counter(Y_train)))
        print("Y train preview:", Y_train[:3])

        # Load test data (same for each scenario, with or without augmentation):
        test_passages, Y_test, test_IDs = data_loader.load_test_data(
            N_WORDS=DOCUMENT_LENGTH)
        print(
            "Test Set ---- X: {} | Y: {} | Distribution: {} | Test IDs: {}, preview: {}"
            .format(len(test_passages), len(Y_test), Counter(Y_test),
                    len(test_IDs), test_IDs[:3]))
        print("Y test preview:", Y_test[:3])

        # Sanity check: hard-coded expected split sizes per scenario.
        if args.scenario == 'A':
            assert len(train_passages) == len(Y_train) == 401
        else:
            assert len(train_passages) == len(Y_train) == 400

        assert len(test_passages) == len(Y_test) == 198

        prediction_probs = predict(algo)
Beispiel #14
0
# ------------------------------------------------------------------------------
# top settings
# ------------------------------------------------------------------------------
n_training_data = 10000
n_test_data = 1000

n_epoch = 10
mini_batch_size = 100
learn_rate = 0.005

# ------------------------------------------------------------------------------
# step 1: generate data 
# ------------------------------------------------------------------------------
training_data           = data_loader.load_training_data(n_training_data)
test_dataset,test_label = data_loader.load_test_data    (n_test_data    )

# ------------------------------------------------------------------------------
# step 2: setup the model
# ------------------------------------------------------------------------------
# TF1 placeholders: 784-dim flattened input, 10-way label.
label_x = tf.placeholder(tf.float32, [None, 784])
label_y = tf.placeholder(tf.float32, [None,  10])

# Single softmax output layer: [None, 784] -> [None, 10].
# The string below is the older hand-rolled version, kept for reference.
y_pred=add_layer('output_layer', label_x, 784, 10, tf.nn.softmax)
'''
W = tf.Variable(tf.random_normal([784, 10])); # [1, 784] x [784, 10] = [1, 10]
b = tf.Variable(tf.zeros([10]))
z_pred = tf.matmul(label_x, W) + b
y_pred = tf.nn.softmax(z_pred)
'''
# ------------------------------------------------------------------------------
# Module-level script: load train data, build the U-Net, then predict on test.
imgs_train, imgs_mask_train = DataLoader()
imgs_train = preprocess(imgs_train)
imgs_mask_train = preprocess(imgs_mask_train)
#imgs_train = imgs_train.astype('float32')
mean = np.mean(imgs_train)  # mean for data centering
std = np.std(imgs_train)  # std for data normalization

model = get_unet()
model_checkpoint = ModelCheckpoint('weights.h5',
                                   monitor='loss',
                                   save_best_only=True)

print('-' * 30)
print('Loading and preprocessing test data...')
print('-' * 30)
imgs_test, imgs_id_test = load_test_data()
imgs_test = preprocess(imgs_test)

# Normalize the test images with the training-set statistics.
imgs_test = imgs_test.astype('float32')
imgs_test -= mean
imgs_test /= std

print('-' * 30)
print('Loading saved weights...')
print('-' * 30)
model.load_weights('weights.h5')

print('-' * 30)
print('Predicting masks on test data...')
print('-' * 30)
imgs_mask_test = model.predict(imgs_test, verbose=1)
Beispiel #16
0
def main():
    """Entry point: train the encoder-decoder, or run reconstruction /
    representation learning from saved weights.

    Behavior is selected by module-level flags (TRAIN_MODE, RESUME,
    TEST_MODE, LEARN_MODE, STORE_RESULTS) and path constants.
    """

    model = enc_dec(params_dict, DEVICE)
    model = model.to(DEVICE)

    ###################### Debugging #############################
    # optimizer = optim.Adam(model.parameters(), lr=LR)
    # optimizer.zero_grad()

    # data = torch.randn(BATCH_SIZE, TIME_STEPS, INPUT_DIM).to(DEVICE)
    # wt_batch = torch.randn(BATCH_SIZE, TIME_STEPS, INPUT_DIM).to(DEVICE)

    # enc_reps, dec_reps = model(data, False)
    # print("Encoded: ")
    # for rep in enc_reps:
    # 	print(rep.shape, torch.max(rep).item(), torch.min(rep).item())
    # print("Decoded: ")
    # for i in range(len(dec_reps)):
    # 	print(i)
    # 	for rep in dec_reps[i]:
    # 		print(rep.shape, torch.max(rep).item(), torch.min(rep).item())

    # loss_dict = all_losses(enc_reps, dec_reps[-1], True, True)
    # for key in loss_dict:
    # 	print(key, loss_dict[key].item())
    # loss = loss_dict[FINAL_CRTR]

    # loss.backward()
    # optimizer.step()
    # return

    # assert(False)
    ###################### Debugging #############################

    # Idiom fix throughout: drop redundant parentheses around conditions
    # and compare to None with `is`/`is not`.
    if TRAIN_MODE:
        start = time.time()
        ################## Prepare Data ######################
        print('Prepare data...')

        test_set = load_test_data(INPUT_DIM, TIME_STEPS)

        train_loader = get_data_loader('train_length.csv', INPUT_DIM,
                                       TIME_STEPS, BATCH_SIZE, True)
        valid_loader = get_data_loader('valid_length.csv', INPUT_DIM,
                                       TIME_STEPS, BATCH_SIZE, False)

        time_elapsed = time.time() - start
        print("Getting data takes %fs." % time_elapsed)
        print("#####################################")
        print("Training...")
        if RESUME and os.path.exists(LAST_WEIGTHS_PATH):

            states = torch.load(LAST_WEIGTHS_PATH)
            start_epoch = states['epoch']
            start_level = states['level']

            model.load_state_dict(states['state_dict'])
            optimizer = init_optimizer(model, start_level)
            optimizer.load_state_dict(states['optimizer'])

            print('Resume from level', start_level, end=' ')
            print('epoch %d' % start_epoch)
        else:
            print("Start fresh run.")
            start_level = None
            start_epoch = 0
            optimizer = init_optimizer(model, start_level)
            if INIT_WEIGHTS is not None:
                print('Initial weights: ', INIT_WEIGHTS)
                states = torch.load(INIT_WEIGHTS)
                model.load_state_dict(states['state_dict'])
                optimizer.load_state_dict(states['optimizer'])

        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
        min_criterion_vals, best_epoch, best_epochs = train(
            model, optimizer, scheduler, train_loader, valid_loader, test_set,
            start_level, start_epoch)

        # Save results in a csv file
        if STORE_RESULTS:
            write_results(CSVFILEPATH, min_criterion_vals, best_epoch,
                          best_epochs)

    else:
        if WEIGHTS_PATH is not None and os.path.exists(WEIGHTS_PATH):
            print(
                'Reconstruct/Represenation Learning by using model weights at:'
            )
            print(WEIGHTS_PATH)
            states = torch.load(WEIGHTS_PATH)
            model.load_state_dict(states['state_dict'])
        else:
            print('Caution: Weights path does not exist', WEIGHTS_PATH)
            return

        if TEST_MODE:
            print("Reconstructing...")
            test_set = load_test_data(INPUT_DIM, TIME_STEPS)
            test(model, test_set, 'pred.wav')
        if LEARN_MODE:
            repr_store_path = CONFIG_DIR + 'repr/'
            rep_learning(model, repr_store_path)
            for fold in [train_fold, valid_fold, test_fold]:
                store_rep_for_mfn(fold)
Beispiel #17
0

def _test_svm():
    """Smoke test: fit a toy binary SVC and print its support vectors."""
    features = [[0, 0], [1, 1]]
    labels = [0, 1]
    classifier = svm.SVC(gamma="scale").fit(features, labels)
    print(classifier.support_vectors_)

def _test_one_against_one():
    """Smoke test: fit a 4-class SVC (ovr decision shape) and print results."""
    features = [[0, 0], [1, 1], [2, 2], [3, 3]]
    labels = [1, 2, 3, 4]
    classifier = svm.SVC(gamma="scale", decision_function_shape="ovr")
    classifier = classifier.fit(features, labels)
    print(classifier.support_vectors_)
    print(classifier.predict([[-0.1, -0.1], [1.1, 1.1], [2.1, 2.1], [4, 4]]))


def training():
    """Fit a one-vs-one SVC on the loaded training data and return the model."""
    dataset = data_loader.load_training_data()
    classifier = svm.SVC(gamma="scale", decision_function_shape="ovo")
    return classifier.fit(dataset[0], dataset[1])


if __name__ == "__main__":
    # _test_svm()
    # _test_one_against_one()
    model = training()
    test_data = data_loader.load_test_data()
    predicts = model.predict(test_data[0])
    # Fraction of test samples whose prediction matches the true label.
    correct = sum(1 for truth, guess in zip(test_data[1], predicts)
                  if truth == guess)
    print(correct / len(test_data[1]))
Beispiel #18
0
# Dataset locations: three training sources plus the stage2 test directory.
TRAIN_DIRS = [os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'data', 'stage1_train'),
             os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'extra_data'),
             os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'stage1_test',\
                          'DSB2018_stage1_test-master', 'stage1_test')]
TEST_DIR = os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'stage2_test')
IMG_DIR_NAME = 'images'
MASK_DIR_NAME = 'masks'

train_df = read_train_data_properties(TRAIN_DIRS, IMG_DIR_NAME, MASK_DIR_NAME)
test_df = read_test_data_properties(TEST_DIR, IMG_DIR_NAME)

# Stack mask, contour and no-contour arrays channel-wise (axis=2) into one
# target per training image.
x_train, y_train, contour_train, no_contour_train = load_train_data(train_df)
y_train_full = np.array([
    np.concatenate((x, y, z), axis=2)
    for x, y, z in zip(y_train, contour_train, no_contour_train)
])
labels_train = get_train_labels(train_df)

x_test = load_test_data(test_df)

# Train, run inference, RLE-encode predictions, and write the submission CSV.
model_paths = train(train_df, y_train_full, labels_train)
y_prediction = inference(x_test, model_paths)
y_test_rle, y_test_ids = get_rle_encoding(test_df, y_prediction)

sub = pd.DataFrame()
sub['ImageId'] = y_test_ids
sub['EncodedPixels'] = pd.Series(y_test_rle).apply(
    lambda x: ' '.join(str(y) for y in x))
sub.to_csv('sub-dsbowl2018.csv', index=False)
sub.head()
Beispiel #19
0
def test_from_beams(data_testset,
                    beams_dir,
                    predict_only=True,
                    sample_best=False):
    """Rerank pre-generated beam files and produce final predictions.

    Reads beam TSVs from *beams_dir*, reranks them by slot alignment,
    post-processes the top utterances, and writes prediction files.
    When ``predict_only`` is False, also writes a reduced prediction file
    and runs the external BLEU / metrics scripts via ``os.system``.
    """
    test_source_file = os.path.join(config.DATA_DIR, 'test_source_dict.json')
    test_target_file = os.path.join(config.DATA_DIR, 'test_target.txt')
    predictions_final_file = os.path.join(config.PREDICTIONS_DIR,
                                          'predictions_final.txt')
    predictions_reduced_file = os.path.join(config.METRICS_DIR,
                                            'predictions_reduced.txt')
    test_reference_file = os.path.join(config.METRICS_DIR,
                                       'test_references.txt')

    print('Loading test data...', end=' ')
    sys.stdout.flush()

    # Load and preprocess the test data
    data_loader.load_test_data(data_testset)

    print('DONE')
    print('Extracting beams...')
    sys.stdout.flush()

    # Read all beam files in the given beams folder
    beam_files = glob.glob(os.path.join(beams_dir, '*.txt'))

    print('-> Beam files found:')
    print('\n'.join(beam_files))

    # Combine all beam files into a single DataFrame
    df_beams = pd.concat(
        (pd.read_csv(f, sep='\t', header=None, encoding='utf8')
         for f in beam_files),
        axis=1,
        ignore_index=True)
    assert len(df_beams.columns) > 1

    # Combine beams and their corresponding scores into tuples
    # (columns alternate: beam text, beam score).
    beams = []
    for i in range(0, len(df_beams.columns), 2):
        beams.append(list(zip(df_beams.iloc[:, i], df_beams.iloc[:, i + 1])))

    # Transpose the list of beams so as to have all beams of a single sample per line
    beams = list(map(list, zip(*beams)))

    print('DONE')
    print('Reranking...')
    sys.stdout.flush()

    # Score the slot alignment in the beams, and rerank the beams accordingly
    if sample_best:
        beams = postprocessing.rerank_beams(beams,
                                            keep_n=10,
                                            keep_least_errors_only=True)
    else:
        beams = postprocessing.rerank_beams(beams, keep_n=10)

    print('DONE')
    print('Evaluating...')
    sys.stdout.flush()

    with io.open(test_source_file, 'r', encoding='utf8') as f_test_source, \
            io.open(predictions_final_file, 'w', encoding='utf8') as f_predictions_final:

        mrs = json.load(f_test_source, object_pairs_hook=OrderedDict)

        # Either sample randomly among the kept beams, or take the top one.
        if sample_best:
            predictions = [
                random.choice(prediction_beams)[0]
                for prediction_beams in beams
            ]
        else:
            predictions = [
                prediction_beams[0][0] for prediction_beams in beams
            ]

        # Post-process the generated utterances
        predictions_final = postprocessing.finalize_utterances(
            predictions, mrs)

        for prediction in predictions_final:
            f_predictions_final.write(prediction + '\n')

        if not predict_only:
            # Create a file with a single prediction for each group of the same MRs
            if 'rest_e2e' in data_testset:
                test_mrs, _ = data_loader.read_rest_e2e_dataset_test(
                    data_testset)
            elif 'tv' in data_testset:
                test_mrs, _, _ = data_loader.read_tv_dataset_test(data_testset)
            elif 'laptop' in data_testset:
                test_mrs, _, _ = data_loader.read_laptop_dataset_test(
                    data_testset)
            elif 'hotel' in data_testset:
                test_mrs, _, _ = data_loader.read_hotel_dataset_test(
                    data_testset)
            elif 'video_game' in data_testset:
                test_mrs, _ = data_loader.read_video_game_dataset_test(
                    data_testset)
            else:
                raise FileNotFoundError

            with io.open(predictions_reduced_file, 'w',
                         encoding='utf8') as f_predictions_reduced:
                # Keep only the first prediction of each run of identical MRs.
                for i in range(len(test_mrs)):
                    if i == 0 or test_mrs[i] != test_mrs[i - 1]:
                        f_predictions_reduced.write(predictions_final[i] +
                                                    '\n')

    if not predict_only:
        # Depending on the OS, the tensor2tensor BLEU script might require a different way of executing
        if sys.executable is not None:
            bleu_script = 'python ' + os.path.join(
                os.path.dirname(sys.executable), 't2t-bleu')
        else:
            bleu_script = 't2t-bleu'

        metrics_script = 'python ' + os.path.join(config.METRICS_DIR,
                                                  'measure_scores.py')

        # Run the tensor2tensor internal BLEU script
        os.system(bleu_script + ' --translation=' + predictions_final_file +
                  ' --reference=' + test_target_file)

        # Run the metrics script provided by the E2E NLG Challenge
        os.system(metrics_script + ' ' + test_reference_file + ' ' +
                  predictions_reduced_file)

    print('DONE')
Beispiel #20
0
import numpy as np
from nltk.corpus import stopwords
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.stem.wordnet import WordNetLemmatizer
import data_loader
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# %matplotlib inline
plt.set_cmap('RdYlBu')
import pre_processing

# Load the train/validation and test splits (test needs a separate labels file).
train, valid = data_loader.load_train_data('data/train.csv')
test = data_loader.load_test_data('data/test.csv', 'data/test_labels.csv')

# Multi-label target columns.
list_classes = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]

train_y = train[list_classes].values
valid_y = valid[list_classes].values
test_y = test[list_classes].values

# Replace missing text cells with empty strings.
train = train.fillna('')
valid = valid.fillna('')
test = test.fillna('')
"""## Data Exploration"""

print(train.shape)
Beispiel #21
0
def train(g, d, train_loader, neg_loader, epoches, g_optim, d_optim, neg_lens):
    """Adversarially train generator *g* against discriminator *d*.

    Each epoch first updates the discriminator on real / generated /
    negative (attribute, user-embedding) pairs, then updates the
    generator, and finally evaluates on the test set via ``eval.test``.
    """
    g = g.to(device)
    d = d.to(device)
    time.sleep(0.1)  # let tqdm/stdout output settle
    print("start training on {}".format(device))
    time.sleep(0.1)
    bce_loss = torch.nn.BCELoss()
    for e in tqdm(range(epoches)):
        start_time = time.time()
        idx = 0
        d_loss = 0.0
        # Idiom fix: iter() instead of calling __iter__() directly.
        neg_iter = iter(neg_loader)
        # Discriminator update pass.
        for _, _, real_attr, real_user_emb in train_loader:
            if idx > neg_lens:
                break
            # Bug fix: iterator.next() is Python 2 syntax and raises
            # AttributeError on Python 3 iterators; use next().
            _, _, neg_attr, neg_user_emb = next(neg_iter)
            # Positive example: attributes and user embedding.
            real_attr = real_attr.to(device)
            real_user_emb = real_user_emb.to(device)
            # Negative example: attributes and user embedding.
            neg_attr = neg_attr.to(device)
            neg_user_emb = neg_user_emb.to(device)
            # Generator produces a fake user embedding from real attributes.
            fake_user_emb = g(real_attr)
            fake_user_emb = fake_user_emb.to(device)
            # Discriminator scores for real / fake / negative pairs.
            d_real, d_logit_real = d(real_attr, real_user_emb)
            d_fake, d_logit_fake = d(real_attr, fake_user_emb)
            d_neg, d_logit_neg = d(neg_attr, neg_user_emb)
            # Discriminator loss: real -> 1, fake and negative -> 0.
            d_optim.zero_grad()
            d_loss_real = bce_loss(d_real, torch.ones_like(d_real))
            d_loss_fake = bce_loss(d_fake, torch.zeros_like(d_fake))
            d_loss_neg = bce_loss(d_neg, torch.zeros_like(d_neg))
            d_loss = torch.mean(d_loss_real + d_loss_fake + d_loss_neg)
            d_loss.backward()
            d_optim.step()
            idx += batch_size
        # Generator update pass.
        g_loss = 0.0
        for uid, mid, attr, user_emb in train_loader:
            g_optim.zero_grad()
            attr = attr.to(device)
            # Generate a fake user embedding.
            fake_user_emb = g(attr)
            fake_user_emb = fake_user_emb.to(device)
            # Generator loss: make the discriminator output 1 for fakes.
            d_fake, d_logit_fake = d(attr, fake_user_emb)
            g_loss = bce_loss(d_fake, torch.ones_like(d_fake))
            g_loss.backward()
            g_optim.step()
        end_time = time.time()
        print("\nepoch:{}: time:{:.2f}, d_loss:{:.3f}, g_loss:{:.3f}".format(
            e + 1, end_time - start_time, d_loss, g_loss))
        # Evaluate on the test set after every epoch.
        test_item, test_attribute = data_loader.load_test_data()
        test_item = torch.tensor(test_item).to(device)
        test_attribute = torch.tensor(test_attribute,
                                      dtype=torch.long).to(device)
        fake_user = g(test_attribute)
        eval.test(fake_user.cpu().detach().numpy())
        time.sleep(0.1)
Beispiel #22
0
            # NOTE(review): fragment — the enclosing function and loop
            # (apparently iterating edit opcodes with spans i1:i2 / j1:j2)
            # are outside this view.
            leven_cost += max(i2 - i1, j2 - j1)
        elif tag == 'insert':
            leven_cost += (j2 - j1)
        elif tag == 'delete':
            leven_cost += (i2 - i1)
    return leven_cost


def defaultdict_from_dict(dic):
    """Return a ``defaultdict(int)`` pre-populated with *dic*'s items."""
    result = defaultdict(int)
    result.update(dic)
    return result


#  Load the test data (pinyin and hanzi sequences, per the variable names).
test_pny_list, test_han_list = load_test_data()

# 1. Acoustic model -----------------------------------


# 2. Language model -------------------------------------------

with open('vocab/pny_vocab.json', "r", encoding='utf-8') as f:
    pny_dict_w2id = json.load(f)
    pny_dict_w2id = defaultdict_from_dict(pny_dict_w2id)
# Inverse vocabulary: id -> pinyin token.
pny_dict_id2w = {v: k for k, v in pny_dict_w2id.items()}

with open('vocab/han_vocab.json', "r", encoding='utf-8') as f:
    han_dict_w2id = json.load(f)
    han_dict_w2id = defaultdict_from_dict(han_dict_w2id)
# Inverse vocabulary: id -> Chinese character.
han_dict_id2w = {v: k for k, v in han_dict_w2id.items()}
Beispiel #23
0
def main(nb_epoch=1,
         data_augmentation=True,
         noise=True,
         maxout=True,
         dropout=True,
         l1_reg=False,
         l2_reg=True):
    """Train a small CNN on CIFAR-10 and report/plot test performance.

    Args:
        nb_epoch: Number of training epochs.
        data_augmentation: If True, train with real-time augmentation
            (ZCA whitening, shifts, horizontal flips) via ImageDataGenerator.
        noise: If True, prepend a GaussianNoise input layer
            (stddev taken from the module-level ``sigma``).
        maxout: If True, use MaxoutDense(512) instead of a plain Dense(512).
        dropout: If True, insert Dropout layers after each conv stack and
            after the dense layer.
        l1_reg / l2_reg: Enable L1/L2 weight regularization on the Dense
            layer (only used when ``maxout`` is False; mutually exclusive).

    Relies on module-level globals: nb_classes, data_path, data_url, sigma,
    img_channels, img_rows, img_cols, batch_size, l1_weight, l2_weight.
    Side effects: downloads/extracts the dataset, writes the test score to a
    text file, and saves/shows a training-history plot.
    """
    # l1 and l2 regularization must not both be enabled
    if l1_reg and l2_reg:
        print("No need to run l1 and l2 regularization in the same time")
        quit()
    # print settings for this experiment
    print("number of epoch: {0}".format(nb_epoch))
    print("data augmentation: {0}".format(data_augmentation))
    print("noise: {0}".format(noise))
    print("maxout: {0}".format(maxout))
    print("dropout: {0}".format(dropout))
    print("l1: {0}".format(l1_reg))
    print("l2: {0}".format(l2_reg))

    # Load the dataset via data_loader.py.
    # NOTE(fix): the previous version first loaded and normalized the data
    # with keras' cifar10.load_data() and np_utils.to_categorical, then
    # immediately overwrote every one of those arrays with the data_loader
    # results below; that dead code (and its redundant dataset download)
    # has been removed.
    data_loader.download_and_extract(data_path, data_url)
    class_names = data_loader.load_class_names()
    print(class_names)
    # labels_* are used directly as training targets below — presumably
    # already one-hot encoded; cls_* (integer labels) are unused here.
    # TODO confirm against data_loader.
    images_train, cls_train, labels_train = data_loader.load_training_data()
    images_test, cls_test, labels_test = data_loader.load_test_data()
    X_train, Y_train = images_train, labels_train
    X_test, Y_test = images_test, labels_test
    # Hold out 20% of the training data for validation (fixed seed for
    # reproducibility across experiment runs).
    X_train, X_valid, Y_train, Y_valid = train_test_split(X_train,
                                                          Y_train,
                                                          test_size=0.2,
                                                          random_state=0)
    print("Size of:")
    print("- Training-set:\t\t{}".format(len(X_train)))
    print("- Validation-set:\t\t{}".format(len(X_valid)))
    print("- Test-set:\t\t{}".format(len(X_test)))

    # Build the network: two conv stacks, then a (maxout or dense) head.
    model = Sequential()
    if noise:
        model.add(
            GaussianNoise(sigma,
                          input_shape=(img_channels, img_rows, img_cols)))
    model.add(
        Convolution2D(32,
                      3,
                      3,
                      border_mode='same',
                      input_shape=(img_channels, img_rows, img_cols)))
    model.add(Activation('relu'))
    model.add(Convolution2D(32, 3, 3))
    model.add(Activation('relu'))
    #    model.add(MaxPooling2D(pool_size=(2, 2)))
    if dropout:
        model.add(Dropout(0.25))

    model.add(Convolution2D(64, 3, 3, border_mode='same'))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, 3, 3))
    model.add(Activation('relu'))
    #   model.add(MaxPooling2D(pool_size=(2, 2)))
    if dropout:
        model.add(Dropout(0.25))

    model.add(Flatten())
    if maxout:
        model.add(MaxoutDense(512, nb_feature=4, init='glorot_uniform'))
    else:
        if not (l1_reg or l2_reg):
            model.add(Dense(512))
        # activation regularization not implemented yet
        if l1_reg:
            model.add(Dense(512, W_regularizer=l1(l1_weight)))
        elif l2_reg:
            model.add(Dense(512, W_regularizer=l2(l2_weight)))

    model.add(Activation('relu'))
    if dropout:
        model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    # train the model using SGD + momentum
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])

    start_time = time.time()
    if not data_augmentation:
        his = model.fit(X_train,
                        Y_train,
                        batch_size=batch_size,
                        nb_epoch=nb_epoch,
                        validation_data=(X_valid, Y_valid),
                        shuffle=True)
    else:
        # this will do preprocessing and realtime data augmentation
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            featurewise_std_normalization=
            False,  # divide inputs by std of the dataset
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=True,  # apply ZCA whitening
            rotation_range=
            0,  # randomly rotate images in the range (degrees, 0 to 180)
            width_shift_range=
            0.1,  # randomly shift images horizontally (fraction of total width)
            height_shift_range=
            0.1,  # randomly shift images vertically (fraction of total height)
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # compute quantities required for featurewise normalization
        # (std, mean, and principal components if ZCA whitening is applied)
        datagen.fit(X_train)

        # fit the model on the batches generated by datagen.flow()
        his = model.fit_generator(datagen.flow(X_train,
                                               Y_train,
                                               batch_size=batch_size),
                                  samples_per_epoch=X_train.shape[0],
                                  nb_epoch=nb_epoch,
                                  validation_data=(X_valid, Y_valid))

    # evaluate the model on the held-out test set
    score = model.evaluate(X_test, Y_test, verbose=0)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
    print('training time', time.time() - start_time)

    # write test accuracy to a file named after the experiment settings
    output_file_name = './output_l1l2/train_val_loss_with_dropout_epochs_{0}_data_augmentation_{1}_noise_{2}_maxout_{3}_dropout_{4}_l1_{5}_l2_{6}_sigma_{7}_l1weight_{8}_l2weight_{9}.txt'.format(
        nb_epoch, data_augmentation, noise, maxout, dropout, l1_reg, l2_reg,
        sigma, l1_weight, l2_weight)
    print(output_file_name)
    with open(output_file_name, "w") as text_file:
        text_file.write('Test score: {}'.format(score[0]))
        text_file.write('\n')
        text_file.write('Test accuracy: {}'.format(score[1]))
    # NOTE(fix): removed the redundant text_file.close() that followed the
    # `with` block — the context manager already closes the file.

    # visualize training history (train/val loss per epoch)
    train_loss = his.history['loss']
    val_loss = his.history['val_loss']
    plt.plot(range(1,
                   len(train_loss) + 1),
             train_loss,
             color='blue',
             label='train loss')
    plt.plot(range(1,
                   len(val_loss) + 1),
             val_loss,
             color='red',
             label='val loss')
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.xlabel('#epoch')
    plt.ylabel('loss')
    output_fig_name = './output_no_maxout/train_val_loss_with_dropout_epochs_{0}_data_augmentation_{1}_noise_{2}_maxout_{3}_dropout_{4}_l1_{5}_l2_{6}_sigma_{7}_l1weight_{8}_l2weight_{9}.png'.format(
        nb_epoch, data_augmentation, noise, maxout, dropout, l1_reg, l2_reg,
        sigma, l1_weight, l2_weight)
    plt.savefig(output_fig_name, dpi=300)
    plt.show()
Beispiel #24
0
def test(data_testset, predict_only=True, reranking=True):
    """Run T2T inference on a test set, postprocess the output, and optionally evaluate.

    Pipeline: preprocess the test data, run the tensor2tensor inference
    shell script, read the beam-search predictions back in, optionally
    rerank beams by slot alignment, relexicalize the top beam of each
    sample, and — unless ``predict_only`` — write a reduced predictions
    file (one per group of identical MRs) and run the BLEU and E2E metrics
    scripts.

    Args:
        data_testset: Identifier/path of the test set; its name also selects
            the dataset-specific reader ('rest_e2e', 'tv', 'laptop',
            'hotel', 'video_game') when evaluation is enabled.
        predict_only: If True, stop after writing the final predictions and
            skip the metric scripts.
        reranking: If True and multiple beams are present, rerank them with
            postprocessing.rerank_beams before taking the top beam.

    Raises:
        FileNotFoundError: If ``data_testset`` matches no known dataset
            name (only when ``predict_only`` is False).

    Side effects: reads/writes files under the config.* directories and
    invokes external scripts via os.system.
    """
    test_source_file = os.path.join(config.DATA_DIR, 'test_source_dict.json')
    test_target_file = os.path.join(config.DATA_DIR, 'test_target.txt')
    predictions_file = os.path.join(config.PREDICTIONS_DIR, 'predictions.txt')
    predictions_final_file = os.path.join(config.PREDICTIONS_DIR,
                                          'predictions_final.txt')
    predictions_reduced_file = os.path.join(config.METRICS_DIR,
                                            'predictions_reduced.txt')
    test_reference_file = os.path.join(config.METRICS_DIR,
                                       'test_references.txt')

    print('Loading test data...', end=' ')
    sys.stdout.flush()

    # Load and preprocess the test data
    data_loader.load_test_data(data_testset)

    print('DONE')
    print('Predicting...')
    sys.stdout.flush()

    # TODO: set DECODE_FILE and PREDICTION_FILE environment variables from here instead of the shell script

    # Run inference for the test samples
    os.system('bash ' + os.path.join(config.T2T_DIR, 't2t_test_script.sh'))

    print('DONE')
    print('Extracting beams...')
    sys.stdout.flush()

    # Read in the beams and their log-probs as produced by the T2T beam search
    df_predictions = pd.read_csv(predictions_file,
                                 sep='\t',
                                 header=None,
                                 encoding='utf8')
    # More than one column means beam texts and their scores alternate
    beams_present = len(df_predictions.columns) > 1

    if beams_present:
        # Combine beams and their corresponding scores into tuples
        # (even columns hold beam text, odd columns the matching log-prob)
        beams = []
        for i in range(0, len(df_predictions.columns), 2):
            beams.append(
                list(
                    zip(df_predictions.iloc[:, i],
                        df_predictions.iloc[:, i + 1])))

        # Transpose the list of beams so as to have all beams of a single sample per line
        beams = list(map(list, zip(*beams)))
    else:
        # Single prediction per sample: wrap it as a one-element beam list
        beams = [[(beam, )] for beam in df_predictions.iloc[:, 0].tolist()]

    print('DONE')
    print('Reranking...')
    sys.stdout.flush()

    # Score the slot alignment in the beams, and rerank the beams accordingly
    if reranking and beams_present:
        beams = postprocessing.rerank_beams(beams)

    print('DONE')
    print('Evaluating...')
    sys.stdout.flush()

    with io.open(test_source_file, 'r', encoding='utf8') as f_test_source, \
            io.open(predictions_final_file, 'w', encoding='utf8') as f_predictions_final:

        # MRs are loaded in file order (OrderedDict) so they stay aligned
        # with the beams row-for-row
        mrs = json.load(f_test_source, object_pairs_hook=OrderedDict)
        # Keep only the top-ranked beam of each sample
        predictions = [prediction_beams[0][0] for prediction_beams in beams]
        predictions_final = postprocessing.finalize_utterances(
            predictions, mrs)

        for prediction in predictions_final:
            f_predictions_final.write(prediction + '\n')

        if not predict_only:
            # Create a file with a single prediction for each group of the same MRs
            if 'rest_e2e' in data_testset:
                test_mrs, _ = data_loader.read_rest_e2e_dataset_test(
                    data_testset)
            elif 'tv' in data_testset:
                test_mrs, _, _ = data_loader.read_tv_dataset_test(data_testset)
            elif 'laptop' in data_testset:
                test_mrs, _, _ = data_loader.read_laptop_dataset_test(
                    data_testset)
            elif 'hotel' in data_testset:
                test_mrs, _, _ = data_loader.read_hotel_dataset_test(
                    data_testset)
            elif 'video_game' in data_testset:
                test_mrs, _ = data_loader.read_video_game_dataset_test(
                    data_testset)
            else:
                # Unrecognized dataset name
                raise FileNotFoundError

            with io.open(predictions_reduced_file, 'w',
                         encoding='utf8') as f_predictions_reduced:
                # Write a prediction only for the first MR of each run of
                # consecutive duplicates (assumes identical MRs are adjacent)
                for i in range(len(test_mrs)):
                    if i == 0 or test_mrs[i] != test_mrs[i - 1]:
                        f_predictions_reduced.write(predictions_final[i] +
                                                    '\n')

    if not predict_only:
        # Depending on the OS, the tensor2tensor BLEU script might require a different way of executing
        if sys.executable is not None:
            bleu_script = 'python ' + os.path.join(
                os.path.dirname(sys.executable), 't2t-bleu')
        else:
            bleu_script = 't2t-bleu'

        metrics_script = 'python ' + os.path.join(config.METRICS_DIR,
                                                  'measure_scores.py')

        # Run the tensor2tensor internal BLEU script
        os.system(bleu_script + ' --translation=' + predictions_final_file +
                  ' --reference=' + test_target_file)

        # Run the metrics script provided by the E2E NLG Challenge
        os.system(metrics_script + ' ' + test_reference_file + ' ' +
                  predictions_reduced_file)

    print('DONE')
Beispiel #25
0
def main(options):
    """Train a VGG-encoder classifier with early stopping on validation loss.

    Loads training/test images from the directories in ``options``, splits
    the training data into train/validation, builds the model, and trains
    until the validation loss has not improved for ``max_patience``
    consecutive epochs. Each time a new best validation loss is reached it
    prints confusion matrices, stores misclassified examples, and pickles
    the extracted validation (and test) features.

    Args:
        options: Parsed options object with attributes train_data_dir,
            test_data_dir, tb_log_dir and features_dir.
    """
    train_data_dir = options.train_data_dir
    train_files = os.listdir(train_data_dir)

    test_data_dir = options.test_data_dir
    test_files = os.listdir(test_data_dir)

    data = data_loader.load_training_data(train_data_dir, train_files)
    images, labels, label_dict, imsize = data

    # Test labels are mapped with the training label_dict so class indices agree
    test_data = data_loader.load_test_data(test_data_dir, test_files, label_dict)
    test_images, test_labels = test_data

    num_classes = len(label_dict)
    test_info = test_images, test_labels
    # The test set may be empty; skip test evaluation in that case
    has_test_data = False
    if len(test_images) > 0:
        has_test_data = True

    # Per-class example counts (labels are one-hot, columns indexed by label_dict)
    for key in label_dict:
        ex_count = np.sum(labels[:, label_dict[key]] == 1.)
        print("Number of examples of " + str(key) + ": " + str(ex_count))

    print("Feature Mapping: ", label_dict)
    print("Image Size: " + str(imsize))

    train_info, val_info = data_loader.merge_and_split_data(images, labels)
    train_data, train_labels = train_info
    val_data, val_labels = val_info

    print("Merged and Split Training Data")
    print("Training Data Size: ", len(train_data))
    print("Validation Data Size: ", len(val_data))

    if has_test_data:
        for key in label_dict:
            ex_count = np.sum(test_labels[:, label_dict[key]] == 1.)
            print("Number of test examples of " + str(key) + ": " + str(ex_count))
        print("Test Data Size: ", len(test_images))

    # Queue that feeds training batches to the model runner
    data_queue = data_queues.QueueManager(train_data, train_labels)

    placeholders = utils.create_placeholders(imsize, num_classes)
    indata, answer, is_training, keep_prob, learning_rate = placeholders

    runnables = vgg_encoder_model.setup_model(indata,
                                              answer,
                                              imsize,
                                              is_training,
                                              keep_prob,
                                              num_classes,
                                              learning_rate)

    train_step, loss, predictions, accuracy, summaries, fc_2 = runnables
    train_summary, val_summary, test_summary = summaries
    init = tf.global_variables_initializer()
    print("Setup Model")

    log_dir = options.tb_log_dir
    features_dir = options.features_dir

    with tf.device('/gpu:0'):
        with tf.Session() as sess:

            # Separate TensorBoard writers per split
            train_writer = tf.summary.FileWriter(log_dir + "/train", sess.graph)
            val_writer = tf.summary.FileWriter(log_dir + "/val", sess.graph)
            test_writer = tf.summary.FileWriter(log_dir + "/test", sess.graph)
            sess.run(init)
            patience = 0
            max_patience = 1000  # stop after this many consecutive epochs without val-loss improvement
            min_val_loss = float("inf")
            epoch = 0
            while patience < max_patience:
                epoch += 1
                print("EPOCH: ", str(epoch))
                start = time.time()
                train_statistics = model_runner.process_train_data(data_queue,
                                                                   placeholders,
                                                                   runnables,
                                                                   train_writer,
                                                                   num_classes,
                                                                   sess)

                avg_train_loss, train_acc = train_statistics[:2]
                train_confusion_matrix = train_statistics[2]
                train_misclassified = train_statistics[3]
                end = time.time()

                val_statistics = model_runner.process_data(val_info,
                                                           placeholders,
                                                           runnables,
                                                           val_writer,
                                                           num_classes,
                                                           False,
                                                           sess)

                avg_val_loss, val_acc = val_statistics[:2]
                val_confusion_matrix = val_statistics[2]
                val_misclassified = val_statistics[3]
                val_features = val_statistics[4]

                if has_test_data:
                    test_statistics = model_runner.process_data(test_info,
                                                                placeholders,
                                                                runnables,
                                                                test_writer,
                                                                num_classes,
                                                                True,
                                                                sess)

                    avg_test_loss, test_acc = test_statistics[:2]
                    test_confusion_matrix = test_statistics[2]
                    test_misclassified = test_statistics[3]
                    test_features = test_statistics[4]

                    print("train_loss: " + str(avg_train_loss) + " val_loss: " + str(avg_val_loss) +
                          " test_loss: "  + str(avg_test_loss))
                else:
                    print("train_loss: " + str(avg_train_loss) + " val_loss: " + str(avg_val_loss))

                print("train_acc: " + str(train_acc))
                print("val_acc: " + str(val_acc))
                if has_test_data:
                    print("test_acc: ", str(test_acc))
                print("Training Time: ", str(end - start))

                # New best validation loss: report details, persist artifacts,
                # and reset the early-stopping counter
                if avg_val_loss < min_val_loss:
                    min_val_loss = avg_val_loss
                    print("Training Confusion:\n", train_confusion_matrix)
                    print("Validation Confusion:\n", val_confusion_matrix)
                    if has_test_data:
                        print("Test Confusion:\n", test_confusion_matrix)

                        utils.store_misclassified(train_misclassified,
                                                  val_misclassified,
                                                  test_misclassified=test_misclassified)
                    else:
                        utils.store_misclassified(train_misclassified,
                                                  val_misclassified)

                    # Pickle the extracted features under a name that encodes
                    # the label mapping, e.g. "cat_0_dog_1.p"
                    pickle_name = [k + "_{}".format(v) for k, v in label_dict.items()]
                    pickle_name = '_'.join(pickle_name)
                    pickle_name = pickle_name + ".p"
                    final_features = {'val': val_features}
                    if has_test_data:
                      final_features['test'] = test_features
                    with open(os.path.join(features_dir, pickle_name), 'wb') as outf:
                      pickle.dump(final_features, outf, protocol=pickle.HIGHEST_PROTOCOL)

                    patience = 0
                else:
                    patience += 1
Beispiel #26
0
def test_all(data_testset, reranking=True):
    """Run inference with every model checkpoint and score each output with BLEU.

    Preprocesses the test data, runs the batch T2T inference script (which
    produces one prediction file per checkpoint), relexicalizes every
    prediction file (optionally reranking beams by slot alignment first),
    and finally runs the tensor2tensor BLEU script over the directory of
    relexicalized predictions.

    Args:
        data_testset: Identifier/path of the test set to load.
        reranking: If True and multiple beams are present, rerank them with
            postprocessing.rerank_beams before taking the top beam.

    Side effects: reads/writes files under the config.* directories and
    invokes external scripts via os.system.
    """
    test_source_file = os.path.join(config.DATA_DIR, 'test_source_dict.json')
    test_target_file = os.path.join(config.DATA_DIR, 'test_target.txt')

    # Prepare the output folder
    if not os.path.exists(config.PREDICTIONS_BATCH_LEX_DIR):
        os.makedirs(config.PREDICTIONS_BATCH_LEX_DIR)

    print('Loading test data...', end=' ')
    sys.stdout.flush()

    # Load and preprocess the test data
    data_loader.load_test_data(data_testset)

    print('DONE')
    print('Predicting...')
    sys.stdout.flush()

    # Run inference for the test samples using each checkpoint of the model
    os.system('bash ' + os.path.join(config.T2T_DIR, 't2t_test_all_script.sh'))

    print('DONE')
    print('Evaluating...')
    sys.stdout.flush()

    # Relexicalize all prediction files
    for predictions_file in glob.glob(
            os.path.join(config.PREDICTIONS_BATCH_DIR, '*')):
        predictions_final_file = os.path.join(
            config.PREDICTIONS_BATCH_LEX_DIR,
            os.path.basename(predictions_file))

        # Read in the beams and their log-probs as produced by the T2T beam search
        df_predictions = pd.read_csv(predictions_file,
                                     sep='\t',
                                     header=None,
                                     encoding='utf8')
        # More than one column means beam texts and their scores alternate
        beams_present = len(df_predictions.columns) > 1

        if beams_present:
            # Combine beams and their corresponding scores into tuples
            # (even columns hold beam text, odd columns the matching log-prob)
            beams = []
            for i in range(0, len(df_predictions.columns), 2):
                beams.append(
                    list(
                        zip(df_predictions.iloc[:, i],
                            df_predictions.iloc[:, i + 1])))

            # Transpose the list of beams so as to have all beams of a single sample per line
            beams = list(map(list, zip(*beams)))
        else:
            # Single prediction per sample: wrap it as a one-element beam list
            beams = [[(beam, )] for beam in df_predictions.iloc[:, 0].tolist()]

        # Score the slot alignment in the beams, and rerank the beams accordingly
        if reranking and beams_present:
            beams = postprocessing.rerank_beams(beams)

        # Postprocess the generated utterances and save them to a new file
        with io.open(test_source_file, 'r', encoding='utf8') as f_test_source, \
                io.open(predictions_final_file, 'w', encoding='utf8') as f_predictions_final:

            # MRs are loaded in file order (OrderedDict) to stay aligned with the beams
            mrs = json.load(f_test_source, object_pairs_hook=OrderedDict)
            # Keep only the top-ranked beam of each sample
            predictions = [
                prediction_beams[0][0] for prediction_beams in beams
            ]
            predictions_final = postprocessing.finalize_utterances(
                predictions, mrs)

            for prediction in predictions_final:
                f_predictions_final.write(prediction + '\n')

    # Depending on the OS, the tensor2tensor BLEU script might require a different way of executing
    if sys.executable is not None:
        bleu_script = 'python ' + os.path.join(os.path.dirname(sys.executable),
                                               't2t-bleu')
    else:
        bleu_script = 't2t-bleu'

    # Run the tensor2tensor internal BLEU script over every relexicalized file
    os.system(bleu_script + ' --translations_dir=' +
              config.PREDICTIONS_BATCH_LEX_DIR + ' --reference=' +
              test_target_file + ' --event_dir=' +
              config.PREDICTIONS_BATCH_EVENT_DIR)

    print('DONE')
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier
# load data (provided method)
# valid_rate=0.1 holds back 10% of the training file as a validation split;
# is_df=True presumably returns pandas DataFrames — TODO confirm in data_loader
train_data, valid_data = data_loader.load_train_data('Data/adult.data',
                                                     valid_rate=0.1,
                                                     is_df=True)
test_data = data_loader.load_test_data('Data/adult.test', is_df=True)

#update fields
native_country_dict = {
    ' ?': '?',
    ' Cambodia': 'Africa',
    ' Canada': 'North America',
    ' China': 'Asia',
    ' Columbia': 'Latin America',
    ' Cuba': 'Latin America',
    ' Dominican-Republic': 'Latin America',
    ' Ecuador': 'Latin America',
    ' El-Salvador': 'Latin America',
    ' England': 'Europe',
    ' France': 'Europe',
    ' Germany': 'Europe',
import augmentation_methods as am
import data_loader as dl
import word_vectors as wv  
import data_preprocessing as dp 
import classifier as cl   
import testing as t
import visualization as vis 

if __name__ == "__main__":
    # get original data in tokenized form
    orig_corpus, y_train_orig = dl.load_train_data()
    test_corpus, y_test_orig = dl.load_test_data()

    # develop word vectors
    word_vectors = wv.get_word_vectors(orig_corpus)

    # augment corpi
    corpus_method_1, y_train_method_1 = am.method_1(orig_corpus.copy(), y_train_orig.copy(), word_vectors)
    corpus_method_2, y_train_method_2 = am.method_2(orig_corpus.copy(), y_train_orig.copy(), word_vectors)
    corpus_method_3, y_train_method_3 = am.method_3(orig_corpus.copy(), y_train_orig.copy(), word_vectors)

    # process data so they are in a form(td-idf) that can be fed to classifiers
    X_orig, vectorizer = dp.process_corpus_orig(orig_corpus)
    X_method_1 = dp.process_corpus(corpus_method_1, vectorizer)
    X_method_2 = dp.process_corpus(corpus_method_2, vectorizer)
    X_method_3 = dp.process_corpus(corpus_method_3, vectorizer)
    X_test = dp.process_corpus(test_corpus, vectorizer)

    # train classifiers on original corpus and all augmented corpi
    classifier_orig = cl.train_classifier_bayes(X_orig, y_train_orig)
    classifier_method_1 = cl.train_classifier_bayes(X_method_1, y_train_method_1)
Beispiel #29
0

def get_embedding_matrix():
    """Build a (max_words, embedding_dim) matrix of pretrained GloVe vectors.

    Row i holds the pretrained vector for the word with index i in the
    module-level ``word_index``; words without a pretrained vector, or with
    an index >= ``max_words``, keep an all-zero row.
    """
    pretrained = glove_word_embeddings.load_embeddings_index()
    matrix = np.zeros((max_words, embedding_dim))
    for word, idx in word_index.items():
        if idx >= max_words:
            continue  # only the most frequent max_words words are kept
        vector = pretrained.get(word)
        if vector is not None:
            matrix[idx] = vector
    return matrix


# texts, labels = data_loader.load_data()
# Loading test data for test results with loading weights
texts, labels = data_loader.load_test_data()
print("Test data loaded, length: ", len(texts))
print("Test data loaded, length: ", len(labels))

# Fit a tokenizer on the loaded texts and convert them to integer sequences.
# NOTE(review): the tokenizer is fitted on the *test* texts here — the word
# index will only match a model restored from saved weights if the training
# vocabulary was compatible; confirm upstream.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts=texts)
sequences = tokenizer.texts_to_sequences(texts)

#word_index = tokenizer.word_index
#print("Found %s unique tokens." % len(word_index))

#data = pad_sequences(sequences, maxlen=max_len)

#labels = np.asarray(labels)

#print("Shape of data tensor:", data.shape)