def main():
    """Train a logistic-regression POS classifier on the Brown windows dataset.

    Loads the 'train' and 'dev' splits, compiles the training/validation
    functions via ``initialize_logistic_regression``, runs training, and
    pickles the resulting classifier to disk.
    """
    # Input dataset paths: POS windows + normalized word-by-word vectors.
    DATASET_LOCATION = '../datasets/'
    POS_DATASET_NAME = 'brown_pos_dataset.hdf5'
    POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)
    WORD_BY_WORD_MATRIX = 'brown.word-by-word.normalized.npy'
    VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX)

    # Cap on the number of instances taken from each split.
    CUTOFF = 100000

    # Where the trained model will be serialized.
    MODELS_PATH = "../trained_models/"
    MODEL_NAME = "logistic_regression_model.pkl"
    save_path = os.path.join(MODELS_PATH, MODEL_NAME)

    # Build the train and dev datasets.
    train_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                                 which_sets=['train'], cutoff=CUTOFF)
    dev_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                               which_sets=['dev'], cutoff=CUTOFF)

    # Get the functions and params that we need for our models.
    initialization_data = initialize_logistic_regression(
        train_dataset, dev_dataset, learning_rate=0.1, batch_size=100)
    (classifier, train_model_func, validate_model_func,
     n_train_batches, n_valid_batches) = initialization_data

    # Run the training loop, validating as we go.
    train_model(train_model_func, n_train_batches,
                validate_model=validate_model_func,
                n_valid_batches=n_valid_batches,
                training_epochs=200)

    # Persist the trained classifier.
    save_model(classifier, save_path)
def main():
    """Train a 2-hidden-unit denoising autoencoder on the POS windows dataset
    and plot the learned 2-d codes for the first 1000 training instances,
    colored by (normalized) tag id, with a little Gaussian jitter so that
    identical codes remain visible.
    """
    DATASET_LOCATION = '../../datasets/'
    # the pos dataset consists of windows around words
    POS_DATASET_NAME = 'brown_pos_dataset.hdf5'
    POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)
    CORPUS_INDICES = 'brown_pos_dataset.indices'  # NOTE(review): unused in this function
    WORD_BY_WORD_MATRIX = 'brown.word-by-word.normalized.npy'

    # load the training data
    VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX)
    CUTOFF = 10000
    train_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                                 which_sets=['train'], cutoff=CUTOFF)

    # compile the training/validation functions for the dA
    initialization_data = initialize_dA(train_dataset, learning_rate=0.1,
                                        corruption_level=0.3, batch_size=50,
                                        n_hidden=2)
    (classifier, train_model_func, validate_model_func,
     n_train_batches, n_valid_batches) = initialization_data

    train_model(train_model_func, n_train_batches,
                validate_model=validate_model_func,
                n_valid_batches=n_valid_batches,
                training_epochs=10)

    # make a theano function to get predictions from a trained model
    training_data = theano.tensor.matrix('training_X')
    predictions = classifier.predict(training_data)
    get_predictions = theano.function([training_data], predictions)

    # get predictions and evaluate
    p = get_predictions(train_dataset[0].get_value())

    # get train_y without the cast
    train_y = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                           which_sets=['train'], cast_y=False,
                           cutoff=CUTOFF)[1].get_value().astype('int32')

    # take the first 1000 instances and map tag ids into [0, 1] for coloring
    CUTOFF_BEGIN = 0
    CUTOFF_END = 1000
    y_vals = train_y[CUTOFF_BEGIN:CUTOFF_END]
    norm_y_vals = y_vals / float(np.amax(y_vals))

    # small Gaussian jitter on each axis so overlapping points stay visible
    num_points = CUTOFF_END - CUTOFF_BEGIN
    jitter1 = np.random.normal(loc=0.0, scale=0.05, size=num_points)
    jitter2 = np.random.normal(loc=0.0, scale=0.05, size=num_points)
    x1 = p[CUTOFF_BEGIN:CUTOFF_END, 0] + jitter1
    x2 = p[CUTOFF_BEGIN:CUTOFF_END, 1] + jitter2

    plt.scatter(x1, x2, c=norm_y_vals, s=20)
    plt.show()
def main(): """ main """ DATASET_LOCATION = '../datasets/' POS_DATASET_NAME = 'brown_pos_dataset.hdf5' POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME) WORD_BY_WORD_MATRIX = 'brown.word-by-word.normalized.npy' VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX) CUTOFF = 10000 MODELS_PATH = "../trained_models/" MODEL_NAME = "mlp_model.pkl" classifier = os.path.join(MODELS_PATH, MODEL_NAME) test_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH, which_sets=['test'], cutoff=CUTOFF, cast_y=False) test_X, test_y = test_dataset test_y = test_y.get_value().astype('int32') predictions = predict(classifier, test_X.get_value()) CORPUS_INDICES = 'brown_pos_dataset.indices' with open(os.path.join(DATASET_LOCATION, CORPUS_INDICES)) as indices_file: corpus_indices = cPickle.load(indices_file) # map tag ids back to strings y_test_actual = [corpus_indices['idx2tag'][tag_idx] for tag_idx in test_y] y_test_hat = [corpus_indices['idx2tag'][tag_idx] for tag_idx in predictions] # Quick Evaluation acc = sum([y==p for y,p in zip(predictions, test_y)]) / float(len(predictions)) print "ACC: {}".format((acc)) # get class names class_names = list(set(y_test_actual)) # Compute confusion matrix cm = confusion_matrix(y_test_actual, y_test_hat, labels=class_names) # Normalize the confusion matrix by row (i.e by the number of samples in each class) cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] plt.figure() plot_confusion_matrix(cm_normalized, class_names, title='Normalized confusion matrix') plt.show()
def main(): CUTOFF = 100000 DATASET_LOCATION = "../datasets/" POS_DATASET_NAME = "brown_pos_dataset.hdf5" POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME) WORD_BY_WORD_MATRIX = "brown.word-by-word.normalized.npy" VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX) MODELS_PATH = "../trained_models/" MODEL_NAME = "logistic_regression_model.pkl" classifier = os.path.join(MODELS_PATH, MODEL_NAME) test_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH, which_sets=["test"], cutoff=CUTOFF, cast_y=False) test_X, test_y = test_dataset test_y = test_y.get_value().astype("int32") predictions = predict(classifier, test_X.get_value()) CORPUS_INDICES = "brown_pos_dataset.indices" # Indexes for mapping words and tags <--> ints with open(os.path.join(DATASET_LOCATION, CORPUS_INDICES)) as indices_file: corpus_indices = cPickle.load(indices_file) # map tag ids back to strings y_test_actual = [corpus_indices["idx2tag"][tag_idx] for tag_idx in test_y] y_test_hat = [corpus_indices["idx2tag"][tag_idx] for tag_idx in predictions] # quick check of our accuracy on the test dataset acc = sum([y == p for y, p in zip(predictions, test_y)]) / float(len(predictions)) print "ACC: {}".format(acc) # get class names class_names = list(set(y_test_actual)) # Compute confusion matrix cm = confusion_matrix(y_test_actual, y_test_hat, labels=class_names) cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] plt.figure() plot_confusion_matrix(cm_normalized, class_names, title="Normalized confusion matrix") plt.show() f1_scores = sklearn.metrics.f1_score(y_test_actual, y_test_hat, labels=class_names, average=None) for class_name, score in zip(class_names, f1_scores): print "{}:\t{}".format(class_name, score)