Example No. 1
def main(unused_argv):

    ROOT_PATH = "."  # Denotes the current working directory
    TRAIN_DATA_DIRECTORY = os.path.join(ROOT_PATH, "/root/leaf_image/DATA/training")
    TEST_DATA_DIRECTORY = os.path.join(ROOT_PATH, "/root/leaf_image/DATA/testing")

    train_data, train_labels = load_data(TRAIN_DATA_DIRECTORY)
    eval_data, eval_labels = load_data(TEST_DATA_DIRECTORY)

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir="./tmp/model")

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=50)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=100,
        hooks=[logging_hook])

    def serving_input_receiver_fn():
        """Build the serving inputs."""

        inputs = {"x": tf.placeholder(shape=[1, DEFAULT_SIZE, DEFAULT_SIZE, 1], dtype=tf.float32)}
        return tf.estimator.export.ServingInputReceiver(inputs, inputs)

    export_dir = mnist_classifier.export_savedmodel(
        export_dir_base="./model_saved/",
        serving_input_receiver_fn=serving_input_receiver_fn)

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=2,
        shuffle=False)
    print(eval_input_fn)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)

    predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data[0]},
        shuffle=False)
    prediction_results = mnist_classifier.predict(predict_input_fn)
    for i in prediction_results:
        print(i)
        print(i['classes'])
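The example above assumes a load_data helper (and a cnn_model_fn) defined elsewhere. A minimal sketch of such a loader, assuming one subdirectory per class label containing grayscale leaf images; the skimage-based resizing and the DEFAULT_SIZE value are assumptions, not the original code:

import os
import numpy as np
from skimage import io, transform

DEFAULT_SIZE = 28  # assumed to match the input size expected by cnn_model_fn

def load_data(data_directory):
    # Return (images, labels); one integer label per class subdirectory.
    images, labels = [], []
    for label, class_dir in enumerate(sorted(os.listdir(data_directory))):
        class_path = os.path.join(data_directory, class_dir)
        if not os.path.isdir(class_path):
            continue
        for file_name in sorted(os.listdir(class_path)):
            image = io.imread(os.path.join(class_path, file_name), as_gray=True)
            image = transform.resize(image, (DEFAULT_SIZE, DEFAULT_SIZE))
            images.append(image)
            labels.append(label)
    return np.asarray(images, dtype=np.float32), np.asarray(labels, dtype=np.int32)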
Example No. 2
    def _pipeline_w2v_and_recurentneuralnetwork(self, dir_dataset):

        categories = os.listdir(dir_dataset)

        # X_train
        X_train, y_train = [], []
        for category in tqdm(categories):
            path = os.path.join(dir_dataset, category, 'evc.train.en')
            X_tmp, y_tmp = load_data(path)
            for i in range(len(X_tmp)):
                X_train.append(X_tmp[i])
                y_train.append(categories.index(category))

        # X_val
        X_val, y_val = [], []
        for category in tqdm(categories):
            path = os.path.join(dir_dataset, category, 'evc.dev.en')
            X_tmp, y_tmp = load_data(path)
            for i in range(len(X_tmp)):
                X_val.append(X_tmp[i])
                y_val.append(categories.index(category))

        
        # X_test
        X_test, y_test = [], []
        for category in tqdm(categories):
            path = os.path.join(dir_dataset, category, 'evc.test.en')
            X_tmp, y_tmp = load_data(path)
            for i in range(len(X_tmp)):
                X_test.append(X_tmp[i])
                y_test.append(categories.index(category))


        # transform text to vectors with a pretrained word2vec model
        W2V = WordEmbedding().Word2Vec()
        X_train = self._text2vecs(W2V, X_train)
        X_val = self._text2vecs(W2V, X_val)
        X_test = self._text2vecs(W2V, X_test)
        

        y_train = to_categorical(y_train)
        y_val = to_categorical(y_val)

        input_dim, classes = len(X_train[0][0]), len(categories) 
        model =  rnn_text_classification(input_dim, classes).model

        model.fit(X_train, y_train, epochs = 2, batch_size = 16, validation_data=(X_val, y_val))


        y_hat = model.predict(X_test)
        y_hat = np.argmax(y_hat, axis = 1)

        print(classification_report(y_test, y_hat))
        return categories, W2V, model    
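The _text2vecs helper that builds the RNN's 3-D input is not shown. One plausible shape for it, written here as a free function; the fixed max_len padding and the gensim-style word-vector lookup are assumptions, not the original code (under this layout, len(X_train[0][0]) is the embedding dimension used as input_dim above):

import numpy as np

def text2vecs(w2v, sentences, max_len=50):
    # Map each sentence to a (max_len, embedding_dim) matrix of word vectors,
    # zero-padding or truncating so the RNN sees fixed-length sequences.
    dim = w2v.vector_size
    vectors = np.zeros((len(sentences), max_len, dim), dtype=np.float32)
    for i, sentence in enumerate(sentences):
        for j, token in enumerate(sentence.split()[:max_len]):
            if token in w2v:
                vectors[i, j] = w2v[token]
    return vectors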
Example No. 3
    def _pipeline_bow_and_neuralnetwork(self, dir_dataset):

        categories = os.listdir(dir_dataset)

        # X_train
        X_train, y_train = [], []
        for category in tqdm(categories):
            path = os.path.join(dir_dataset, category, 'evc.train.en')
            X_tmp, y_tmp = load_data(path)
            for i in range(len(X_tmp)):
                X_train.append(X_tmp[i])
                y_train.append(categories.index(category))

        # X_val
        X_val, y_val = [], []
        for category in tqdm(categories):
            path = os.path.join(dir_dataset, category, 'evc.dev.en')
            X_tmp, y_tmp = load_data(path)
            for i in range(len(X_tmp)):
                X_val.append(X_tmp[i])
                y_val.append(categories.index(category))

        
        # X_test
        X_test, y_test = [], []
        for category in tqdm(categories):
            path = os.path.join(dir_dataset, category, 'evc.test.en')
            X_tmp, y_tmp = load_data(path)
            for i in range(len(X_tmp)):
                X_test.append(X_tmp[i])
                y_test.append(categories.index(category))


        # transform text to vectors with bag-of-words
        BoW = WordEmbedding().CountVectorizer()
        X_train = BoW.fit_transform(X_train)
        X_train = X_train.toarray()
        X_val = BoW.transform(X_val).toarray()
        X_test = BoW.transform(X_test).toarray()
        

        y_train = to_categorical(y_train)
        y_val = to_categorical(y_val)

        
        model = Neural_Network(len(X_train[0]), len(categories)).model
        model.fit(X_train, y_train, epochs = 5, batch_size = 16, validation_data=(X_val, y_val))

        y_hat = model.predict(X_test)
        y_hat = np.argmax(y_hat, axis = 1)

        print(classification_report(y_test, y_hat))
        return categories, BoW, model
Example No. 4
def preprocess_dataset(classes_authorized, components, compression_method, patch_size):
    X, train_data, test_data = pp.load_data()

    train_data = pp.delete_useless_classes(train_data, classes_authorized)
    test_data = pp.delete_useless_classes(test_data, classes_authorized)
    print("Before Shuffle: ")
    pretty_print_count(train_data, test_data)
    train_data, test_data = pp.shuffle_train_test(train_data, test_data)
    print("After Shuffle: ")
    pretty_print_count(train_data, test_data)

    if compression_method is not None:
        X, pca = pp.dimensionality_reduction(X, numComponents=components, standardize=False,
                                             compression=compression_method)

    # CREATE PATCHES, DELETE 0 VALUES
    X_train, X_test, y_train, y_test = pp.patch_1dim_split(X, train_data, test_data, patch_size)

    y_train = np_utils.to_categorical(y_train, num_classes=9)
    y_test = np_utils.to_categorical(y_test, num_classes=9)

    t, v = np.unique(train_data, return_counts=True)
    print(t, v)
    t, v = np.unique(test_data, return_counts=True)
    print(t, v)

    return X, X_train, X_test, y_train, y_test
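A hypothetical invocation of the function above; the argument values are illustrative assumptions, not taken from the original project:

X, X_train, X_test, y_train, y_test = preprocess_dataset(
    classes_authorized=list(range(1, 10)),  # keep nine ground-truth classes
    components=30,                          # reduce the cube to 30 components
    compression_method="PCA",
    patch_size=5)                           # patches cut around each labelled pixel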
Example No. 5
def predict(model, X, X_test, y_test, target_names, classes_authorized, spy_colors, label_dictionary):
    classification, confusion, test_loss, test_accuracy = reports(model, X_test, y_test, target_names)
    print(classification)

    plt.figure(figsize=(13, 10))
    plot_confusion_matrix(confusion, classes=target_names,
                              title='Confusion matrix, without normalization')

    X_garbage, train_data, test_data = pp.load_data()
    y = np.add(train_data, test_data)
    y = pp.delete_useless_classes(y, classes_authorized)

    outputs = create_predicted_image(X, y, model, 5, y.shape[0], y.shape[1])

    print("PREDICTED IMAGE:")
    predict_image = spectral.imshow(classes=outputs.astype(int), figsize=(5, 5))
    label_patches = [patches.Patch(color=spy_colors[x] / 255.,
                                  label=label_dictionary[x]) for x in np.unique(y)]
    plt.legend(handles=label_patches, ncol=2, fontsize='medium',
               loc='upper center', bbox_to_anchor=(0.5, -0.05))
    plt.show()

    ground_truth = spectral.imshow(classes=y, figsize=(5, 5))
    print("IDEAL IMAGE: ")

    label_patches = [patches.Patch(color=spy_colors[x] / 255.,
                                  label=label_dictionary[x]) for x in np.unique(y)]
    plt.legend(handles=label_patches, ncol=2, fontsize='medium',
               loc='upper center', bbox_to_anchor=(0.5, -0.05))
    plt.show()
Example No. 6
def main():
    parser = ArgumentParser(description='Onset Detection Trainer')
    parser.add_argument('--network-type',
                        '-n',
                        required=True,
                        choices=['cnn', 'rnn'],
                        help='network type')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train each model for')
    req_action = parser.add_mutually_exclusive_group(required=True)
    req_action.add_argument('-t',
                            '--train',
                            type=int_range,
                            help='range of models to train')
    req_action.add_argument('-e',
                            '--evaluate',
                            type=int_range,
                            help='range of models to evaluate')
    args = parser.parse_args()

    # Load the eight folds
    nn, folds = load_data(args.network_type)
    print('* Created folds with sizes %s.' % list(map(len, folds)))

    if args.evaluate:
        evaluate(nn, folds, args.evaluate)
    else:
        train(nn, folds, args.train, args.epochs)
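The custom int_range argparse type is not shown; a plausible sketch (an assumption) that accepts either a single index such as "3" or an inclusive range such as "2-5":

from argparse import ArgumentTypeError

def int_range(value):
    # Parse '3' or '2-5' into a range object used to select model indices.
    try:
        if '-' in value:
            start, end = value.split('-', 1)
            return range(int(start), int(end) + 1)
        index = int(value)
        return range(index, index + 1)
    except ValueError:
        raise ArgumentTypeError("expected an integer or a range such as '2-5'")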
Example No. 7
def find_xgb_best_parameters(test_size=0.2, n_iter_search=20, X=None, y=None):
    if X is None or y is None:
        X, y = pr_kaggle.load_data(cat2vectors=True)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X,
                                                    y,
                                                    test_size=test_size,
                                                    random_state=36)
    param_dist = {
        "n_estimators": [50, 100, 250, 500],
        "max_depth": [10, 5, 15],
        "learning_rate": [0.01, 0.1, 0.0333],
        "subsample": [0.5, 1.0, 0.80],
        #"gamma": [0,0.01],
        #"min_child_weight": [0.5, 1],
        "colsample_bytree": [1.0, 0.5, 0.8, 0.9]
    }
    start = time()
    clf = xgb.XGBClassifier()
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       n_jobs=1)

    print(Xtrain.shape)
    random_search.fit(Xtrain, ytrain)
    print(
        "RandomizedSearchCV took %.2f seconds for %d candidates"
        " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)

    print('training', random_search.score(Xtrain, ytrain))
    print('testing', random_search.score(Xtest, ytest))
    return random_search
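A possible follow-up once the search has finished (hypothetical usage; best_params_ is the standard RandomizedSearchCV attribute):

search = find_xgb_best_parameters(test_size=0.2, n_iter_search=20)
print(search.best_params_)                            # winning hyper-parameter combination
final_clf = xgb.XGBClassifier(**search.best_params_)  # refit a fresh model with them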
Example No. 8
def load_all_paths():
    train_filenames = ["friday_topknob_bottomknob_switch_slide_0_path",
                "friday_microwave_topknob_bottomknob_hinge_0_path",
                "friday_microwave_kettle_topknob_switch_0_path",
                "friday_microwave_kettle_topknob_hinge_0_path",
                "friday_microwave_kettle_switch_slide_0_path",
                "friday_microwave_kettle_hinge_slide_0_path",
                "friday_microwave_kettle_bottomknob_slide_0_path",
                "friday_microwave_kettle_bottomknob_hinge_0_path",
                "friday_microwave_bottomknob_switch_slide_0_path",
                "friday_microwave_bottomknob_hinge_slide_0_path",
                "friday_kettle_topknob_switch_slide_0_path",
                "friday_kettle_topknob_bottomknob_slide_1_path",
                "friday_kettle_switch_hinge_slide_0_path",
                "friday_kettle_bottomknob_switch_slide_0_path",
                "friday_kettle_bottomknob_hinge_slide_0_path"
                ]
    #join validation and training data
    data_filenames = ["./data/training/%s.pkl"%data_file for data_file in train_filenames]
    data_filenames.append("./data/validation/friday_microwave_topknob_bottomknob_slide_0_path.pkl")
    
    #Load data and transform into sequences
    all_paths = []
    for data_file in data_filenames:
        paths = load_data([data_file])[0]
        load_idx = list( range(0, paths['images'].shape[0], 3) )  # keep every third frame
        for key in paths:
            paths[key] = paths[key][load_idx]
        all_paths.append(paths)
        del paths
    return all_paths
Example No. 9
    def test_load_data(self):
        """
        Test splitting of ratings into texts and labels lists
        """
        (X_train, y_train), (X_test, y_test) = load_data()

        self.assertEqual((25000, ), X_train.shape,
                         "Incorrect shape of training features")
        self.assertEqual((25000, ), X_test.shape,
                         "Incorrect shape of test features")
        self.assertEqual((25000, ), y_train.shape,
                         "Incorrect shape of training labels")
        self.assertEqual((25000, ), y_test.shape,
                         "Incorrect shape of test labels")

        self.assertEqual(list, X_train.dtype,
                         "Incorrect type of values in training features")
        self.assertEqual(list, X_test.dtype,
                         "Incorrect type of values in test features")
        self.assertEqual(np.int64, y_train.dtype,
                         "Incorrect type of values in training labels")
        self.assertEqual(np.int64, y_test.dtype,
                         "Incorrect type of values in test labels")

        self.assertEqual(
            set([0, 1]), set(y_train),
            "Labels list contains other values than 0 or 1 in trainset")
        self.assertEqual(
            set([0, 1]), set(y_test),
            "Labels list contains other values than 0 or 1 in testset")
Example No. 10
def run():
    print("Loading data ...")
    X_train, Y_train, X_test, submission_file_content = preprocessing.load_data()

    print("Performing conversion ...")
    X_train = preprocess_images(X_train)
    X_test = preprocess_images(X_test)
    categorical_Y_train, encoder = preprocess_labels(Y_train)

    model = init_model(np.unique(Y_train).size)
    if not os.path.isfile(OPTIMAL_MODEL_FILE_PATH):
        print("Performing the training phase ...")

        if not os.path.isdir(MODEL_FOLDER_PATH):
            os.makedirs(MODEL_FOLDER_PATH)

        earlystopping_callback = EarlyStopping(patience=1)
        modelcheckpoint_callback = ModelCheckpoint(OPTIMAL_MODEL_FILE_PATH, save_best_only=True)
        model.fit(X_train, categorical_Y_train, batch_size=BATCH_SIZE, nb_epoch=1,
                  callbacks=[earlystopping_callback, modelcheckpoint_callback],
                  validation_split=0.2, show_accuracy=True)

    print("Loading the optimal model ...")
    model.load_weights(OPTIMAL_MODEL_FILE_PATH)

    print("Generating prediction ...")
    temp_predictions = model.predict(X_test, batch_size=BATCH_SIZE)
    prediction = encoder.inverse_transform(temp_predictions)

    print("Writing prediction to disk ...")
    submission_file_name = "Aurora_{:.4f}_{:d}.csv".format(EarlyStopping.best, int(time.time()))
    submission_file_content[preprocessing.LABEL_COLUMN_NAME_IN_SUBMISSION] = prediction
    submission_file_content.to_csv(submission_file_name, index=False)

    print("All done!")
Example No. 11
def main():
    # loads and preprocesses data. See `preprocessing.py`
    data, labels, vocabs = load_data(data_dir='./data')

    # get embedding
    # embedding = get_embedding(vocabs)
    # trains a classifier on `train` and `dev` set. See `model.py`
    for unit1 in units_1st:
        for unit2 in units_2nd:
            for em_dim in embedding_dims:
                print(
                    "first layer units, \t second layer units \t embedding dimension"
                )
                print(unit1, unit2, em_dim)
                clf = DRSClassifier(train_labels=labels['train'],
                                    dev_labels=labels['dev'],
                                    vocabs=vocabs,
                                    embedding_dim=em_dim,
                                    unit_1st=unit1,
                                    unit_2nd=unit2)
                clf.train(train_instances=data['train'],
                          dev_instances=data['dev'])

                # output model predictions on `test` set
                preds_file = "./preds.json"
                clf.predict(data['test'], export_file=preds_file)

                # measure the accuracy of model predictions using `scorer.py`
                run_scorer(preds_file)
Example No. 12
def test_gcb(Xy=None, n_estimators=100, max_depth=10, test_size=0.1):
    if Xy is None:
        X, y = pr_kaggle.load_data()
    else:
        X, y = Xy
    Xtrain, Xtest, ytrain, ytest = train_test_split(X,
                                                    y,
                                                    test_size=test_size,
                                                    random_state=36)
    dc = lambda: GBC(
        learning_rate=0.02,
        n_estimators=n_estimators,
        max_depth=max_depth,
        #    min_samples_split = 4,
        # subsample = 0.8, max_features = 0.66
    )
    clf = dc()
    check_classifier(Xtrain, ytrain, Xtest, ytest, clf)

    clf = dc()
    clfbag = BaggingClassifier(clf, n_estimators=5)
    check_classifier(Xtrain, ytrain, Xtest, ytest, clfbag)

    clf = dc()
    clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
    check_classifier(Xtrain, ytrain, Xtest, ytest, clf_isotonic)
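check_classifier is an external helper not shown here; a minimal sketch of what it presumably does (fit, then report hold-out metrics), assuming every classifier passed in supports predict_proba:

from sklearn.metrics import accuracy_score, log_loss

def check_classifier(Xtrain, ytrain, Xtest, ytest, clf):
    clf.fit(Xtrain, ytrain)
    print("accuracy: %.4f" % accuracy_score(ytest, clf.predict(Xtest)))
    print("log loss: %.4f" % log_loss(ytest, clf.predict_proba(Xtest)))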
Example No. 13
def main(unused_argv):
    # Load training and eval data
    #mnist = tf.contrib.learn.datasets.load_dataset("mnist")
    #train_data = mnist.train.images  # Returns np.array
    #train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    #eval_data = mnist.test.images  # Returns np.array
    #eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)

    train_data, train_labels = load_data(TRAIN_DATA_DIRECTORY)
    eval_data, eval_labels = load_data(TEST_DATA_DIRECTORY)

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir="./tmp/mnist_convnet_model")

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=50)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": train_data},
                                                        y=train_labels,
                                                        batch_size=100,
                                                        num_epochs=None,
                                                        shuffle=True)
    mnist_classifier.train(input_fn=train_input_fn,
                           steps=1000,
                           hooks=[logging_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": eval_data},
                                                       y=eval_labels,
                                                       num_epochs=1,
                                                       shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print('eval_results: {}'.format(eval_results))

    predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data[2]}, shuffle=False)
    prediction_results = mnist_classifier.predict(predict_input_fn)
    for i in prediction_results:
        print("i: {}".format(i))
        print("i['classes']: {}".format(i['classes']))
Example No. 14
def main():
    args = parse_args()
    train_datapath = args.train_dir
    model_save_path = args.model_save_dir

    label = load_obj(os.path.join("preprocess_file", "label"))

    [train_file, train_labels] = load_data(train_datapath, label, False)
    k_fold_num = 15
    k_fold = cut_CV_data(train_labels, k=k_fold_num)  # cross validation K-fold
    train_file = np.array(train_file)
    train_labels = np.array(train_labels)

    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)
    for i in range(k_fold_num - 1):
        train_f, train_l = np.array([]), np.array([])
        for k in range(k_fold_num):
            if k != i:
                train_f = np.concatenate((train_f, train_file[k_fold[k]]), axis=0)
                train_l = np.concatenate((train_l, train_labels[k_fold[k]]), axis=0)
        train_dataset = Car196Dataset(
            [train_f, train_l], input_transform=my_transform, is_train=True
        )
        valid_dataset = Car196Dataset(
            [train_file[k_fold[i]], train_labels[k_fold[i]]], is_train=False
        )
        train_loader = DataLoader(
            train_dataset, num_workers=4, batch_size=16, shuffle=True
        )
        valid_loader = DataLoader(
            valid_dataset, num_workers=4, batch_size=16, shuffle=False
        )

        net = torch.hub.load(
            "pytorch/vision:v0.6.0", "wide_resnet50_2", pretrained=True
        )
        net.fc = nn.Linear(2048, 196)
        net = net.to(device)
        optimizer = optim.SGD(
            net.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0045
        )
        stepLR = optim.lr_scheduler.StepLR(optimizer, 1000, gamma=0.8)

        train_early_stop(
            net,
            train_loader,
            valid_loader,
            stepLR,
            n_steps=1000,
            p=6,
            savefile=os.path.join(model_save_path, "best_model{}.pt".format(i)),
            show_acc=True,
            return_log=True,
            device=device,
        )
Example No. 15
def load_paths():
    valid_filenames = [
        "friday_microwave_topknob_bottomknob_slide_%d_path" % i
        for i in [3, 6, 8, 9, 11]
    ]
    data_filenames = [
        "./data/validation/%s.pkl" % data_file for data_file in valid_filenames
    ]
    paths = load_data(data_filenames)
    return paths
Example No. 16
def show_answer():
    data = load_data(file_path.get())
    Ans = model.predict(data)
    for val in Ans:
        if val[0] > 0.5:
            blank.insert(0, round(val[0] - 0.5 * random.random(), 2))
            blank.insert(0, ", ")
        else:
            blank.insert(0, round(0.5 * random.random(), 2))
            blank.insert(0, ", ")
Example No. 17
def main():
    data = load_data(data_dir='./data')
    ### Edit any hyperparameters here, including model type. ###
    clf = DRSClassifier(model_type='FFN', batches=64, epochs=5)
    clf.train(train_instances=[pair[0] for pair in data['train']],
              dev_instances=[pair[0] for pair in data['dev']])
    preds_file = "./preds.json"
    clf.predict(data['test'], export_file=preds_file)

    run_scorer(preds_file)
Example No. 18
def main():
    # X, y, stock_data = load_data()
    print("1. Loading data")
    X, y, sd = load_data()
    print("Finished loading data\n")

    print("2. Preprocessing tweets")
    processed_X = process_tweets(X)
    print("Finished preprocessing tweets\n")
    #
    print("3. Extracting bag-of-words features")
    X_bow_features = bow_build_X(processed_X)
    print("Finished extracting bag-of-word features\n")

    X_features_stocks = add_stock_feature(X_bow_features, sd)

    # #
    print("4. Train and test BoW NB")
    train_and_test_NB(X_bow_features, y)
    print("Finished training and testing NB\n")
    # #
    print("5. Train and test BoW NB with stocks")
    train_and_test_NB(X_features_stocks, y)
    print("Finished training and testing NB\n")
    # #
    print("6. Training BoW models")
    train_models(X_bow_features, y, True)
    print("Finished training models\n")

    print("7. Training BoW models with stocks")
    train_models(X_features_stocks, y, True)
    print("Finished training models\n")
    #
    print("8. Extracting word2vec features")
    X_tweet2vec_features = tweet2vec_build_X(X)
    print("Finished extracting word2vec features\n")
    #
    X_features_stocks = add_stock_feature(X_tweet2vec_features, sd)
    #
    print("9. Train and test w2v NB")
    train_and_test_NB(X_tweet2vec_features, y)
    print("Finished training and testing NB\n")
    #
    print("10. Train and test w2v NB with stocks")
    train_and_test_NB(X_features_stocks, y)
    print("Finished training and testing NB\n")
    #
    print("11. Training w2v models")
    train_models(X_tweet2vec_features, y)
    print("Finished training models\n")
    #
    print("12. Training w2v models with stocks")
    train_models(X_features_stocks, y)
    print("Finished training models\n")
Example No. 19
def app():
    st.title('Finder')
    step1 = st.button('Did you get the Coordinate?', key=1)
    step2 = st.button('Checking the Coordinate', key=2)
    step3 = st.button('Processing', key=3)
    step4 = st.button('Finding Place of ..', key=4)
    if step1:

        st.markdown(
            "![Alt Text](https://media.giphy.com/media/4I72kivfGWFDi3Yhkc/giphy.gif)"
        )
        st.markdown(
            "<h3 style='text-align: center; color: red;'>Coordinate :  9.16085726217318 <------> 8.807258629151487</h3>",
            unsafe_allow_html=True,
        )
    if step2:

        st.markdown(
            "![Alt Text](https://media.giphy.com/media/4PWnEOqI4DsgA699ix/giphy.gif)"
        )
        data = load_data("planet")
        st.subheader('Look at the Data')
        data_load_state = st.text('Loading data...')
        st.write(data.head(10))
        # Notify the reader that the data was successfully loaded.
        data_load_state.text('Loading data...done!')

    if step3:
        st.header('Procedures')
        st.markdown('* Find the point from the locations list')
        st.markdown(
            '* Create the model, fit it, and find the mean coordinate values')
        st.markdown('* Show the location on the map')
        st.text(' ')
        st.subheader("Locations in the map")
        image = Image.open('./graphics/map.png')
        st.image(image, caption="Yoda is  at one of them ")
        st.text(' ')
        st.markdown(
            "![Alt Text](https://media.giphy.com/media/c20UV66B7zCWA/giphy.gif)"
        )
    if step4:
        st.header('Finally')
        st.markdown('* Where is the little Baby?')
        st.text(' ')

        image = Image.open('./graphics/pla.png')
        st.image(image, caption="Do you see ?")
        st.text(' ')
        st.markdown(
            "![Alt Text](https://media.giphy.com/media/YTPO05SueTPez1Lr99/giphy.gif)"
        )
        st.balloons()
Example No. 20
def load_data(limit=0, split=0.8):
    """Load data from the IMDB dataset."""
    # Partition off part of the train data for evaluation
    #train_data, _ = thinc.extra.datasets.imdb()
    train_data = preprocessing.load_data(include_body=False)
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)  # a tuple of texts and a tuple of labels


    cats = [{'POSITIVE': bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])
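Hypothetical usage of this loader: the first tuple feeds training, the second held-out evaluation (the limit value is only an example):

(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000, split=0.8)
print("%d training and %d evaluation examples" % (len(train_texts), len(dev_texts)))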
Example No. 21
    def _pipeline_bow_and_multinomialNB(self, dir_dataset):

        categories = os.listdir(dir_dataset)

        # X_train
        X_train, y_train = [], []
        for category in tqdm(categories):
            path = os.path.join(dir_dataset, category, 'evc.train.en')
            X_tmp, y_tmp = load_data(path)
            for i in range(len(X_tmp)):
                X_train.append(X_tmp[i])
                y_train.append(y_tmp[i])

        
        # X_test
        X_test, y_test = [], []
        for category in tqdm(categories):
            path = os.path.join(dir_dataset, category, 'evc.test.en')
            X_tmp, y_tmp = load_data(path)
            for i in range(len(X_tmp)):
                X_test.append(X_tmp[i])
                y_test.append(y_tmp[i])


        # transform text to vectors with bag-of-words
        BoW = WordEmbedding().CountVectorizer()
        X_train = BoW.fit_transform(X_train)
        X_train = X_train.toarray()
        print(X_train.shape, len(y_train))

        X_test = BoW.transform(X_test).toarray()

        model = MultinomialNB_custom()
        model.fit(X_train, y_train)

        y_hat = model.predict(X_test)
        print(classification_report(y_test, y_hat))
        return BoW, model
Example No. 22
def main():

    # Load a dataset by uncommenting the required line below.
    # Run inputDataCollector first if you want to use any of these datasets.
    #     data = pp.load_data('Dataset/stocknews/RedditNews.csv')
    #     data = pp.load_data('Dataset/tweets.csv')
    #     data = pp.load_data('Dataset/news.csv')
    data = pp.load_data('Dataset/news.csv')
    data = pd.DataFrame(data)

    # Concatenate the title and summary of each news article
    text = list(data["TITLE"] + data["SUMMARY"])

    # Uncomment the below line for all the other datasets
    #     text = list(data["Text"])

    # calculate the labels for the task
    labels = np.array(list(data["Label"]))

    # preprocess the dataset
    text = pp.preprocess(text)

    #tokenize the text
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index

    # padding sequences to ensure all rows are of equal length
    max_review_length = 1000
    text = pad_sequences(sequences,
                         maxlen=max_review_length,
                         padding='pre',
                         truncating='pre')

    #split the data into 80-20 train-test
    X_train = text[:int(0.8 * len(text))]
    X_val = text[-int(0.2 * len(text)):]
    y_train = labels[:int(0.8 * len(text))]
    y_val = labels[-int(0.2 * len(text)):]

    y_train = [int(i) for i in y_train]
    y_val = [int(i) for i in y_val]

    y_train = np.array(y_train)
    y_val = np.array(y_val)

    # build the model & validate the results
    nn.build(word_index, X_train, y_train, X_val, y_val)
Example No. 23
def main(create_sub=False):
    ## Load the data
    train_df, test_df = pp.load_data()

    ## Lowercase all the text in the tweets
    train_df = pp.lowercase_df(train_df)
    test_df = pp.lowercase_df(test_df)

    nlp = spacy.load('en_core_web_lg')

    # vecs = create_word_vectors(nlp, 'linear_vectorizer_lower_case_hashtag', output=True, disable_pipes=False)
    vecs = load_word_vectors('linear_vectorizer_lower_case',
                             'training_word_vectors.npy')

    X_train, X_val, y_train, y_val = train_test_split(vecs,
                                                      train_df['target'],
                                                      test_size=0.1,
                                                      random_state=1)

    ## Hyper-parameter optimization
    # Grid_search_CV(X_train, y_train)
    # best_params = Radial_SVM_Random_search_CV(X_train, y_train, 40)

    ## Fit and run models
    model_name = "RBF_SVM_lowercase"
    # The best parameters are {'gamma': 0.001, 'C': 10000.0} with a score of 0.77
    # rbf_svm, valid_preds = radial_SVM(X_train, X_val, y_train, y_val, {'gamma': 0.1, 'C': 10.0})
    # save_model(rbf_svm, "RBF_SVM_lowercase")
    rbf_svm = load_model(model_name)
    predictions = rbf_svm.predict(X_val)

    # polynomial_SVM(X_train, X_val, y_train, y_val)

    ## Save the model and generate the performance report.
    model_dir = os.path.join(MODEL_DIR, model_name.title())
    model_eval.generate_model_report(model_name, model_dir, y_val, predictions)

    if create_sub:
        # Apply model on Test data set for submission
        with nlp.disable_pipes():
            test_vectors = np.array([
                nlp(tweet['text']).vector for idx, tweet in test_df.iterrows()
            ])
        # nlp.add_pipe(hashtag_pipe)
        # test_vectors = np.array([nlp(tweet['text']).vector for idx, tweet in train_df.iterrows()])

        preds_test = rbf_svm.predict(test_vectors)
        print(preds_test)
        create_submission("rbf_lowercase_submission.csv", preds_test, test_df)
Example No. 24
def load_data():
    x, y, vocabulary, vocabulary_inv_list = preprocessing.load_data()
    vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}

    # Shuffle data
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x = x[shuffle_indices]
    y = y[shuffle_indices]
    train_len = int(len(x) * 0.9)
    x_train = x[:train_len]
    y_train = y[:train_len]
    x_test = x[train_len:]
    y_test = y[train_len:]

    return x_train, y_train, x_test, y_test, vocabulary_inv
Example No. 25
def main():
    # loads and preprocesses data. See `preprocessing.py`
    data = load_data(data_dir='./data')

    # trains a classifier on `train` and `dev` set. See `model.py`
    clf = DRSClassifier()
    clf.train(train_instances=data['train'], dev_instances=data['dev'])

    # output model predictions on `test` set
    preds_file = "./preds.json"
    clf.predict(data['test'], export_file=preds_file)

    #print(preds)

    # measure the accuracy of model predictions using `scorer.py`
    run_scorer(preds_file)
Example No. 26
def sentiment_analysis(dataset):

    if dataset == train_set and os.path.isfile(sentiment_train_pkl):
        return load_sentiment(sentiment_train_pkl)

    if dataset == val_set and os.path.isfile(sentiment_val_pkl):
        return load_sentiment(sentiment_val_pkl)

    nltk.download('vader_lexicon', quiet=True)

    # load data from csv
    data_original = load_data(dataset)
    # print(data_original)
    #Only go through the first 10 entries of dataset - Remove for entire dataset
    # data_original = data_original.head(20)

    sid = SentimentIntensityAnalyzer()

    sentiment_score = pd.DataFrame(columns=['compound', 'neg', 'neu', 'pos'])
    story_idx = 0
    #iterate through dataframe for sentiment analysis
    for index, row in data_original.iterrows():
        #print(row)
        story_to_complete = " ".join(
            [row['sen1'], row['sen2'], row['sen3'], row['sen4']])
        #story_to_complete = "'''{0}'''".format(story_to_complete)
        # print(story_to_complete)
        scores = sid.polarity_scores(story_to_complete)
        story_idx = story_idx + 1
        if (story_idx % 10000 == 0):
            print(story_idx, "/", data_original.shape[0])
        # for key in sorted(scores):
        #     print('{0}:{1}, '.format(key, scores[key]), end='')
        sentiment_score.loc[index] = scores

    if dataset == train_set:
        with open(sentiment_train_pkl, 'wb') as output:
            pickle.dump(sentiment_score, output, pickle.HIGHEST_PROTOCOL)
    elif dataset == val_set:
        with open(sentiment_val_pkl, 'wb') as output:
            pickle.dump(sentiment_score, output, pickle.HIGHEST_PROTOCOL)

    return sentiment_score
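load_sentiment is not shown; presumably it just unpickles the cached DataFrame of VADER scores so repeated runs skip re-analysis. A minimal sketch under that assumption:

import pickle

def load_sentiment(pkl_path):
    with open(pkl_path, 'rb') as handle:
        return pickle.load(handle)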
Example No. 27
def run():
    print("Resetting the submission folder {:s} ...".format(os.path.basename(submission_folder_path)))
    shutil.rmtree(submission_folder_path, ignore_errors=True)
    os.makedirs(submission_folder_path)

    print("Loading data ...")
    X_train, Y_train, X_test, submission_file_content = preprocessing.load_data()

    print("Tuning parameters ...")
    optimal_max_depth, optimal_min_child_weight, optimal_subsample, optimal_colsample_bytree = perform_tuning(X_train, Y_train)

    print("Training ...")
    optimal_learning_rate = 0.05
    estimator = XGBClassifier(max_depth=optimal_max_depth, learning_rate=optimal_learning_rate, n_estimators=1000000,
                              min_child_weight=optimal_min_child_weight, subsample=optimal_subsample,
                              colsample_bytree=optimal_colsample_bytree, objective=OBJECTIVE)
    generate_prediction(estimator, X_train, Y_train, X_test, submission_file_content, early_stopping_rounds=200, cv_num=20)

    print("All done!")
Example No. 28
def create_model(file_path=FINAL_MLKNN_MODEL_FILE_PATH):
    """
    Creates and trains a MLkNN classifier using the optimized parameters found
    Saves this trained model to disk

    :param string file_path: specifies where the model should be saved
    :return: a trained sklearn MLkNN classifier
    """

    with open(OPTIMIZED_MODEL_PARAMETERS_FILE_PATH) as file:
        hyperparameters = json.load(file)['hyperparameters']

    question_data, music_data = preprocessing.load_data()
    question_data, music_data = preprocessing.preprocess_data(
        question_data, music_data)
    clf = MLkNN(k=hyperparameters['k'], s=hyperparameters['s'])
    clf.fit(question_data.values, music_data.values)
    pickle.dump(clf, open(file_path, 'wb'))
    return clf
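A hypothetical counterpart for loading the pickled classifier back later; the helper name is an assumption:

import pickle

def load_model(file_path=FINAL_MLKNN_MODEL_FILE_PATH):
    with open(file_path, 'rb') as model_file:
        return pickle.load(model_file)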
Example No. 29
def create_model(file_path=FINAL_XGBOOST_MODEL_FILE_PATH):
    """
    Creates and trains a OneVsRestClassifier(XGBClassifier()) using the optimized parameters found
    Saves this trained model to disk

    :param string file_path: specifies where the model should be saved
    :return: a trained OneVsRestClassifier
    """

    with open(OPTIMIZED_MODEL_PARAMETERS_FILE_PATH) as file:
        hyperparameters = json.load(file)['hyperparameters']

    question_data, music_data = preprocessing.load_data()
    question_data, music_data = preprocessing.preprocess_data(
        question_data, music_data)
    xgb_model = XGBClassifier(**hyperparameters)
    xgb_clf = OneVsRestClassifier(xgb_model, n_jobs=-1)
    xgb_clf.fit(question_data, music_data)
    pickle.dump(xgb_clf, open(file_path, 'wb'))
    return xgb_clf
Example No. 30
def main():
    args = parse_args()
    test_datapath = args.test_dir
    model_dir = args.model_dir
    save_name = args.save_name

    num2name = load_obj(os.path.join("preprocess_file", "num_to_name"))
    test_dataset = Car196Dataset(load_data(test_datapath, clean=False), is_train=False)

    test_loader = DataLoader(test_dataset, num_workers=4, batch_size=16, shuffle=False)

    net = torch.hub.load("pytorch/vision:v0.6.0", "wide_resnet50_2", pretrained=False)
    net.fc = nn.Linear(2048, 196)
    net = net.to(device)

    path_pattern = model_dir + "/**/*.*"
    files_list = glob.glob(path_pattern, recursive=True)

    csv_list = [["id", "label"]]
    tmp = 0
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        outputs = torch.zeros(
            inputs.shape[0], 196, dtype=torch.float64, device=device
        ).data
        for file_name in files_list:
            net.load_state_dict(torch.load(file_name))
            net.eval()
            outputs += nn.Softmax(dim=1)(net(inputs)).data
        _, preds = outputs.max(1)
        for index, pred in enumerate(preds):
            input_file = test_dataset.image_filenames[tmp + index]
            id_ = int(basename(input_file).split(".")[0])
            csv_list.append([id_, num2name[pred.item()]])
        tmp += index + 1

    if not os.path.exists("result"):
        os.makedirs("result")
    with open(os.path.join("result", save_name), "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(csv_list)
Example No. 31
def resume_model():

  x, y, vocabulary, vocabulary_inv = pre.load_data()
  # Randomly shuffle data
  shuffle_indices = np.random.permutation(np.arange(len(y)))
  x_shuffled = x[shuffle_indices]
  y_shuffled = y[shuffle_indices]
  x_train, x_val = x_shuffled[:-1000], x_shuffled[-1000:]
  sess = tf.Session()
  cnn = SentenceCNN(
      sequence_length=x_train.shape[1],
      num_classes=2,
      vocab_size=len(vocabulary),
      sess=sess
  )
  cnn.inference()
  cnn.train()

  # Create a saver.
  saver = tf.train.Saver()

  checkpoint_dir = os.path.abspath(os.path.join(config.out_dir, "checkpoints"))
  checkpoint_prefix = os.path.join(checkpoint_dir, "model")
  ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
  if ckpt and ckpt.model_checkpoint_path:
    # Restores from checkpoint

    saver.restore(sess, ckpt.model_checkpoint_path)
    cnn.sess = sess
    # Assuming model_checkpoint_path looks something like:
    #   /my-favorite-path/cifar10_train/model.ckpt-0,
    # extract global_step from it.

    global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]

    return cnn
  else:
    print('No checkpoint file found. Cannot resume.')
    return None
Example No. 32
def run_sparse_autoencoder(N, image_size, patch_size, prepare_data=True):
    # images_all, y, images_repr = prepare_data()

    # open training data
    print "Trainig data!"
    train_path = 'data/train_32x32.mat'
    test_path = 'data/test_32x32.mat'
    file_train = "data/pickles/train.pickle"
    file_val = "data/pickles/val.pickle"

    if prepare_data:
        X, y = prepr.load_data(train_path)
        X_test, y_test = prepr.load_data(test_path)
        prepr.normalize_and_pickle(X, y, X_test, y_test)
        print "Training data were loaded and normalized!"

    images_train = helper.unpickle_data(file_train)[:, :, :N]
    images_val = helper.unpickle_data(file_val)
    images_repr = images_val[:, :, :36]

    # theta = init.initialize_k_deep_sparse_autoencoder(patch_size, image_size)
    theta = init.init_original_model(image_size)

    # max_iter = 5
    # batch_size = 1000
    # n_batches = N // batch_size
    # print "n_batches: ", n_batches
    # learning_rate = 1e-3
    # learning_rate_decay = 0.95
    # mu = 0.9
    lambda_ = 0.001
    # iter = 0
    # v = {}
    # whole_loss_history = []
    # train_loss_history = []
    # val_loss_history = []
    # while iter < max_iter:
    #     iter += 1
    #     s = 0
    #     for b in range(n_batches):
    #         batch_begin = b * batch_size
    #         N_average = (b + 1) * batch_size
    #         batch_end = batch_begin + batch_size
    #         X_batch = images_train[:, :, batch_begin:batch_end]
    #         cost, grad = model.k_sparse_deep_autoencoder_cost_without_patches(theta, lambda_, X_batch, patch_size, image_size, batch_size, patch_size)
    #         whole_loss_history.append(cost)

    #         # momentum update
    #         for item in grad:
    #             if item not in v:
    #                 v[item] = np.zeros(grad[item].shape)
    #             v[item] = mu * v[item] - learning_rate * grad[item]
    #             theta[item] += v[item]

    #     mask = np.random.choice(N, 1000)
    #     train_subset = images_train[:, :, mask]
    #     cost_train = model.k_sparse_deep_autoencoder_cost_without_patches(theta, lambda_, train_subset, patch_size, image_size, 1000, patch_size)[0]
    #     train_loss_history.append(cost_train)
    #     cost_val = model.k_sparse_deep_autoencoder_cost_without_patches(theta, lambda_, images_val, patch_size, image_size, images_val.shape[2], patch_size)[0]
    #     val_loss_history.append(cost_val)
    #     print "Cost_train: ", cost_train, ", cost_val: ", cost_val, ", epoch: ", iter, " learning_rate: %d", (learning_rate)
    #     learning_rate *= learning_rate_decay

    # print "Check gradients!"
    # lambda_ = 0.1
    l_cost, l_grad = original_model.k_sparse_original_model(theta, lambda_, images_train, patch_size, image_size, N, 15)
#     helper.check_sparsity_of_gradients(l_grad, 'W3')
    # J = lambda x: model.k_sparse_deep_autoencoder_cost_without_patches(x, lambda_, images_train, patch_size, image_size, N, 2)
    # gradient_check.compute_grad(J, theta, l_grad)
    J = lambda x: original_model.k_sparse_original_model(x, lambda_, images_train, patch_size, image_size, N, 15)
    gradient_check.compute_grad(J, theta, l_grad)
Example No. 33
# Returns 10 movie recommendations given user's age, gender and occupation
import numpy as np
import preprocessing
import testing
from npkmeans import kmeans

# Run k means
km = kmeans(preprocessing.load_data("u.data", 1), 100)
kmeans_return = km.kCluster()
clusters = kmeans_return[0]
centroids = kmeans_return[1]
testing.calculate(clusters, centroids)

def sim_users_ratings(age, gender, occupation):
    "Calculates similar users, finds those users' centroids, averages centroid ratings and returns a dictionary of sorted movie indexes"
    
    # Load users data
    users = preprocessing.user_load_data("u.user")
    sim_users = []

    # Iterate through users, adding indexes of users with same age, gender and occupation
    for position, user in enumerate(users):
        if (age, gender, occupation) == (user[0], user[1], user[2]):
            sim_users.append(position)

    # If no exact match was found, fall back to matching on age and occupation only
    if len(sim_users) == 0:
        for position, user in enumerate(users):
            if (age, occupation) == (user[0], user[2]):
                sim_users.append(position)
        print ("No exact match found, matching for age and occupation...")
Example No. 34
tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run")

# Misc params
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load my own test data here
_, _, vocab, vocab_inv = preprocessing.load_data(is_train=True)
x_test, y_test, _, _ = preprocessing.load_data(is_train=False)
y_test = np.argmax(y_test, axis=1)
print("Vocabulary size: {:d}".format(len(vocab)))
print("Test set size: {:d}".format(len(y_test)))

print("\nEvaluating on test data..\n")

checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
Example No. 35
lambda_l2_param_search = [0.001, 0.01, 0.1, 1, 10]
embedding_dim_param_search = [64, 128, 256]

tf.flags.FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(tf.flags.FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

###########################################
# Data Preparation
###########################################

print("loading data...")
x, y, vocab, vocab_inv = preprocessing.load_data()
# Shuffle data randomly
np.random.seed(10)

# shuffled_indices = np.random.permutation(np.arange(len(y)))
# x_shuffled = x[shuffled_indices]
# y_shuffled = y[shuffled_indices]

# split train/valid set
# TODO: implement a proper cross-validation procedure for this
x_train, x_valid = x[:-500], x[-500:]
y_train, y_valid = y[:-500], y[-500:]
print("Vocabulary Size: {:d}".format(len(vocab)))
print("Train/Validation split: {:d}/{:d}".format(len(y_train), len(y_valid)))

###########################################