Example #1
def pre_process_to_pickle(version='large'):
    """Multi-layer Perceptron is sensitive to feature scaling;
        this function uses MinMaxScaler to scale input features between (-1,1)
    
    Returns:
        scaled: train_X, val_X
    """
    train_X, val_X, train_y, val_y = load.read_data(bert=version)
    # Convert dates to integer timestamps and expand the embedding column
    # (one list per row) into one float32 column per dimension.
    train_X.date = pd.to_datetime(train_X.date).astype('int64')
    train_X = pd.concat(
        [train_X.drop('review', axis=1),
         pd.DataFrame(train_X.review.tolist(), dtype=np.float32)],
        axis=1)
    val_X.date = pd.to_datetime(val_X.date).astype('int64')
    val_X = pd.concat(
        [val_X.drop('review', axis=1),
         pd.DataFrame(val_X.review.tolist(), dtype=np.float32)],
        axis=1)

    # Fit the scaler on the training split only to avoid data leakage.
    cols = ['ex_id', 'user_id', 'prod_id', 'rating', 'date']
    scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train_X[cols])

    train_X[cols] = scaler.transform(train_X[cols])
    val_X[cols] = scaler.transform(val_X[cols])

    path = load.get_data_path()
    with open(f'{path}train_{version}.pickle', 'wb') as file:
        pickle.dump(train_X, file)

    with open(f'{path}dev_{version}.pickle', 'wb') as file:
        pickle.dump(val_X, file)

    return train_X, val_X
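A minimal usage sketch, assuming the load module and the file names written above; the reload step simply mirrors the dump calls inside the function:

import pickle

import load

# Build and persist the scaled frames (writes train_large.pickle and
# dev_large.pickle under the data path).
train_X, val_X = pre_process_to_pickle(version='large')

# Later runs can skip preprocessing and reload the pickles directly.
path = load.get_data_path()
with open(f'{path}train_large.pickle', 'rb') as file:
    train_X = pickle.load(file)
with open(f'{path}dev_large.pickle', 'rb') as file:
    val_X = pickle.load(file)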
Example #2
def main():
    '''loads datasets via load.py, downsamples them for a quick test, then
    fits every (transformer, model) pair and prints its mean validation
    accuracy
    '''
    print("reading data")
    train_X, val_X, train_y, val_y = load.read_data()

    # downsample for testing
    print("downsampling to head(20)")
    tx = train_X.head(20).copy()
    vx = val_X.head(20).copy()
    ty = train_y.head(20).copy().values.ravel()
    vy = val_y.head(20).copy().values.ravel()

    transformers = ['cv', 'tfidf', 'w2v', 'bert']
    models = ['svm', 'nb', 'nn', 'lr']
    save_plots = True

    for t in transformers:
        # Look up the transformer/model factories by name; getattr is
        # safer and clearer than eval('kt.' + t).
        t_ = getattr(kt, t)(tx.review)
        tx_ = t_.transform(tx.review.copy())
        vx_ = t_.transform(vx.review.copy())
        for m in models:
            m_ = getattr(km, m)().fit(tx_, ty)
            print(f'mean accuracy {t}|{m}: {m_.score(vx_, vy)}')
            if save_plots:
                km.metrics(m_, vx_, vy, f'{t}-{m}')
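The kt and km modules presumably map these short names to transformer and model factories, but they are not shown. Here is a self-contained sketch of the same sweep written directly against scikit-learn; the vectorizers, models, and toy data are stand-ins, not the repo's own:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Toy corpus standing in for the review column.
docs_train = ['good product', 'bad service', 'great value', 'terrible item']
y_train = [1, 0, 1, 0]
docs_val = ['good value', 'bad item']
y_val = [1, 0]

transformers = {'cv': CountVectorizer, 'tfidf': TfidfVectorizer}
models = {'svm': LinearSVC, 'nb': MultinomialNB}

for t_name, T in transformers.items():
    t = T().fit(docs_train)  # fit the vectorizer on train only
    tx_, vx_ = t.transform(docs_train), t.transform(docs_val)
    for m_name, M in models.items():
        m = M().fit(tx_, y_train)
        print(f'mean accuracy {t_name}|{m_name}: {m.score(vx_, y_val)}')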
Example #3
def main(debug=False):
    '''loads datasets into dataframes using load.py and prints head(2) for each
    '''
    if debug:
        print("reading data")

    train_X, val_X, train_y, val_y = load.read_data(debug)

    if debug:
        print("data read")

    print(
        f'train_X: cols={train_X.columns.values}\n dtype={train_X.dtypes}\n shape={train_X.shape}\n'
    )
    print(train_X.head(2))
    print(
        f'val_X: cols={val_X.columns.values}\n dtype={val_X.dtypes}\n shape={val_X.shape}\n'
    )
    print(val_X.head(2))
    print(
        f'train_y: cols={train_y.columns.values}\n dtype={train_y.dtypes}\n shape={train_y.shape}\n'
    )
    print(train_y.head(2))
    print(
        f'val_y: cols={val_y.columns.values}\n dtype={val_y.dtypes}\n shape={val_y.shape}\n'
    )
    print(val_y.head(2))
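The four report blocks above repeat one pattern; a minimal loop that could replace them inside main() (same names, same output format):

for name, df in [('train_X', train_X), ('val_X', val_X),
                 ('train_y', train_y), ('val_y', val_y)]:
    print(f'{name}: cols={df.columns.values}\n dtype={df.dtypes}\n'
          f' shape={df.shape}\n')
    print(df.head(2))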
Example #4
def main():
    train_X, val_X, train_y, val_y = load.read_data(bert='large', debug=True)

    # Grid Search Params
    param_space = {
        'hidden_layer_sizes': [(255, 100, 50), (50, 50, 50), (255,), (100,),
                               (50,)],
        'activation': ['logistic', 'tanh', 'relu'],
        'solver': ['sgd', 'adam', 'lbfgs'],
        'alpha': 10.0 ** -np.arange(1, 4),
        'learning_rate': ['constant', 'adaptive', 'invscaling'],
        'max_iter': [100, 200, 300],
    }

    # Test Params -- soon to be Best Params, after I Grid Search
    # params = {
    #     'hidden_layer_sizes': (255,),
    #     'activation': 'relu',
    #     'solver': 'adam',
    #     'alpha': 1e-05,
    #     'learning_rate': 'adaptive',
    #     'max_iter': 300,
    # }

    # Grid search
    NN = searchNN(train_X, train_y, param_space)

    # Or single run
    # NN = MLPClassifier(**params).fit(train_X, train_y)

    path = load.get_data_path()
    with open(path + 'pickle_clf.pickle', 'wb') as file:
        pickle.dump(NN, file)

    # check train data
    y_truth, y_pred = train_y, NN.predict(train_X)
    y_prob = NN.predict_proba(train_X)
    score = NN.score(train_X, train_y)
    print(f"Train Score: {100 * score:.2f}")
    try:
        plots_probs = save_ROC_plot(y_prob, y_truth, score, 'train')
        print(f"Train ROC AUC: {plots_probs}")
    except Exception:
        pass

    # check test data
    y_truth, y_pred = val_y, NN.predict(val_X)
    y_prob = NN.predict_proba(val_X)
    score = NN.score(val_X, val_y)
    print(f"Test Score: {100 * score:.2f}")
    try:
        plots_probs = save_ROC_plot(y_prob, y_truth, score, 'test')
        print(f"Test ROC AUC: {plots_probs}")
    except Exception:
        pass

    # best_estimator_ exists only on the grid-search result; the commented
    # single-run path above would need pickle.dump(NN, file) instead.
    with open(path + 'pickle_nn.pickle', 'wb') as file:
        pickle.dump(NN.best_estimator_, file)
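searchNN is not shown in this example; assuming it wraps scikit-learn's GridSearchCV (which is what the best_estimator_ access implies), a minimal sketch would be:

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

def searchNN(train_X, train_y, param_space):
    # Exhaustively search param_space with 3-fold cross-validation; the
    # fitted result exposes best_params_, best_score_ and best_estimator_.
    search = GridSearchCV(MLPClassifier(), param_space, cv=3, n_jobs=-1,
                          verbose=1)
    return search.fit(train_X, np.ravel(train_y))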
Example #5
# -*- coding: utf-8 -*-
"""
Created on Wed Mar  3 01:51:10 2021

@author: Otto
"""
import numpy as np
import scipy
import h5py
import hdf5storage
from numpy import pi, sqrt
from load import read_data
from functions import if2gputorre3d, frequency2timedomain, qam_demod

data = read_data()
f_min = data.variables.f_min
f_max = data.variables.f_max
pulse_length = data.variables.pulse_length
carrier_freq = data.variables.carrier_freq
spatial_scaling_constant = data.variables.spatial_scaling_constant
temporal_scaling_constant = data.variables.temporal_scaling_constant
mu_0 = data.variables.mu_0
eps_0 = data.variables.eps_0
c_0 = data.variables.c_0
or_radius = data.variables.or_radius
or_buffer = data.variables.or_buffer
s_radius = data.variables.s_radius
t_shift = data.variables.t_shift
reference_amplitude = data.variables.reference_amplitude
time_series_count = data.variables.time_series_count
t_1 = data.variables.t_1
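The block above unpacks every field of data.variables by hand; assuming data.variables simply exposes these names as attributes, as shown, the same unpacking can be done in one pass with getattr:

names = ['f_min', 'f_max', 'pulse_length', 'carrier_freq',
         'spatial_scaling_constant', 'temporal_scaling_constant', 'mu_0',
         'eps_0', 'c_0', 'or_radius', 'or_buffer', 's_radius', 't_shift',
         'reference_amplitude', 'time_series_count', 't_1']

# Pull each named attribute into module-level variables in one pass.
globals().update({name: getattr(data.variables, name) for name in names})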
Example #6
def load(self):
    self.print_debug("loading data from file")
    self.train_X, self.val_X, self.train_y, self.val_y = load.read_data(
        self.debug)
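This method is shown out of its class; a minimal sketch of plausible surrounding context (the class name and the print_debug body are assumptions, not taken from the original repo):

import load  # project module providing read_data()

class Experiment:
    def __init__(self, debug=False):
        self.debug = debug

    def print_debug(self, msg):
        # Echo progress messages only when debug mode is on.
        if self.debug:
            print(msg)

    def load(self):
        self.print_debug("loading data from file")
        self.train_X, self.val_X, self.train_y, self.val_y = load.read_data(
            self.debug)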