Example #1
def predict(hours):
    series = pp.get_data()
    X = series.values
    history = [x for x in X]
    hours_in_week = 168
    validation = pp.get_validate()
    y = validation.values

    model_fit = ARIMAResults.load('model.pkl')

    predictions = list()
    yhat = model_fit.forecast()[0]
    yhat = inverse_difference(history, yhat, hours_in_week)
    predictions.append(yhat)
    history.append(yhat)

    for i in range(1, hours):
        diff = difference(history, hours_in_week)

        model = ARIMA(diff, order=(1, 0, 0))
        model_fit = model.fit(trend='nc', disp=0)
        yhat = model_fit.forecast()[0]
        yhat = inverse_difference(history, yhat, hours_in_week)
        history.append(yhat)
        predictions.append(yhat)

    return predictions
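
The difference and inverse_difference helpers used above (and imported from arima in a later snippet on this page) are not shown here. Below is a minimal sketch of a typical seasonal-differencing pair under that assumption, with interval being the seasonal lag (168 hours in this example); the actual arima module may differ.

def difference(dataset, interval=1):
    # seasonal difference: value[t] - value[t - interval]
    return [dataset[i] - dataset[i - interval] for i in range(interval, len(dataset))]

def inverse_difference(history, yhat, interval=1):
    # undo the differencing by adding back the observation one season earlier
    return yhat + history[-interval]
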
Example #2
def preprocess(device, channels):
    device_path = os.path.join(_data_path, device)
    print('Processing data for device:', device)
    print('Number of channels:', len(channels))

    if len(channels) == 0:
        print('Invalid channel selection, skipping')
        return

    for i in range(1, 25):
        case_num = '0' + str(i) if i < 10 else str(i)
        case = f'chb{case_num}'
        file_path = os.path.join(device_path, f'{case}.hdf')

        if os.path.isfile(file_path):
            print('File already exists:', file_path)
            print('Skipping case')
            continue

        print('Processing case:', case)
        df = preprocessing.get_data(case_num, channels=channels)
        print('Resulting table:')
        print(df)

        print('Writing to disk:', case)
        print('File path:', file_path)
        df.to_hdf(file_path, 'df')
Example #3
def main():

    # load BioBERT from Hugging Face
    file_name = "giacomomiolo/biobert_reupload"
    impressions, labels = get_data()
    biobert = BioBERT(file_name, impressions, labels)

    # get train and test data
    train_data, test_data = biobert.tokenize_and_split_data()
    model = MSNR(impressions, labels, biobert)
    model.layers[0].trainable = False  # freeze BioBERT layer to only train our classifier
    epoch_accuracy = []
    per_class_epoch_accuracy = []

    for i in range(model.epochs):
        train(model, train_data[0], train_data[1], train_data[2])
        print("epoch:", i, "/ 19")

    # print accuracies
    train_accuracy = model.cat_acc.result().numpy()
    print("Keras Categorical Accuracy (train)", train_accuracy)
    results = test(model, test_data[0], test_data[1], test_data[2])
    print("per class accuracy:", results[1])
    print("# of examples per class:", results[2])
    test_accuracy = model.cat_acc.result().numpy()
    print("Keras Categorical Accuracy (test)", test_accuracy)
Example #4
def train():
    

    strategy = tf.distribute.MirroredStrategy()

    (x_train, y_train), (x_test, y_test) = get_data() 
    with open('DataLoading.txt', 'a+') as f:
        App_Logger.log(f, 'Loaded data successfully...')


    callbacks = [keras.callbacks.TensorBoard(log_dir='./logs'), 
                    keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, verbose=1),
                    keras.callbacks.ReduceLROnPlateau(monitor='accuracy', factor=0.01, verbose=1)]


    try:
        with strategy.scope():
        
            K.clear_session()
            myModel = model.create_model()
            myModel.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                            optimizer=keras.optimizers.Adam(),
                            metrics=['accuracy'])
            
        with open('DataLoading.txt', 'a+') as f:
            # model.summary() prints and returns None, so capture the lines via print_fn
            summary_lines = []
            myModel.summary(print_fn=summary_lines.append)
            App_Logger.log(f, 'Created and compiled model....\n' + '\n'.join(summary_lines))
    
        history = myModel.fit(x_train, y_train, validation_split=0.25, callbacks=callbacks, verbose=1)
        
        
        with open('train.txt', 'a+') as f:
            App_Logger.log(f, 'Training successful ' + str(history.history))
    except Exception as e:
        with open('Error.txt', 'a+') as f:
            App_Logger.log(f, e)
Example #5
def submit(model, do_ensemble=False, num_ensembles=3):
    test_X = get_data(training=False)
    print(test_X.shape)
    # plt.imshow(test_X[32].reshape((28, 28)))
    # plt.show()

    if not do_ensemble:

        results = np.argmax(model.predict(test_X), axis=1)
        print(results.shape)
        print(results[:10])
        submission = pd.DataFrame({'Label': results}, list(range(1, 28001)))
        print(submission.head())

        submission.to_csv('submissions/submission.csv', index_label='ImageId')

    if do_ensemble:
        models = ensemble(num_ensembles, 30, 2000)

        results = np.zeros((test_X.shape[0], 10))
        for i in range(len(models)):
            results = results + models[i].predict(test_X)

        results = np.argmax(results, axis=1)
        print(results.shape)
        print(results[:10])
        submission = pd.DataFrame({'Label': results}, list(range(1, 28001)))
        # submission.to_csv('submissions/ensemble_prediction.csv', index_label='ImageID')
        submission.to_csv('ensemble_prediction.csv', index_label='ImageID')
Example #6
def get_prediction_accuracy(params):
    pred = pp.get_prediction(
        params, network.S_PATH + params['name'] + '_predictions.txt')
    _, Y = pp.get_data(params, params['dset_U'])
    if pred is not None and Y is not None:
        pred, Y = pp.get_tensor(pred, Y)
        acc = get_accuracy(pred, Y)

        log("Predicted Accuracy: %f." % (acc), name=params['log_name'])
Example #7
def main():
    '''
    Read in the data, initialize your model, and train and test your model
    for one epoch. The number of training steps should be the number of
    batches you run through in a single epoch. You should receive a final
    accuracy on the testing examples of > 80%.
    :return: None
    '''

    # TODO: load MNIST train and test examples into train_inputs, train_labels, test_inputs, test_labels
    fr, km = get_data('COS071212_MOCAP.mat')

    indices = tf.range(0, len(fr))
    indices = tf.random.shuffle(indices)
    fr = tf.gather(fr, indices)
    km = tf.gather(km, indices)

    eighty_p = int(len(fr) * 0.8)

    train_inp = fr[:eighty_p]
    train_lab = km[:eighty_p]

    test_inp = fr[eighty_p:]
    test_lab = km[eighty_p:]

    # TODO: Create Model
    model = Model(29)

    # TODO: Train model by calling train() ONCE on all data
    results = 0
    final_results = 0
    num_epochs = 200
    loss_list = []
    for i in range(num_epochs):
        print("EPOCH: ", i)
        indices = tf.range(0, len(train_inp))
        indices = tf.random.shuffle(indices)

        train_inp = tf.gather(train_inp, indices)
        train_lab = tf.gather(train_lab, indices)
        print("training")
        train(model, train_inp, train_lab)

        # TODO: Test the accuracy by calling test() after running train()
        print("testing")
        results = test(model, test_inp, test_lab)
        loss_list.append(results)
        print("results: ", results)
        final_results += results

    epoch_list = tf.range(0, num_epochs)
    plt.xlabel('Epoch')
    plt.ylabel('Loss per Epoch')
    plt.title('Loss Between Predicted and Actual Kinematic Positions')
    plt.plot(epoch_list, loss_list)
    plt.show()
    print("final_results: ", final_results / num_epochs)
Example #8
def main():
    # import model and data
    X_train, X_test, y_train, y_test = get_data()
    model = get_model(data=(X_train, y_train))

    # evaluate it
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Test accuracy: {test_accuracy:.2f}")

    return model
Example #9
def get_model(data=None):
    """
    Returns a model designed to work with the fashion_mnist dataset
    """
    # get training_data
    if data is None:
        data = get_data()

    # get X_train and y_train from data
    X_train = data[0]
    y_train = data[1]
    del data

    # Initialise model
    model = tf.keras.models.Sequential()

    # Add first fully-connected hidden layer
    # Fully-connected means all nodes are connected
    model.add(
        tf.keras.layers.Dense(
            units=128,  # number of neurons
            activation='relu',  # ReLU function
            input_shape=(X_train.shape[1], )  # number of pixels = 28*28
        ))

    # Add second layer with dropout
    # Dropout layer means some nodes are not updated during
    # back-propagation
    model.add(tf.keras.layers.Dropout(0.2))

    # EXPERIMENTAL - additional layer
    model.add(tf.keras.layers.Dropout(0.4))

    # Add output layer, activated using softmax
    model.add(
        tf.keras.layers.Dense(
            units=10,  # number of classes in the dataset (i.e 0-9)
            activation='softmax'))

    # compile the model
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])

    # get a summary
    model.summary()

    # train the model
    model.fit(X_train, y_train, epochs=10)

    return model
Example #10
def main():
    # path to training dataset
    train_path = "liar_dataset/train.tsv"
    test_path = "liar_dataset/test.tsv"
    valid_path = "liar_dataset/valid.tsv"

    data_train, data_test, labels_train, labels_test, subjects_train, subjects_test, word_index, unique_events = \
        get_data(train_path, test_path, valid_path, verbose=True)
    embedding_matrix = text_extract(word_index)

    extractor = Extractor_Model(embedding_matrix)
    detector = Detector_Model()
    discriminator = Discriminator_Model(unique_events)

    train(extractor, detector, discriminator, data_train, labels_train, subjects_train)

    test(extractor, detector, discriminator, data_test, labels_test, subjects_test)
Example #11
def main(
    model,
    time_stamp,
    expl_var,
    expt,
):

    X_train, X_valid,\
        y_train, y_valid = get_data(expt)

    pca = PCABasic(expl_var)
    pca.train(X_train.reshape(cfg.num_trains[expt], -1))

    sep()
    logging.info('\nExplained Variance: {}\nNum Components: {}'.format(
        str(expl_var),
        pca.num_components,
    ))

    model_ckpt = 'ckpts/{}/models/{}_{}.pkl'.format(expt, model, marker)
    sep()
    logging.info('Saving: {}'.format(model_ckpt))
    joblib.dump(pca, model_ckpt)
Example #12
from statsmodels.tsa.arima_model import ARIMAResults
import preprocessing as pp
from arima import inverse_difference

series = pp.get_data()
hours_in_week = 168
model_fit = ARIMAResults.load('model.pkl')
yhat = model_fit.forecast()[0]
yhat = inverse_difference(series.values, yhat, hours_in_week)
print("Predicted: %d" % yhat)
validate = pp.get_validate()
print(validate[0])
Example #13

def get_accuracy(y_hat, y_pred):
    '''
    returns the accuracy
    '''
    print(y_hat.shape)
    return 100 * np.sum(
        np.array(y_pred) == np.argmax(y_hat, axis=1)) / len(y_pred)


if __name__ == '__main__':

    # 1. Read in images (training and testing) and corresponding labels
    X, X_test, y, y_test = get_data(
        folder_training="data/GTSRB/Final_Training/Images/",
        folder_testing="data/GTSRB/Final_Test/Images/")

    # 2. Split into training and validation set
    X_train, X_val, y_train, y_val = split_train_validation(X, y)

    # 3. Create model and optimizer
    model = create_model()

    # 4. Train model with cross entropy and cross validation
    trained_model = train_model(model, X_train, X_val, y_train, y_val)

    # 5. Run test set on model
    y_pred = predict_labels(trained_model, X_test)
    print(get_accuracy(y_test, y_pred))
    # 6. Print out results (Confusion matrix, accuracy, training, validation and testing error)
Example #14
from sklearn.preprocessing import StandardScaler
from keras.layers import Input, Dense
from keras.models import Model
from keras.callbacks import TensorBoard
import numpy as np
import tensorflow as tf
from keras import regularizers, optimizers, backend as K
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from preprocessing import get_data
import Utils

train, test = get_data(encoding="Hash_encoder")

train_label = train.label.values
train.drop(["label"], axis=1, inplace=True)

# 0: normal, 1: anomaly
Scaler = StandardScaler()
train = Scaler.fit_transform(train.values)[np.where(train_label == 0)]

test, ytest = Scaler.transform(test.drop(["label"], axis=1)), test.label.values


#AUTOENCODER
def fit_model(X, lr=0.001, l2=0.001, ep=100, bs=50):
    input_dim = X.shape[1]
    latent_space_size = 15
Example #15
def train_C(params):

    # -------------------
    #  Parameters
    # -------------------

    log(str(params), name=params['log_name'])

    # # Clear remaining model
    # network.clear(params['name']+'_R'+str(params['start_run']))

    # -------------------
    #  CUDA
    # -------------------

    cuda = torch.cuda.is_available()
    C_Loss = torch.nn.BCELoss()

    if cuda:
        C_Loss.cuda()
        floatTensor = torch.cuda.FloatTensor
        log("CUDA Training.", name=params['log_name'])
    else:
        floatTensor = torch.FloatTensor
        log("CPU Training.", name=params['log_name'])

    # -------------------
    #  Data scaling
    # -------------------
    '''
    XTL ... Training data labelled
    XTU ... Training data unlabelled
    
    XL  ... Labelled data
    XU  ... Unlabelled data
    XV  ... Validation data
    '''

    dset_L = params['dset_L']
    dset_V = params['dset_V']

    XTL, YTL = pp.get_data(params, dset_L)
    XV, YV = pp.get_data(params, dset_V)

    XTL = pp.scale_minmax(XTL)
    XV = pp.scale_minmax(XV)

    if params['ratio_V'] < 1.0:
        XV, YV = pp.select_random(XV, YV, params['ratio_V'])
        log("Selected %s of validation samples." %
            (format(params['ratio_V'], '0.2f')),
            name=params['log_name'])
    XV, YV = pp.get_tensor(XV, YV)

    # -------------------
    #  Load accuracy
    # -------------------

    mat_accuracy_C = network.load_R_Acc(params)

    # -------------------
    #  Start Training
    # -------------------

    YF = None
    PF = None

    for run in range(params['runs']):

        # -------------------
        #  Training Data
        # -------------------

        XL, YL = XTL, YTL

        if params['ratio_L'] < 1.0:
            XL, YL = pp.select_random(XL, YL, params['ratio_L'])
            log("Selected %s of labelled samples." %
                (format(params['ratio_L'], '0.2f')),
                name=params['log_name'])

        count_L = YL.shape[0]
        log("Number of labelled samples = %d." % (count_L),
            name=params['log_name'])

        dataloader = pp.get_dataloader(params, XL, YL)

        C = network.load_Ref(run, params)

        # -------------------
        #  Optimizers
        # -------------------

        optimizer_C = torch.optim.Adam(C.parameters(),
                                       lr=params['CLR'],
                                       betas=(params['CB1'], params['CB2']))

        # -------------------
        #  Training
        # -------------------

        if run >= params['start_run']:

            if params['oversampling']:
                XL, YL = pp.over_sampling(params, XL, YL)
                log("Oversampling: created %d new labelled samples." %
                    (XL.shape[0] - count_L),
                    name=params['log_name'])

            for epoch in range(params['epochs']):

                # Jump to start epoch
                if run == params['start_run']:
                    if epoch < params['start_epoch']:
                        continue

                running_loss_C = 0.0

                for i, data in enumerate(dataloader, 1):

                    loss_C = []

                    # -------------------
                    #  Train the classifier on real samples
                    # -------------------
                    X1, Y1 = data
                    optimizer_C.zero_grad()
                    P1 = C(X1)
                    loss = C_Loss(P1, Y1)
                    loss_C.append(loss)
                    loss.backward()
                    optimizer_C.step()

                    # -------------------
                    #  Calculate overall loss
                    # -------------------
                    running_loss_C += np.mean([loss.item() for loss in loss_C])

                # -------------------
                #  Post Epoch
                # -------------------

                logString = "[Run %d/%d] [Epoch %d/%d] [C loss: %f]" % (
                    run + 1, params['runs'], epoch + 1, params['epochs'],
                    running_loss_C / (i))
                log(logString, save=False, name=params['log_name'])

                if (epoch + 1) % params['save_step'] == 0:
                    # log("~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~|",save=False,name=params['log_name'])
                    idx = run, int(epoch / params['save_step']) + 1

                    # Predict labels
                    PV = C(XV)

                    acc_C_real = get_accuracy(PV, YV)
                    mat_accuracy_C[idx] = acc_C_real

                    logString = "[Run %d/%d] [Epoch %d/%d] [C acc: %f ]" % (
                        run + 1, params['runs'], epoch + 1, params['epochs'],
                        acc_C_real)
                    log(logString, save=True, name=params['log_name'])

                    network.save_Ref(params['name'], run, C)
                    network.save_R_Acc(params, mat_accuracy_C)

                    params['start_epoch'] = epoch + 1
                    network.save_Parameter(params)

                    # log("~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~|",save=False,name=params['log_name'])

            # End of Training Run
            params['start_run'] = run + 1
            params['start_epoch'] = 0
            network.save_Parameter(params)

        # -------------------
        #  Post Run
        # -------------------

        # Classify Validation data
        PC = C(XV).detach()

        if YF is None:
            YF = YV
            PF = PC
        else:
            YF = torch.cat((YF, YV), 0)
            PF = torch.cat((PF, PC), 0)

    # -------------------
    #  Post Training
    # -------------------

    timeline = np.arange(0, params['epochs'] + 1, params['save_step'])

    # -------------------
    #  Plot Accuracy
    # -------------------

    acc_C = np.mean(mat_accuracy_C, axis=0)

    fig, ax = plt.subplots()

    legend = []
    cmap = plt.get_cmap('gnuplot')
    indices = np.linspace(0, cmap.N, 7)
    colors = [cmap(int(i)) for i in indices]

    ax.plot(timeline, acc_C, c=colors[0], linestyle='solid')
    legend.append("Accuracy $A_C$")

    ax.set_xlim(0.0, params['epochs'])
    ax.set_ylim(0.0, 1.0)

    ax.legend(legend)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')

    ax.grid()
    save_fig(params, 'eval', fig)

    # -------------------
    #  Generate Confusion Matrix
    # -------------------

    YF = pp.one_hot_to_labels(params, YF)
    PF = pp.one_hot_to_labels(params, PF)

    con_mat = confusion_matrix(YF,
                               PF,
                               labels=None,
                               sample_weight=None,
                               normalize='true')
    plot_confusion_matrix(con_mat, params, name='C', title='Confusion matrix')

    # -------------------
    #  Log Results
    # -------------------

    log(" - " + params['name'] + ": [C acc: %f]" % (acc_C[-1]), name='results')
Example #16
# Pre-processing a document.
def preprocess_gensim(doc):
    """ preprocess raw text by tokenising and removing stop-words,special-charaters """
    doc = doc.lower()  # Lower the text.
    doc = word_tokenize(doc)  # Split into words.
    doc = [w for w in doc if not w in stop_words]  # Remove stopwords.
    doc = [w for w in doc if w.isalpha()]  # Remove numbers and punctuation.
    return doc

# Train a word2vec model with default vector size of 100
def train_word2vec(train_data, worker_no=3, vector_size=100, model_name="word2vec_model"):
    """ Trains a word2vec model on the preprocessed data and saves it. """
    if not train_data:
        print("no training data")
        return
    w2v_corpus = [preprocess_gensim(doc) for doc in train_data]
    model = Word2Vec(w2v_corpus, workers=worker_no, size=vector_size)
    model.save(model_name)
    print("Model created successfully")

# Load the Model
def load_model(path="word2vec_model"):
    """ Loads the stored word2vec model. """
    return Word2Vec.load(path)

if __name__ == "__main__":
    train_data = get_data(sys.argv[1])
    train_word2vec(train_data)
Example #17
import pygal
from preprocessing import get_data, sortListToDicts

year = 2014
filename = 'YouthLiteracyRate.csv'
wm = pygal.maps.world.World()
wm.title = 'Literacy rate, youth total (% of people ages 15-24)'
country_name, country_code, data = get_data(filename, year)
no_data, less_than_50, less_than_75, more_than_75 = sortListToDicts(
    country_code, data)

wm.add("No data", no_data)
wm.add("< 50%", less_than_50)
wm.add("< 75%", less_than_75)
wm.add("> 75 %", more_than_75)

wm.render_to_file('test.svg')
Example #18
from keras.layers import Input, Dense
from keras.models import Model
import numpy as np
import tensorflow as tf
from keras import optimizers, regularizers, backend as K

from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import seaborn as sn
# %matplotlib inline

from preprocessing import get_data
import Utils

# for NSL-KDD
train, test, indexes = get_data("multiclass")

train_label = train.label
train = train.drop(["label"], axis=1)

Scaler = StandardScaler()
train = Scaler.fit_transform(train.values)[np.where(train_label == 1)]

xtest, ytest = Scaler.transform(test.drop(["label"],
                                          axis=1)), test.label.values


def fit_model(params, X, latent=10, BS=250, ep=95):

    input_dim = X.shape[1]
    latent_space_size = latent
Example #19
import catboost as cb

from hyperparam_optimizing import (
    CATBOOST_BAYESSEARCH_PARAMS,
    CATBOOST_RANDOMSEARCH_PARAMS,
    SCORING_LIST,
    perform_bayes_search,
    perform_random_search,
)
from preprocessing import get_data
from scoring import calculate_scores

VAL_SPLIT = 0.2
data = get_data(val_split=VAL_SPLIT, apply_label_encoding=True, fillna=True)
X_train, X_val, X_test, y_train, y_val, categorical_features = (
    data["X_train"],
    data["X_val"],
    data["X_test"],
    data["y_train"],
    data["y_val"],
    data["categorical_features"],
)
clf = cb.CatBoostClassifier(
    n_estimators=200,
    learning_rate=0.05,
    metric_period=500,
    od_wait=500,
    task_type="CPU",
    depth=8,
)
Example #20
import os
from preprocessing import get_data, vectorize_data
from gridsearchcv import train
from evaluate import get_acc, print_report, caculate_confidence, predict

# Path to file
root_path = os.path.dirname(__file__)

model_path = os.path.join(root_path, "result/model.sav")
report_path = os.path.join(root_path, "result/report.xlsx")

train_file = os.path.join(root_path, "data/train.txt")
test_file = os.path.join(root_path, "data/test.txt")

# Get data
X_train, y_train = get_data(train_file)
X_test, y_test = get_data(test_file)

# Vectorizer
X_train, y_train, X_test, y_test, vectorizer, le = vectorize_data(
    X_train, y_train, X_test, y_test)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test : {X_test.shape}\n")

print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test : {y_test.shape}\n")

print(f"Ratio: {len(X_train)/len(X_test)}")

# Training
Example #21
import matplotlib
matplotlib.use('Agg')
print('\n\n\nRunning\n\n\n')

from model import CNN
from preprocessing import get_data
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split
from keras.models import load_model
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import LearningRateScheduler

X, y = get_data(amount=42000)
im_shape = X[0].shape
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.1)
print(im_shape)
iterate = False
augment = True

if augment:
    datagen = ImageDataGenerator(rotation_range=10,
                                 zoom_range=0.1,
                                 width_shift_range=0.1,
                                 height_shift_range=0.1)

annealer = LearningRateScheduler(lambda x: 1e-3 * 0.95**x)

if not iterate:

    def train_CNN(data_portion=X.shape[0], epochs=20, ensemble=False):
Example #22
#### model parameters ####
cnn_layers = [32, 64, 128]
cnn_kernels = [3, 3, 3]
cnn_dropout = [.5, .5, .5]
lstm_layers = [128]
lstm_dropout = [.5]
vector_size = 128
lr = 0.001
epochs = 20
batch_size = 64
ntest_sers = 1000
verbose = True
    
####### preprocessing and data ##########

data_csv_path = 'data/taonews.csv'
embedding_pretrained_model_path = 'data/glove.6B.100d.txt'



##################################

from preprocessing import get_data
from model import model, train

X, Y = get_data(data_csv_path, embedding_pretrained_model_path)
model = model(X, Y, cnn_layers, cnn_kernels, cnn_dropout, lstm_layers, lstm_dropout, vector_size)
# training
train(model, X, Y, lr, epochs, batch_size, ntest_sers, verbose=True)

Example #23
def main():
    X_train, X_test, y_train, y_test = get_data(type_of_data='Default')
    score = nn(X_train, X_test, y_train, y_test)
    print(score)
Example #24
from scipy.cluster.hierarchy import ward, dendrogram
import preprocessing
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.metrics.pairwise import cosine_similarity

tfidf_matrix, titles, ranks, synopses, genres, vocab_frame, terms = preprocessing.get_data(
)
dist = 1 - cosine_similarity(tfidf_matrix)

linkage_matrix = ward(dist)  # define the linkage matrix using Ward clustering on the pre-computed distances

fig, ax = plt.subplots(figsize=(15, 20))  # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles)

plt.tick_params(
    axis='x',           # changes apply to the x-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    labelbottom=False)  # no labels along the bottom edge

plt.tight_layout()  #show plot with tight layout

# save the figure
plt.savefig('ward_clusters.png', dpi=200)
Example #25
# coding: utf-8

# In[1]:

#get_ipython().system(u'jupyter nbconvert --to script Keras_Sentence_RNN.ipynb')
import preprocessing
import numpy as np

# In[2]:

import collections

sentence_max_threshold = 50000
tokenizer, max_sentence_len_word, labels, train_X, test_X, train_y, test_y = preprocessing.get_data(
    sentence_max_threshold)
print(train_X.shape, train_y.shape, len(tokenizer.word_counts))  # , len(tokenized_text)

x_count = collections.Counter()
for i in range(len(test_y)):
    x_count.update({str(test_y[i]): 1})

for key, value in sorted(x_count.items(), reverse=True):
    print(key, value, float(value) / sentence_max_threshold)

# ### Use Keras_Sentence_RNN.py to avoid time-out problem
# If the trained model runs too long, it will time out. To get around this issue, you can skip the run here and instead use Keras_Sentence_RNN.py to train and save the model, then load the saved model here to predict the data.
#

# In[5]:
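
A minimal sketch of the load-and-predict step described in the note above, assuming the training script saved its model with model.save(); the file name and variable names below are placeholders, not the notebook's actual ones.

from keras.models import load_model

sentence_rnn = load_model('keras_sentence_rnn.h5')  # hypothetical file name
pred_probs = sentence_rnn.predict(test_X)           # test_X comes from preprocessing.get_data above
pred_labels = pred_probs.argmax(axis=1)             # most likely label per sentence
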
Example #26
import networkx as nx

level = 3
numofkeys = 1  # number of main keywords; alternatively 2**level - 1 (sum of a geometric progression with common ratio 2)

from preprocessing import get_data, word_by_sent, wbys_to_word, word_to_idx, idx_by_sent
text = get_data()
wbys = word_by_sent(text)
wordlist = wbys_to_word(wbys)
wtoi = word_to_idx(wordlist)
ibys = idx_by_sent(wbys, wtoi)

from textrank import count_window, textrank_keyword
counter = count_window(ibys, 5)
mainkeywords = textrank_keyword(ibys, wordlist, numofkeys)

import visualization as vis
cnt_draw = vis.counter_draw(counter, wordlist)
IG = vis.initialGraph(cnt_draw, wordlist)
# vis.drawgraph(IG, cmap = "Blues", nodesize = 350, graphtype = None, savepath=None, show = True)

vis.communityGraph(IG)
# vis.drawgraph(IG, cmap = "Pastel1", nodesize = 350, graphtype = "community", savepath="community.png", show = False)

# energy_SG = vis.subGraph(IG, "energy")
# vis.drawgraph(energy_SG, cmap = "Oranges", nodesize = 350, graphtype = None, savepath="subgraph.png", show = False)

# energy_SCG = vis.subCommunityGraph(IG, "energy") #only after communityGraph() method
# vis.drawgraph(energy_SCG, cmap = "Pastel1", nodesize = 350, graphtype = "community", savepath="subcommunity.png", show = False)
"""
The core of this project:
Example #27
import numpy as np

from sklearn import metrics
import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import BaggingClassifier
from sklearn import tree

x_train, y_train, x_test, y_test = preprocessing.get_data()

model = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1))
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_pred = np.asarray(y_pred)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))
Example #28
import numpy as np
import preprocessing
import postprocessing
import matplotlib.pyplot as plt
import sklearn, sklearn.tree, sklearn.model_selection, sklearn.ensemble

ftcount = 531

datafile = 'Dataset/dataset.train'

train = preprocessing.get_data(datafile, ftcount)

trainm = preprocessing.mask_unused_features(train)

x = []
meany = []

for t in range(2, 11):
    results = []

x = []
meany = []
sdy = []

for t in 2, 3, 4, 5, 6, 7, 8, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 150, 180, 200, 250:
    results = []
    for i in range(1, 5):
        rf = sklearn.ensemble.GradientBoostingClassifier(max_depth=t)

        cv_rf = sklearn.model_selection.cross_val_score(rf,
                                                        trainm[:, :-1],
Example #29
import time

# Load Hyperparameters
epochs = params['epochs']
batch_size = params['batch_size']
rnn_size = params['rnn_size']
num_layers = params['num_layers']
encoding_embedding_size = params['encoding_embedding_size']
decoding_embedding_size = params['decoding_embedding_size']
learning_rate = params['learning_rate']
learning_rate_decay = params['learning_rate_decay']
min_learning_rate = params['min_learning_rate']
keep_probability = params['keep_probability']

# Preprocess data, get the vocabularies
questions, answers = get_data()
sorted_questions, sorted_answers, questionswords2int, answerswords2int = preprocess_data(
    questions, answers)

# Splitting the questions and answers into training and validation sets
training_validation_split = int(len(sorted_questions) * 0.15)
training_questions = sorted_questions[training_validation_split:]
training_answers = sorted_answers[training_validation_split:]
validation_questions = sorted_questions[:training_validation_split]
validation_answers = sorted_answers[:training_validation_split]

# Training
batch_index_check_validation_loss = (
    (len(training_questions)) // batch_size // 2) - 1
total_training_loss_error = 0
list_validation_loss_error = []
Example #30
# For correct argument parsing
def str2bool(arg):
    if isinstance(arg, bool):
        return arg
    if arg.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif arg.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
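
# Usage sketch (hypothetical flag name): registering str2bool as the argparse
# type lets a boolean flag accept "yes"/"no", "true"/"false" or "1"/"0", e.g.
#   parser.add_argument('--use_pretrained', type=str2bool, default=False)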


# Get datasets
print("Preparing data and tokenizer...")
train_data, validation_data, test_data, tokenizer = get_data()

# Initialize argument parser
parser = argparse.ArgumentParser()

# Model selection, device selection
parser.add_argument('--model',
                    type=str,
                    default="vae",
                    help='Select model to use')
parser.add_argument('--device',
                    type=str,
                    default=device,
                    help='Select which device to use')

# Standard model parameters
Example #31
import nltk

from preprocessing import get_data
from n_gram import count_n_grams,suggest_a_word

if __name__ == "__main__":
    tokenized_sentences , word_counts = get_data()
    
    print("building n-gram model 🚀🚀")
    unique_words = list(word_counts.keys())
    unigram_counts = count_n_grams(tokenized_sentences, 1)
    bigram_counts = count_n_grams(tokenized_sentences, 2) 
    print("Finshed building the model 🎯")
    
    print("Some results from the model 👀 :-")
    texts = ["how","i like","you","please","i need","give me your","allow us to"] 
    for text in texts:
        previous_tokens = nltk.word_tokenize(text)
        
        suggestion, max_prob = suggest_a_word(previous_tokens, unigram_counts,
                                              bigram_counts, unique_words, k=1.0)
        
        print("Text :",text)
        print("Suggestion :",suggestion)
        # print(f"Suggestion : {suggestion} -> {int(max_prob*100)}%")
        print("----------------------------------")