def load_data():
    '''
    Preprocess the datasets, split the data arrays into X's and y's.
    '''
    file_name1 = "yeast.dat"
    data1 = preprocessing(file_name1)
    Xdat1 = data1[:, :103]
    ydat1 = data1[:, 103:]
    n_dat1, _ = Xdat1.shape

    file_name2 = "scene.dat"
    data2 = preprocessing(file_name2)
    Xdat2 = data2[:, :294]
    ydat2 = data2[:, 294:]
    n_dat2, _ = Xdat2.shape

    #split the yeast data into training and test set
    proportion = 0.8
    n1 = math.floor(proportion * n_dat1)
    Xtrain1, ytrain1 = Xdat1[:n1, :], ydat1[:n1, :]
    Xtest1, ytest1 = Xdat1[n1:, :], ydat1[n1:, :]

    #split the scene data into training and test set
    n2 = math.floor(proportion * n_dat2)
    Xtrain2, ytrain2 = Xdat2[:n2, :], ydat2[:n2, :]
    Xtest2, ytest2 = Xdat2[n2:, :], ydat2[n2:, :]

    return (normalize(Xtrain1), ytrain1, normalize(Xtest1), ytest1,
            normalize(Xtrain2), ytrain2, normalize(Xtest2), ytest2)
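# Note: a minimal alternative sketch of the same 80/20 split using scikit-learn's
# train_test_split instead of manual slicing; X and y stand for one dataset's
# feature/label arrays and this helper is not part of the original snippet.
from sklearn.model_selection import train_test_split

def split_80_20(X, y):
    # shuffle=False reproduces the sequential 80/20 split used above
    return train_test_split(X, y, train_size=0.8, shuffle=False)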
Example No. 2
def getInputData():
    # save in pickle
    fileName = 'emnist.p'

    if exists(fileName):
        # load pickle file
        trainData, trainLabel, one_hot_trainLabel, testData = pickle.load(
            open(fileName, mode='rb'))
        return trainData, trainLabel, one_hot_trainLabel, testData
    else:
        emnist = spio.loadmat("data/emnist-digits.mat")

        # load training dataset
        x_train = emnist["dataset"][0][0][0][0][0][0]
        x_train = x_train.astype(np.float32)

        # load training labels
        y_train = emnist["dataset"][0][0][0][0][0][1]
        # load test dataset
        x_test = emnist["dataset"][0][0][1][0][0][0]
        x_test = x_test.astype(np.float32)

        # load test labels
        y_test = emnist["dataset"][0][0][1][0][0][1]

        train_labels = y_train
        test_labels = y_test

        x_train = x_train.reshape(x_train.shape[0], 1, 28, 28, order="A")
        x_test = x_test.reshape(x_test.shape[0], 1, 28, 28, order="A")

        x_train = x_train.reshape(x_train.shape[0], 28 * 28)
        x_test = x_test.reshape(x_test.shape[0], 28 * 28)

        train = pd.DataFrame(y_train,
                             columns=["label"]).join(pd.DataFrame(x_train))
        test = pd.DataFrame(y_test,
                            columns=["label"]).join(pd.DataFrame(x_test))

        # cast to numpy array
        trainData = train.values[:, 1:]
        trainLabel = train.values[:, 0]
        testData = x_test

        processedTrainData = preprocessing(trainData)
        processedTestData = preprocessing(testData)
        one_hot_trainLabel = one_hot_encoding(trainLabel, 10)

        # save data to pickle so later calls can take the fast path above
        pickle.dump((processedTrainData, trainLabel, one_hot_trainLabel,
                     processedTestData), open(fileName, 'wb'))
        return processedTrainData, trainLabel, one_hot_trainLabel, processedTestData
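# The one_hot_encoding helper used above is not shown in this snippet; a plausible
# implementation (matching the LabelBinarizer-based version that appears in a later
# example on this page) would look like this sketch:
import numpy as np
from sklearn import preprocessing as skpre

def one_hot_encoding(labels, numberOfClass):
    # binarize integer labels 0..numberOfClass-1 into a one-hot matrix
    lb = skpre.LabelBinarizer()
    lb.fit(range(numberOfClass))
    return lb.transform(np.ravel(labels))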
Example No. 3
def get_tfidf_train_test(csv_filename, threshold=4.0):

    restaurants = pd.read_csv(csv_filename, encoding='utf-8', low_memory=False)

    review_list = restaurants['text']

    cleaned_list = [x for x in review_list if str(x) != 'nan']
    cleaned_df = pd.DataFrame(cleaned_list, columns=['review'])

    #update after preprocessing
    cleaned_df = preprocessing(cleaned_df, 'review')

    #set the targets based on the star-rating threshold (default 4.0)
    y = restaurants['stars'].apply(lambda u: 1 if u >= threshold else -1)

    #get the training and test data ready
    #TODO: kfold cv for train/test split
    y = np.array(y)

    X_train, X_test, y_train, y_test = train_test_split(cleaned_df['review'],
                                                        y,
                                                        shuffle=True,
                                                        random_state=123)

    tfidf = TfidfVectorizer(strip_accents='ascii', stop_words='english')

    tfidf.fit(X_train)

    X_train = tfidf.transform(X_train).toarray()
    X_test = tfidf.transform(X_test).toarray()

    return X_train, y_train, X_test, y_test
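# A hedged usage sketch: fit a simple classifier on the TF-IDF features returned above.
# The csv path 'restaurant_reviews.csv' is hypothetical, not taken from the original code.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_tr, y_tr, X_te, y_te = get_tfidf_train_test('restaurant_reviews.csv', threshold=4.0)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_tr, y_tr)
print(accuracy_score(y_te, clf.predict(X_te)))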
def intent_classification(test_text):

    #load pickle file for the label encoder
    label_dict = pickle.load(open("label_encoder.pkl", "rb"))

    #clean customer input
    clean_test_text = preprocessing(test_text)

    #tokenize customer input
    tokenizer = pickle.load(open('tokenizer.pkl', 'rb'))
    test_sequences = tokenizer.texts_to_sequences([clean_test_text])
    test_input = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

    #load pickle file for model
    model = pickle.load(open('model.pkl', 'rb'))

    #use the intent classification model to classify the user input
    test_predictions_probas = model.predict([test_input])
    test_predictions = test_predictions_probas.argmax(axis=-1)

    #find intent
    intent = None
    for index, item in label_dict.items():
        if item == test_predictions[0]:
            intent = index
            break
    K.clear_session()
    return intent
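# intent_classification assumes a few names defined elsewhere in its module; a plausible
# set of imports and constants (an assumption, not taken from the original file) would be:
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K

MAX_SEQUENCE_LENGTH = 50  # assumed value; must match what the tokenizer/model were trained with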
Example No. 5
def get_predict(model, img):
    processed_img = preprocessing(img)
    processed_img = np.expand_dims(processed_img, axis=0)
    out_put = model.predict(processed_img)
    out_put = out_put[..., -1]
    out_put = out_put[0]
    out_put = sklearn.preprocessing.binarize(out_put, threshold=0.5)
    out_put = out_put * 255.
    return out_put
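# Note: sklearn.preprocessing.binarize expects a 2-D array, so out_put above must be
# 2-D (e.g. an (H, W) mask). A minimal standalone check of the thresholding step:
import numpy as np
import sklearn.preprocessing

mask = sklearn.preprocessing.binarize(np.array([[0.2, 0.7], [0.51, 0.4]]), threshold=0.5)
print(mask * 255.)  # [[0. 255.], [255. 0.]]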
Example No. 6
def btnOnClick(self):
    self.hide()
    link = self.lineEdit.text()
    print(link)
    data = parser(link)
    print(data)
    new_df = preprocessing(data)
    salary = predict(new_df, self.randomForest)
    self.secondMenu = SecondMenu(salary)
    self.secondMenu.show()
Example No. 7
def predict(currentClassifier, filename_predict):
    print("Making Predictions...")
    results = preprocessing(filename_predict, False)
    model_file = 'scam_' + currentClassifier + '.pkl'
    clf_predict = joblib.load(model_file)
    y_true, y_pred = results['labels'], clf_predict.predict(
        results['features'])
    print("Accuracy Score: ", currentClassifier,
          accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print("Completed!")
def clean_comments(data_set):

    """
    Function to run the preprocessing function for each comment in the data set
    """

    cleaned_data_set = []
    for ds in data_set:
        ds.comment = preprocessing(ds.comment)
        cleaned_data_set.append((ds))
    return cleaned_data_set
Example No. 9
def linearSVM(data):

    # get scaled training and testing sets alongside their labels
    trainSet, testSet, trainLabels, testLabels = preprocessing(data)

    # initialize linear SVM using sklearn
    SVClassifier = SVC(kernel='linear')

    # train the model
    SVClassifier.fit(trainSet, trainLabels)

    # predict using test set
    predictions = SVClassifier.predict(testSet)

    # compute accuracy and confusion matrix
    total = len(testSet)
    correct = 0
    confusionMatrix = confusion_matrix(testLabels, predictions)

    # loop through predictions and compute total accuracy
    for i in range(len(predictions)):
        if testLabels[i] == predictions[i]: correct += 1

    # print total accuracy
    print(correct / total)

    # print confusion matrix
    print(confusionMatrix)

    # print classification report provided by the sklearn library
    print(classification_report(testLabels, predictions))

    # initialize FPR and TPR variables to generate ROC curve
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    # use the continuous decision-function scores of the already fitted model
    score = SVClassifier.decision_function(testSet)

    # get ROC curve data for the two classes of the binary problem
    for i in range(2):
        fpr[i], tpr[i], _ = roc_curve(testLabels, score)
        roc_auc[i] = auc(fpr[i], tpr[i])

    # generate ROC curve using matplotlib
    plt.figure()
    plt.plot(fpr[1], tpr[1])
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()
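# Since this is a binary SVM, the ROC AUC can also be computed directly from the
# decision-function scores; a small usage sketch (testLabels/score as defined above):
from sklearn.metrics import roc_auc_score

def report_auc(testLabels, score):
    # score should be the continuous decision_function output, not hard predictions
    print("AUC:", roc_auc_score(testLabels, score))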
Example No. 10
def main():
    # Reads the probeA csv file into a pandas dataframe
    probeAData = pd.read_csv('../probeA.csv')

    # This chunk of code calls orderData, which goes through the passed-in
    # headers, swapping values so that the first header holds the smallest
    # value, the middle header the middle value and the last header the largest.
    # Inspection of the data showed this ordering was the common trend, with the
    # values occasionally out of order; restoring it improved the AUC score, so it was kept
    firstChemOrdered = orderData(probeAData, 'c1', 'c2', 'c3')
    secondChemOrdered = orderData(firstChemOrdered, 'm1', 'm2', 'm3')
    thirdChemOrdered = orderData(secondChemOrdered, 'n1', 'n2', 'n3')
    fourthChemOrdered = orderData(thirdChemOrdered, 'p1', 'p2', 'p3')
    probeAData = fourthChemOrdered.copy()

    # After reordering, the data is passed to preprocessing, which returns a
    # dataframe containing the preprocessed data.
    # 'tna' is passed into the method to ensure that it isn't included in the
    # preprocessing step, as testing showed that preprocessing tna led to a
    # decrease in performance in terms of accuracy and AUC
    probeAPreprocessed = preprocessing(probeAData, 'tna')

    # Read the classA csv file into a pandas dataframe
    probeAResults = pd.read_csv('../classA.csv')

    # Here the tna column (which is dropped in the preprocessing call so that no
    # preprocessing is run on it) is concatenated with the preprocessed data and the contents
    # of the classA csv file (which contains the classification assignments)
    probeAConcatenated = pd.concat(
        [probeAData['tna'], probeAPreprocessed, probeAResults], axis=1)
    # probeAConcatenated = pd.concat([probeAScaled, probeAResults], axis=1)

    bestModel = [0, 0, 0]
    tLabel = 'class'
    # Defines the number of splits used on the training data. In this case we use 10, meaning we
    # produce 10 folds: 900 training samples, 100 validation samples per fold
    splits = 10
    # Run cross validation for depth in range 1 to 20
    for depth in range(1, 20):
        #calls crossValidation for the given depth level with a fixed number of 200 estimators
        result = crossValidation(probeAConcatenated, depth, 200, splits,
                                 tLabel)
        if (result[0] > bestModel[0]):
            bestModel = result
    model = bestModel[2]
    # Simple print statement which prints the overall best model's AUC and the depth it used.
    print("BEST MODEL PREDICTED: ")
    print(
        str(bestModel[0]) + "______" + str(bestModel[1]) + "______" +
        str(bestModel[3]))
    print(model.feature_importances_)
Example No. 11
def combineImage(randImage):  # combine 10 images to 1 image
    #preprocessing: rescale image from min to max
    def preprocessing(image):
        return min_max_scaler.fit_transform(image)

    randImage = restoreImg(randImage)
    randImage = [pad(preprocessing(img)) for img in randImage]
    img_top = randImage[0]
    for i in range(1, 5):
        img_top = np.append(img_top, randImage[i], axis=1)
    img_bottom = randImage[5]
    for i in range(6, 10):
        img_bottom = np.append(img_bottom, randImage[i], axis=1)
    return np.append(img_top, img_bottom, axis=0)
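# combineImage relies on min_max_scaler, restoreImg and pad being defined elsewhere;
# plausible definitions for the scaler and the padding helper (assumptions, not the
# original code) are sketched here:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()  # rescales each image to the [0, 1] range

def pad(image, width=2):
    # add a constant-zero border around a 2-D image so tiles are visually separated
    return np.pad(image, pad_width=width, mode='constant', constant_values=0)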
Example No. 12
    def testPreprocessing():

        fichier_data = "public_data/perso_train.data"  # fetch the training data
        datas = np.loadtxt(
            fichier_data, delimiter=" "
        )  # each value in the training file is separated by " "
        prepro = preprocessing()
        Tab = prepro.fit(datas)  # preprocess the data
        transf_Tab = prepro.transform(Tab)
        # check that the preprocessing acted on the data, i.e. whether any data were censored
        censured_data = 0
        for i in range(0, len(transf_Tab)):
            if transf_Tab[i] == 1:
                print(i, " is censored")
                censured_data += 1

        if censured_data != 0:
            return True
Example No. 13
def main():
    X, y = load_data("spambase.data")
    X = preprocessing(X)
    k = 10

    # model = svm_model()  # alternative model; the NN model below is the one evaluated
    model = NN_model()
    from sklearn.model_selection import cross_validate
    from sklearn.metrics import make_scorer, accuracy_score
    from sklearn.metrics import confusion_matrix

    def tn(y_true, y_pred):
        return confusion_matrix(y_true, y_pred)[0, 0]

    def fp(y_true, y_pred):
        return confusion_matrix(y_true, y_pred)[0, 1]

    def fn(y_true, y_pred):
        return confusion_matrix(y_true, y_pred)[1, 0]

    def tp(y_true, y_pred):
        return confusion_matrix(y_true, y_pred)[1, 1]

    scoring = {
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'tp': make_scorer(tp),
        'tn': make_scorer(tn),
        'fp': make_scorer(fp),
        'fn': make_scorer(fn)
    }
    cv_results = cross_validate(model,
                                X,
                                y,
                                scoring=scoring,
                                cv=k,
                                n_jobs=-1,
                                return_train_score=False)

    table(cv_results, k, 5)
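# cross_validate returns one 'test_<name>' array per scorer; a quick way to summarise
# the k folds (a usage sketch, not part of the original snippet):
def summarise(cv_results):
    for name in ('Accuracy', 'Precision', 'Recall', 'tp', 'tn', 'fp', 'fn'):
        scores = cv_results['test_' + name]
        print(name, scores.mean())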
Example No. 14
def gradDscent(xArr, yArr):
    if scale == True:
        xArr = preprocessing(xArr)
    xMat = np.mat(xArr)
    yMat = np.mat(yArr)
    lr = 0.03
    epochs = 50000
    costList = []
    # count the number of feature columns; there is one weight per column
    m, n = np.shape(xMat)
    # initialize the weights
    ws = np.mat(np.ones((n, 1)))
    for i in range(epochs+1):
        # matrix product of xMat and the weights
        h = sigmoid(xMat*ws)
        # compute the gradient of the error
        ws_grad = xMat.T*(h-yMat)/m
        ws = ws - lr * ws_grad
        if i % 50 == 0:
            costList.append(cost(xMat, yMat, ws))
    return ws, costList
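# gradDscent assumes sigmoid() and cost() helpers (plus a global 'scale' flag); a
# plausible pair of definitions for logistic regression on np.mat inputs (a sketch,
# not the original implementation):
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def cost(xMat, yMat, ws):
    # average negative log-likelihood (cross-entropy) over the m samples
    h = sigmoid(xMat * ws)
    m = xMat.shape[0]
    return float(-(yMat.T * np.log(h) + (1 - yMat).T * np.log(1 - h)) / m)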
def predict_note_authentication(sentence):

    w1 = preprocessing(sentence)

    list1 = []
    list1.append(w1)

    transformer = TfidfTransformer()
    loaded_vec = CountVectorizer(decode_error="replace",
                                 vocabulary=pickle.load(
                                     open("feature.pkl", "rb")))
    tfidf = transformer.fit_transform(
        loaded_vec.fit_transform(list1)).toarray()

    prediction = classifier.predict(tfidf)

    if (prediction == 0):
        return "not a Greeting"

    elif (prediction == 1):
        return "a Greeting"
Example No. 16
def logisticRegression(data):

    # get training and testing sets alongside their labels
    trainSet, testSet, trainLabels, testLabels = preprocessing(data)

    # initialize logistic regression using sklearn
    logisticRegr = LogisticRegression()

    # train the model
    logisticRegr.fit(trainSet, trainLabels)

    # predict using test set
    predictions = logisticRegr.predict(testSet)

    # print classification report provided by the sklearn library
    print(classification_report(testLabels, predictions))

    # compute accuracy
    score = logisticRegr.score(testSet, testLabels)
    print(score)

    # compute confusion matrix
    confusionMatrix = confusion_matrix(testLabels, predictions)
    print(confusionMatrix)
Example No. 17
'''
data loading
'''
print("----- Leyendo datos ...")
#the .csv files were prepared beforehand to replace ",," and "Not known" with NaN (missing values)
data_x = pd.read_csv('../data/nepal_earthquake_tra.csv')
data_y = pd.read_csv('../data/nepal_earthquake_labels.csv')
data_x_tst = pd.read_csv('../data/nepal_earthquake_tst.csv')
df_submission = pd.read_csv('../data/nepal_earthquake_submission_format.csv')

#drop the columns that are not used
data_x.drop(labels=['building_id'], axis=1, inplace=True)
data_x_tst.drop(labels=['building_id'], axis=1, inplace=True)
data_y.drop(labels=['building_id'], axis=1, inplace=True)

X, X_tst = preprocessing(data_x, data_x_tst)
y = np.ravel(data_y.values)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)

print("------ Stacking...")

estimators = [('lgbm',
               lgb.LGBMClassifier(objective='regression_l1',
                                  n_jobs=-1,
                                  n_estimators=1000,
                                  num_leaves=80,
                                  scale_pos_weight=0.05,
                                  verbose=2)),
              ('rf',
               RandomForestClassifier(random_state=123456,
Example No. 18
    lr_scores = cross_val_score(linear_model.LogisticRegression(), X, y, scoring='accuracy', cv=10)
    rf_scores = cross_val_score(RandomForestClassifier(), X.toarray(), y, scoring='accuracy', cv=10)
    logging.info("CV: Accuracy of Logistic Regression is %.4f, and Accuracy of Random Forest is %.4f." % (lr_scores.mean(), rf_scores.mean()))


if __name__ == '__main__':

    # arguments: set ndays and sector
    ndays = 1
    sector = 'Financials'
    filenameX = '%s/stock_%s_X.txt' % (DATA_DIR, sector)
    filenameY = '%s/stock_%s_Y_%sdays.txt' % (DATA_DIR, sector, ndays)

    X = openfiles(filenameX, 100)
    y = openfiles(filenameY, 1) # arg = 1: SNP500
    X, y = preprocessing(X, y, arg=1)
    ids = X['id']
    numX = X.iloc[:, 1:6].copy()
    numX.index = ids

    docs = tokenizing(list(X['text']), mode='tf') # term doc matrix
    logging.info(docs.shape)
    # docX = pd.DataFrame(docs, index=ids).to_sparse().sort_index()
    docX = pd.SparseDataFrame([pd.SparseSeries(docs[i].toarray().ravel()) for i in np.arange(docs.shape[0])],\
                index =ids).sort_index()

    X = concat([numX.sort_index(), docX], axis=1)
    # print(X[0])
    # raise
    # # X = numX.sort_index()
    y = y.sort_index()
Example No. 19


#plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
#plot_svc_decision_function(model);

# check solves with enough numerical precision

from sklearn.metrics import zero_one_loss
X,y = preprocessing(train_data, 4.0)
E_in = []
for c in C:
    clf = svm.SVC(C = c, kernel = 'poly', gamma = 1 ,degree = 2)
    clf.fit(X, y) 
    s = zero_one_loss(y, clf.predict(X), normalize=False)
    print(s)
    E_in.append(s)

plt.plot(np.log10(C),E_in)
plt.scatter(np.log10(C), E_in)
plt.show()

# Try reducing C (try C = 0.001, 0.01, 0.1). C is the penalty parameter: as C gets bigger, the model penalizes misclassification more heavily (less regularization), and so takes more time to train.

Example No. 20
'''
data loading
'''
print("----- Leyendo datos ...")
#the .csv files were prepared beforehand to replace ",," and "Not known" with NaN (missing values)
data_x = pd.read_csv('../data/nepal_earthquake_tra.csv')
data_y = pd.read_csv('../data/nepal_earthquake_labels.csv')
data_x_tst = pd.read_csv('../data/nepal_earthquake_tst.csv')
df_submission = pd.read_csv('../data/nepal_earthquake_submission_format.csv')

#drop the columns that are not used
data_x.drop(labels=['building_id'], axis=1, inplace=True)
data_x_tst.drop(labels=['building_id'], axis=1, inplace=True)
data_y.drop(labels=['building_id'], axis=1, inplace=True)

X, X_tst, selec = preprocessing(data_x, data_x_tst)
y = np.ravel(data_y.values)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)
    
print("------ XGB...")
xgboost = xgb.XGBClassifier(predictor='cpu_predictor', n_gpus=0, n_estimators = 700, eta = 0.1, max_depth = 10, verbose=2)
xgboost, y_test_lgbm = validacion_cruzada(xgboost, X, y, skf)


# Retrain on the full dataset
# The result shown here is on training data, so it will be better than on test
clf = xgboost
clf = clf.fit(X,y)
plotImp(clf, selec, X.shape[1])
y_pred_tra = clf.predict(X)
def main():
    # Defines the filename in which the model's weights/parameters have been pre-calculated and stored
    filename = "predictTNAModel.sav"
    model = None
    try:
        # If the model has been precomputed this will not throw an exception
        model = pickle.load(open(filename, 'rb'))
    except Exception as e:
        # If the model has not been precomputed, the file will not exist, cannot be opened and thus throws an error.
        # That error is caught here, the model is then calculated and stored in the given
        # filename, meaning in the future it need not be calculated again

        tLabel = 'tna'

        # Reads the probeA csv file into a pandas dataframe
        probeAData = pd.read_csv('../probeA.csv')

        # This chunk of code calls orderData, which goes through the passed-in
        # headers, swapping values so that the first header holds the smallest
        # value, the middle header the middle value and the last header the largest.
        # Inspection of the data showed this ordering was the common trend, with the
        # values occasionally out of order; restoring it improved the R2 score, so it was kept
        firstChemOrdered = orderData(probeAData, 'c1', 'c2', 'c3')
        secondChemOrdered = orderData(firstChemOrdered, 'm1', 'm2', 'm3')
        thirdChemOrdered = orderData(secondChemOrdered, 'n1', 'n2', 'n3')
        fourthChemOrdered = orderData(thirdChemOrdered, 'p1', 'p2', 'p3')
        probeAData = fourthChemOrdered.copy()

        # After reordering, the data is passed to preprocessing, which returns a
        # dataframe containing the preprocessed data.
        # 'tna' is passed into the method to ensure that it isn't included in the
        # preprocessing step, as testing showed that preprocessing tna led to a
        # decrease in performance in terms of R2
        probeAScaled = preprocessing(probeAData, tLabel)

        # A dataFrame 'probeAConcatenated' is produced containing the tna and preprocessed data,
        # which will be used for training
        probeAConcatenated = pd.concat([probeAData['tna'], probeAScaled],
                                       axis=1)
        probeATraining = probeAScaled.copy()

        # Defines number of splits/folds to use
        splits = 10

        # This creates an array of 800 alpha values to test, spaced geometrically (log scale) between 1e-8 and 2
        alphas = np.geomspace(0.00000001, 2, 800)

        # The best model is retrieved by running crossValidation on the data, with a defined
        # number of splits, with a given target label with the defined set of alphas
        bestModelResult = crossValidation(probeAConcatenated, splits, tLabel,
                                          alphas)

        model = bestModelResult[0]
        result = bestModelResult[1]
        bestAlpha = bestModelResult[2]
        print("BEST MODEL - " + str(result) + "__" + str(bestAlpha))

        #Can use TNA in prediction of t1, but need to check the effect of this.
        #On average the prediction is 0.14 out, so make a Gaussian distribution to model noise of 0.14, apply it to
        #the task 1 tna column and check the results
        pickle.dump(model, open(filename, 'wb'))

    # Now the model has either been loaded or computed we can start loading in the data needed for predictions
    # The probeB csv file is opened and read into a pandas dataFrame
    probeBData = pd.read_csv('../probeB.csv')

    #This section reorders the values in the columns to ensure that they appear in the order:
    # col1val <= col2val <= col3val
    BfirstChemOrdered = orderData(probeBData, 'c1', 'c2', 'c3')
    BsecondChemOrdered = orderData(BfirstChemOrdered, 'm1', 'm2', 'm3')
    BthirdChemOrdered = orderData(BsecondChemOrdered, 'n1', 'n2', 'n3')
    BfourthChemOrdered = orderData(BthirdChemOrdered, 'p1', 'p2', 'p3')
    probeBData = BfourthChemOrdered.copy()

    # Performs preprocessing on the probeBData
    # 'none' is passed as no column needs to be dropped (as it doesn't contain a tna column)
    probeBScaled = preprocessing(probeBData, 'none')

    # The model predicts the probabilities for the given data
    predictions = model.predict(probeBScaled)

    # The predictions are then printed to a tnaB csv file with a header of "tna"
    dataFramePredictions = pd.DataFrame(predictions, columns=["tna"])
    dataFramePredictions.to_csv('tnaB.csv', index=False)
Example No. 22
def preprocessing(data):
    '''
    output: scaled numpy array
    '''
    minV = 0
    maxV = 255
    data = (data - minV) / (maxV - minV)
    return data


def one_hot_encoding(data, numberOfClass):
    from sklearn import preprocessing
    lb = preprocessing.LabelBinarizer()
    lb.fit(range(numberOfClass))
    return lb.transform(data)


processedTrainData = preprocessing(trainData)
processedTestData = preprocessing(testData)
one_hot_trainLabel = one_hot_encoding(trainLabel, 10)

# save in pickle
fileName = 'mnist.p'
if not isfile(fileName):
    pickle.dump((processedTrainData, trainLabel, one_hot_trainLabel,
                 processedTestData), open(fileName, 'wb'))

# load pickle file
trainData, trainLabel, one_hot_trainLabel, testData = pickle.load(open(fileName, mode='rb'))


def getInputTensor(features, numberOfClass):
    '''
    Create tf.placeholder for input & label
Example No. 23
            img = make_img(drawing[draw_num])
            img = np.array(img.resize((32, 32))).reshape(32, 32, 1)
            X.append(img)
            Y.append(Y_num)
        Y_num += 1
    tmpx = np.array(X)

    Y = np.array([[i] for i in Y])
    enc = OneHotEncoder(categories='auto')
    enc.fit(Y)
    tmpy = enc.transform(Y).toarray()

    return tmpx, tmpy, class_label


X_train, Y_train, class_label = preprocessing(filenames)
print('\n', X_train.shape, Y_train.shape, '\n', class_label)
#df.head()
#print(drawing[0])
#img = make_img(drawing[1])
#plt.imshow(img)

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()  # run in TensorFlow v1 (graph) mode

learning_rate = 0.001
training_epochs = 30
batch_size = 100

X = tf.placeholder(tf.float32, [None, 32, 32, 1], name='input')
Y = tf.placeholder(tf.float32, [None, 340], name='output')
Example No. 24
def main():     
    import matplotlib.pyplot as plt 

    splitVal = 0.99
    TicksIntoFuture = 1
    TicksIntoPast = 54 #e.g. 8 days => 8[day]*24[h/day] = 192[h]
    ##the present is not included in this value, hence TicksIntoPast can be 0
    #and the window size is TicksIntoPast+1
    
    batch_sizes = 128
    epochs = 45
    
    
    print("Data will be shaped acording to tensor flows [batch, time, features] ... windows")
    
    featureListRaw = ["Date","Open","High","Low","Close"]
    #labelListTimeShifted = ["High","Low","Close"]
    labelListTimeShifted = ["Open","High","Low","Close"]
    #labelListTimeShifted = ["Open","High","Low","Close"]
    
    featListUnScale = ["Date"]
    featListScale = ["Open","High","Low","Close"]
    featureList = ["DaySin","DayCos","Open","High","Low","Close"]
    #featList.append("Day")
    #featList.append("Hour")
    
    pp = preprocessing(ticksIntoPast=TicksIntoPast,ticksIntoFuture=TicksIntoFuture)
    data = pp.pullData('BTC-h.csv',0,featureListRaw,0)
    #data = pp.pullData('dbg.csv',0,featureListRaw,0)
    
    print("\n\nShrink Data \n==============")
    UnscaleData = data[featListUnScale]
    data = data[featListScale]
    print(data)
    
        
    data = pp.scaleData(data,'standardize')
    
    
    print("\n\nTODO: ADD Additional Features that are not scaled \n==============")
    data[featListUnScale] = UnscaleData
    print(data)
    
    print("\n\nAdding Date\n==============")
    databuffer = data["Date"]
        #s[0] = '2017-07-29 03-PM'
        #s[1] = '2017-07-30 01-PM'
    try:
        databuffer = pd.to_datetime(databuffer.values).to_series()
    except:
        try:
            databuffer = pd.to_datetime(databuffer.values,format='%Y-%m-%d %I-%p').to_series()    
        except:
            print("timestamp not known")
            

    
    data["Day"] = databuffer.dt.dayofweek.values #data["Date"]
    data["Hour"] = databuffer.dt.hour.values 
    
    
    
    
    print("==============")
    
    
    #timestamp_s = date_time.map(datetime.datetime.timestamp)
    print("timestamp_s defines how many ticks per day\nBTC-h.csv has 1h tick\n==============")
    #timestamp_s = np.linspace(0, 1, num=24)
    #date_time = pd.to_datetime(df.pop('Date Time'), format='%d.%m.%Y %H:%M:%S')
    
    databuffer = data["Date"].values
    try:
        databuffer = pd.to_datetime(databuffer)
    except:
        try:
            databuffer = pd.to_datetime(databuffer,format='%Y-%m-%d %I-%p')  
        except:
            print("timestamp not known")
        
    databuffer = pd.to_datetime(databuffer, format='%d.%m.%Y %H:%M:%S')
            
    print("\n\nDataBuffer\n==============")
    print(databuffer)
    
    timestamp_s = databuffer.map(datetime.datetime.timestamp)
    print(timestamp_s)
    
    day = 24*60*60
    week = 7*day
    year = (365.2425)*day
    
    data['DaySin'] = np.sin(timestamp_s * (2 * np.pi / day))
    data['DayCos'] = np.cos(timestamp_s * (2 * np.pi / day))
    
    data['WeekSin'] = np.sin(timestamp_s * (2 * np.pi / week))
    data['WeekCos'] = np.cos(timestamp_s * (2 * np.pi / week))
    

    
    print("==============")
    print(data['WeekSin'][0:25])
    #print(data['Date'][0:25])
    print("==============\n")
    
    #print(data)
    
    
    
    
    [data,y] = pp.genForcastY(data, LabelList=labelListTimeShifted, featureList=featureList, includeAllFuturDays=False)
    [data,y] = pp.genTimeSeries(data,y)
    
    print(y)
    
    print("\n\nAmount of ouputs \n==============")
    N_outPutFeatures = y.shape[1]
    y = np.expand_dims(y, -1)
    print(y.shape)
    
    
    dataSize = int(data.shape[0] * splitVal) 
    x1_train = data[:dataSize]
    x1_eval = data[dataSize:]
    
    y1_train = y[:dataSize]
    y1_eval = y[dataSize:]
    
    print(data.shape)
    print(y.shape)
    
    print(x1_train.shape)
    print(y1_train.shape)
    print(x1_eval.shape)
    print(y1_eval.shape)
    
    print("\n\nHook up Models \n==============")
    
    m1 = model1(x1_train.shape,y1_train.shape[1])
    #m1 = m.call(x1_train,y1_train) #batch_sizes,epochs
    
    m1.compile(optimizer='adam', loss='mse')
    m1.fit(x=x1_train, y=y1_train, batch_size=batch_sizes, epochs=epochs, shuffle=True, validation_split=0.1) #shuffle=True, validation_split=0.1
    
    scores = m1.evaluate(x1_eval, y1_eval)
    print(scores)

    
    
    print("\n\nPlot it\n==============")
    fig, axs = plt.subplots(N_outPutFeatures,3, figsize=(25,14))
    
    y1_predict = m1.predict(x1_eval)
    y1_predict2 = m1.predict(x1_train)
    
    plotX = np.linspace(0, 10, y1_eval.shape[0])
    plotX2 = np.linspace(0, 10, y1_train.shape[0])
    
    print("\n\ndebg \n==============")
    print(y1_eval.shape)
    print(y1_predict.shape)
    print(y1_train.shape)
    print(y1_predict2.shape)
    
    Variance = np.array([[]])
    
    for n in range(N_outPutFeatures):
        yDiv = y1_eval[:,n,0] - y1_predict[:,n]
        yDiv = np.abs(yDiv) 
        #print("shape yDiv")
        #print(yDiv.shape)
        #ydum = np.where(y1_eval[:,n,0] == 0, 10000, y1_eval[:,n,0])
        
        #ydum = y1_eval[:,n,0] + 1
        #ydum2 = yDiv + 1
        #ERROR_ = (100/ydum) * ydum2
        ERROR_ = yDiv
        #print("shape ERROR_")
        #print(ERROR_.shape)
        yDiv = np.sum(yDiv)/len(y1_predict[:,n])
        
        Variance = np.append(Variance, np.array([yDiv,yDiv]))
        
        axs[n,0].plot(plotX,y1_eval[:,n], 'k')
        axs[n,0].plot(plotX,y1_predict[:,n], 'g')
        
        axs[n,1].plot(plotX,ERROR_, 'r')
        
        axs[n,2].plot(plotX2,y1_train[:,n], 'k')
        axs[n,2].plot(plotX2,y1_predict2[:,n], 'g')
        
    
    plt.show()
    
    fig2 = go.Figure(data=[go.Candlestick(x=plotX,
                open=y1_eval[:,0],
                high=y1_eval[:,1],
                low=y1_eval[:,2],
                close=y1_eval[:,3])])

    fig2.show()
    
    fig3 = go.Figure(data=[go.Candlestick(x=plotX,
                open=y1_predict[:,0],
                high=y1_predict[:,1],
                low=y1_predict[:,2],
                close=y1_predict[:,3])])

    fig3.show()
    
    
    print("\n\ndebg \n==============")
    print("Variance list {}".format(Variance))
    
    
    
    
    '''
Example No. 25
'''
data loading
'''
print("----- Leyendo datos ...")
#the .csv files were prepared beforehand to replace ",," and "Not known" with NaN (missing values)
data_x = pd.read_csv('../data/nepal_earthquake_tra.csv')
data_y = pd.read_csv('../data/nepal_earthquake_labels.csv')
data_x_tst = pd.read_csv('../data/nepal_earthquake_tst.csv')
df_submission = pd.read_csv('../data/nepal_earthquake_submission_format.csv')

#drop the columns that are not used
data_x.drop(labels=['building_id'], axis=1, inplace=True)
data_x_tst.drop(labels=['building_id'], axis=1, inplace=True)
data_y.drop(labels=['building_id'], axis=1, inplace=True)

X, X_tst, y = preprocessing(data_x, data_x_tst, data_y)
#y = np.ravel(data_y.values)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)

print("------ LightGBM...")
lgbm = lgb.LGBMClassifier(objective='regression_l1',
                          n_estimators=200,
                          n_jobs=2,
                          num_leaves=45,
                          scale_pos_weight=0.1)
lgbm, y_test_lgbm = validacion_cruzada(lgbm, X, y, skf)

# Retrain on the full dataset
# The result shown here is on training data, so it will be better than on test
clf = lgbm
Example No. 26
def allinone():
    train_data = valuetoint(load_df('http://test.blueathiean.me/train.csv'))
    test_data = valuetoint(load_df('http://test.blueathiean.me/test.csv'))

    train_data['transactionRevenue'].fillna(value='0', inplace=True)
    train_data['transactionRevenue'] = train_data['transactionRevenue'].astype(
        int)

    train_data, test_data = preprocessing(train_data, test_data)

    train_data = encoder(train_data)
    test_data = encoder(test_data)

    train_data, test_data = one_hot_encode(train_data, test_data)

    train_data = datetimeconvert(train_data)
    test_data = datetimeconvert(test_data)
    train_staging, test_staging = delcols(train_data, test_data)

    train_staging, test_staging = fillnans(train_staging, test_staging)

    train_staging, test_staging = train_staging.align(test_staging,
                                                      join='inner',
                                                      axis=1)
    train_staging['transactionRevenue'] = train_data['transactionRevenue']
    test_staging['fullVisitorId'] = test_data['fullVisitorId']
    train_staging['fullVisitorId'] = train_data['fullVisitorId']

    train_agg = train_staging \
        .groupby(['fullVisitorId']) \
        .agg(['count','mean','min','max','sum']) \
        .reset_index()

    test_agg = test_staging \
        .groupby(['fullVisitorId']) \
        .agg(['count','mean','min','max','sum']) \
        .reset_index()

    columns_train = ['fullVisitorId']
    for var in train_agg.columns.levels[0]:
        if var != 'fullVisitorId':
            for stat in train_agg.columns.levels[1][:-1]:
                columns_train.append('%s_%s' % (var, stat))

    train_agg.columns = columns_train

    columns_test = ['fullVisitorId']

    for var in test_agg.columns.levels[0]:
        if var != 'fullVisitorId':
            for stat in test_agg.columns.levels[1][:-1]:
                columns_test.append('%s_%s' % (var, stat))

    test_agg.columns = columns_test

    del train_staging
    del train_data

    del test_staging
    del test_data

    train_agg['TARGET'] = train_agg['transactionRevenue_sum'].apply(
        create_target)

    train_agg = train_agg.drop([
        'transactionRevenue_count', 'transactionRevenue_mean',
        'transactionRevenue_min', 'transactionRevenue_max',
        'transactionRevenue_sum'
    ],
                               axis=1)
    train_agg_corr = train_agg.corr()

    #CORRELATION CHECK
    #if they click around the site more, it's more likely it will end up in a transaction
    print(train_agg_corr['TARGET'].sort_values(ascending=False))
    train_agg.to_csv('application.csv')
    test_agg.to_csv('test_agg.csv')
Example No. 27
def preprocessing(data):
    minV = 0
    maxV = 255
    data = (data - minV) / (maxV - minV)
    return data


def one_hot_encoding(data, numberOfClass):
    from sklearn import preprocessing
    lb = preprocessing.LabelBinarizer()
    lb.fit(range(numberOfClass))
    return lb.transform(data)


processedTrainData = preprocessing(trainData)
processedTestData = preprocessing(testData)
one_hot_trainLabel = one_hot_encoding(trainLabel, 10)


fileName = 'mnist.p'
if not isfile(fileName):
    pickle.dump((processedTrainData, trainLabel, one_hot_trainLabel, processedTestData), open(fileName, 'wb'))

trainData, trainLabel, one_hot_trainLabel, testData = pickle.load(open(fileName, mode='rb'))



    

    # remove tokens shorter than 3 characters
    tokens = [word for word in tokens if len(word)>=3]
    pre_proc_text = " ".join(tokens)
    return pre_proc_text


lines = []
fin = open(data_path+"Smart_Bomb_with_Language_parser.txt", "rb")
#fin = open(data_path+"alice_in_wonderland.txt", "rb")

for line in fin:
    line = line.strip().decode("ascii", "ignore")

    if len(line) == 0:
        continue
    lines.append(preprocessing(line))
fin.close()


import collections
counter = collections.Counter()

for line in lines:
    for word in nltk.word_tokenize(line):
        counter[word.lower()]+=1

word2idx = {w:(i+1) for i,(w,_) in enumerate(counter.most_common())}
idx2word = {v:k for k,v in word2idx.items()}


# Window size to control the nearest words vicinity
    output.write("RF, N-Gram Vectors: ")
    output.write(str(accuracy13))
    output.write("\n")
    output.write("XGB, N-Gram Vectors: ")
    output.write(str(accuracy14))
    output.close()


#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

if __name__ == "__main__":

    corpusMake(1)
    #corpus = loadData()
    data, target_data = loadData()
    data = preprocessing(data)

    #trainDF is the issue here, gotta figure out why we are getting out of index

    trainDF = dataset_prep()
    trainDF = nlp_features(trainDF)

    train_x, valid_x, train_y, valid_y = split_section(trainDF, 0)
    accuracy, accuracy1, accuracy2, accuracy3, accuracy4 = count_vector(
        train_x, valid_x, train_y, valid_y)
    accuracy5, accuracy6, accuracy7, accuracy8, accuracy9, accuracy10, accuracy11, accuracy12, accuracy13, accuracy14 = tf_idf_vectors(
        train_x, valid_x, train_y, valid_y)
    output_file('test_results/10author_result1.txt', accuracy, accuracy1,
                accuracy2, accuracy3, accuracy4, accuracy5, accuracy6,
                accuracy7, accuracy8, accuracy9, accuracy10, accuracy11,
                accuracy12, accuracy13, accuracy14)
    # Create new column representing mean sqft of above area for each house in each (zipcode,grade) group
    df['mean_sqft_sqft_above'] = df.groupby(
        by=['zipcode', 'grade'])['sqft_above'].transform('mean')

    # Create new column representing mean grade for each house in each zipcode group
    df['mean_grade'] = df.groupby(by=['zipcode'])['grade'].transform('mean')

    df_updated = df

    return df_updated


# In[ ]:

# Preprocess train and test datasets
df_train = preprocessing('train.csv')

# Use log transform to make the columns less skewed (to meet the assumptions of inferential statistics)
trans_columns = ['sqft_living', 'sqft_living15', 'sqft_lot', 'sqft_lot15']
df_train = log_transform(df_train, trans_columns + ['price'])

# Preprocess train and test datasets
df_train = feature_generator(df_train, 47.36217, -122.20069)

# Additional step in preprocessing train data
# Delete outliers in number of bedrooms from train data
df_train = df_train[df_train['bedrooms'] < 11]

# In[ ]:

df_train.head()
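# log_transform above is assumed to apply a log transform column-wise; a minimal
# sketch of such a helper (an assumption, not the original definition):
import numpy as np

def log_transform(df, columns):
    df = df.copy()
    for col in columns:
        df[col] = np.log1p(df[col])  # log(1 + x) keeps zero-valued rows finite
    return df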
Example No. 31
def preprocessing(notes):
    namecat = sklearn.preprocessing.LabelEncoder().fit_transform(
        [n["name"] for n in notes])
    onehotencoded = sklearn.preprocessing.OneHotEncoder().fit_transform(
        [[n] for n in namecat]).todense()

    numerator = [n["length"].numerator for n in notes]
    denominator = [n["length"].denominator for n in notes]

    return numpy.append(numpy.append(onehotencoded, [[n] for n in numerator],
                                     axis=1), [[d] for d in denominator],
                        axis=1)


data = preprocessing(notes).getA()
scaler = sklearn.preprocessing.MinMaxScaler()
scaled = scaler.fit_transform(data)
print(scaled.shape)

sequenceLength = 20
n_features = len(data[0])

input = numpy.zeros(
    (len(scaled) - sequenceLength, sequenceLength, len(scaled[0])))
output = numpy.zeros((len(scaled) - sequenceLength, len(scaled[0])))
for i in range(0, len(scaled) - sequenceLength):
    for j in range(sequenceLength):
        input[i, j] = scaled[i + j]
    output[i] = scaled[i + sequenceLength]
print(input.shape)
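# One plausible way to consume these (samples, sequenceLength, n_features) windows is a
# small recurrent model; a hedged Keras sketch (architecture and hyperparameters are
# assumptions, not taken from the original project):
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model = Sequential([
    LSTM(128, input_shape=(sequenceLength, n_features)),
    Dense(n_features, activation='sigmoid'),  # targets are min-max scaled to [0, 1]
])
model.compile(optimizer='adam', loss='mse')
model.fit(input, output, epochs=10, batch_size=64)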
Example No. 32
scaler = MinMaxScaler()

gafeat = GenAlFeaturesSelector(n_pop=n_population,max_gen=max_generation,
                               desired_fit=desired_fitness,
                               scaler = scaler,clf=clf)

#for each time window perform analysis

for t_window in range(1,5):
    time_window = t_window

    tmp_results = list()

    bs = BakSys(threeclass=False,seconds = time_window)

    subj1 = preprocessing(subj1_raw,'subject 1 data',256,bs,n_class = 2,
                          time_window = time_window)
    subj2 = preprocessing(subj2_raw,'subject 2 data',256,bs,n_class = 2,
                          time_window = time_window)
    subj3 = preprocessing(subj3_raw,'subject 3 data',256,bs,
                          time_window = time_window)
    subj4 = preprocessing(subj4_raw,'subject 4 data',256,bs,
                          time_window = time_window)
    overall = (np.vstack([subj1[0],subj2[0],subj3[0],subj4[0]]),
               np.hstack([subj1[1],subj2[1],subj3[1],subj4[1]]),
               subj1[2],'overall data')

    for n in [subj1,subj2,subj3,subj4,overall]:
    #for n in [subj1,subj2]:
        tmp = subroutine(n,time_window,gafeat)
        tmp_results.append(tmp)
Example No. 33
    return ficm


#concatenates the matrices and does classification
def concat(f1, f2, f3, f4, f5):
    f12 = np.concatenate((f1, f2), axis=1)
    f123 = np.concatenate((f1, f2, f3), axis=1)
    f1234 = np.concatenate((f1, f2, f3, f4), axis=1)
    f12345 = np.concatenate((f1, f2, f3, f4, f5), axis=1)
    print(f12.shape)
    print(f123.shape)
    print(f1234.shape)
    print(f12345.shape)
    classification(f12)
    classification(f123)
    classification(f1234)
    classification(f12345)


file_names = [
    "amazon_cells_labelled.txt", "imdb_labelled.txt", "yelp_labelled.txt"
]
for fname in file_names:
    sentences, lables = preprocessing(fname)
    f1 = onegrams(sentences, lables)
    f2 = bigrams(sentences, lables)
    f3 = trigrams(sentences, lables)
    f4 = fgrams(sentences, lables)
    f5 = fivegrams(sentences, lables)
    concat(f1, f2, f3, f4, f5)
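# The n-gram feature builders (onegrams, bigrams, ...) are defined elsewhere; a hedged
# sketch of what one of them might look like with scikit-learn (an assumption, shown
# only to illustrate the shape of the matrices being concatenated above):
from sklearn.feature_extraction.text import CountVectorizer

def onegrams(sentences, lables):
    # bag-of-words counts over unigrams; the labels are not needed to build the matrix
    return CountVectorizer(ngram_range=(1, 1)).fit_transform(sentences).toarray()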