def main():
    num_cluster = int(input("Enter number of clusters of data: "))
    num_attr = int(input("Enter number of attributes of the data: "))
    num_rows = [
        int(x) for x in input(
            "Enter the number of rows of each cluster, separated by spaces: ").split()
    ]
    mu = data.genRandomList(num_attr, 0, 10)
    sigma = data.genRandomList(num_attr, 0, 10)
    df = data.genrateDataVal(mu, sigma, num_attr, num_rows, num_cluster)
    data.splitData(df, 0.5)
    pre.removeOutliers()
    pre.pca()  # Print the variance ratio of the top 3 attributes of the PCA
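# data.genRandomList and data.genrateDataVal are project-local helpers not shown
# here. As a hedged, self-contained sketch (an assumption, not the project's
# actual code): each cluster presumably draws num_rows[k] samples from a normal
# distribution with the per-attribute mu/sigma generated above, roughly like so:
import numpy as np
import pandas as pd

def generate_gaussian_clusters(mu, sigma, num_attr, num_rows, num_cluster):
    # Stack one block of N(mu, sigma) rows per cluster and tag it with the cluster id.
    blocks = []
    for k in range(num_cluster):
        block = np.random.normal(loc=mu, scale=sigma, size=(num_rows[k], num_attr))
        labels = np.full((num_rows[k], 1), k)
        blocks.append(np.hstack([block, labels]))
    cols = ["attr_%d" % j for j in range(num_attr)] + ["cluster"]
    return pd.DataFrame(np.vstack(blocks), columns=cols)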
def makeOneFold(nb_folds):
    # Returns one fold from the cross-validation training set.
    # Note: has to create the whole cross-validation set (could be improved).
    data_trains, data_tests = pp.preprocessing_cross_valid(2012, 2014, nb_folds)
    rand_fold = random.randint(0, nb_folds - 1)  # Pick a random fold to test on
    np.random.shuffle(data_trains[rand_fold])    # Shuffle the training examples
    x_train = data_trains[rand_fold][:, 1:]
    y_train = data_trains[rand_fold][:, 0]
    x_test = data_tests[rand_fold][:, 1:]
    y_test = data_tests[rand_fold][:, 0]
    return x_train, y_train, x_test, y_test
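# A hedged usage sketch for makeOneFold, assuming pp.preprocessing_cross_valid
# returns NumPy arrays with the label in column 0 as above; the classifier
# choice here is only illustrative, not part of the original project:
from sklearn.linear_model import LogisticRegression

x_train, y_train, x_test, y_test = makeOneFold(nb_folds=9)
clf = LogisticRegression(max_iter=1000)
clf.fit(x_train, y_train)
print("Accuracy on the held-out fold:", clf.score(x_test, y_test))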
def main():
    place_weather = PreprocessData.PreprocessData()
    feature_columns = [
        'ObsTime', 'StnPres', 'SeaPres', 'Temperature', 'Td dew point', 'RH',
        'Precp', 'PrecpHour', 'SunShine', 'SunShineRate', 'VisbMean', 'EvapA',
        'Cloud Amount'
    ]
    weather_X = place_weather[feature_columns]
    weather_Y = place_weather['NextDayPrecp']
    train_X, test_X, train_Y, test_Y = train_test_split(
        weather_X, weather_Y.astype('int'), test_size=0.3)
    rainfall_classifier = CreateClassifier(train_X, train_Y)
    test_Y_predicted = rainfall_classifier.predict(test_X)
    accuracy = metrics.accuracy_score(test_Y, test_Y_predicted)
    VisualizingDecisionTree.Visualize(rainfall_classifier, feature_columns,
                                      weather_Y.to_string())
    print(accuracy)
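# CreateClassifier is project-local and not shown here; a minimal sketch of what
# such a factory could look like with scikit-learn (an assumption, consistent
# with the decision-tree visualisation call above; hyperparameters are hypothetical):
from sklearn.tree import DecisionTreeClassifier

def CreateClassifier(train_X, train_Y):
    # Fit a plain decision tree on the training split.
    clf = DecisionTreeClassifier(max_depth=5, random_state=0)
    clf.fit(train_X, train_Y)
    return clf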
def sequentialValidate(net, start=0.5, step=1, iterations=1000, learning_rate=0.01,
                       grad_decay=0.9, epsilon=0.000001, adadelta=False):
    # Cross-validation procedure for time series data.
    # Trains on the first 'start' fraction of examples and predicts the next one.
    # Adds 'step' examples to the training set and tests on the next example,
    # repeating until all the examples have been used.
    data = pp.preprocessing_final(2012, 2014, export=False)[0]
    x_data = data[:, 1:]
    y_data = data[:, 0]
    min_errs = []
    test_errs = []
    train_errs = []
    train_class_errs = []
    min_class_errs = []
    nb_examples = int(start * len(data))
    nb_runs = 0
    print(len(x_data[nb_examples]))
    while nb_examples < len(data):
        net.reset()
        temp = net.test(x_data[:nb_examples, :], y_data[:nb_examples], iterations,
                        learning_rate, grad_decay, epsilon, adadelta,
                        X_test=x_data[nb_examples:nb_examples + 20, :],
                        y_test=y_data[nb_examples:nb_examples + 20])
        min_errs.append(temp[0])
        test_errs.append(temp[1])
        train_errs.append(temp[2])
        train_class_errs.append(temp[3])
        min_class_errs.append(temp[4])
        nb_examples += step
        nb_runs += 1
    print("\n----------")
    print(net, "\tNb runs:", nb_runs)
    print("Avg min:", sum(min_errs) / nb_runs, "\t\t\t", min_errs)
    print("Avg final test:", sum(test_errs) / nb_runs, "\t\t\t", test_errs)
    print("Avg final train:", sum(train_errs) / nb_runs, "\t\t\t", train_errs)
    print("Avg final class:", sum(train_class_errs) / nb_runs, "\t\t\t", train_class_errs)
    print("Avg min class:", sum(min_class_errs) / nb_runs, "\t\t\t", min_class_errs)
# -*- coding: utf-8 -*-
"""
Created on Sun May 26 23:34:08 2019

@author: user
"""
import numpy as np
import matplotlib.pyplot as plt
import PreprocessData as pre
import pandas as pd

# PREPROCESS
solarRaw, windRaw, demandRaw, allRaw2016 = pre.importData()
solar, wind, agg, demand = pre.preprocessData(solarRaw, windRaw, demandRaw, allRaw2016)
df0 = pd.DataFrame(
    data={
        "solar": solar["0"],
        "wind": wind["0"],
        "agg": agg["0"],
        "demand": demand["0"],
        "solar25": solar["0"] * 0.25,
        "wind75": wind["0"] * 0.75
    })

#%% PROCESS
print("START monitoring design")

# initialize
titles = [
    "(a) Solar PV 100MWp Down", "(b) Wind 100MWp Down",
def clear(key_params=[]):
    X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
    X = PreprocessData.prepare_data(X, mode='save', key_features=key_params)
    return X, y
X, y = clear()
svm_clf = SVC(kernel='linear')
svm_clf.fit(X, y)

# In[3]:
# test feature transformation
scores = []

X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
scores.append(check_score(X, y, svm_clf))

X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
scores.append(check_score(PreprocessData.prepare_features(X), y, svm_clf))

X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
scores.append(
    check_score(PreprocessData.prepare_data(X, mode='save'), y, svm_clf))

scores

# In[6]:
# test feature importances
from sklearn.feature_selection import RFE

scores = []
for i in xrange(18):
def lambda_handler(event, context):
    store_id = int(event['data'])
    item = DynamoDB.GetDataFromDB(store_id)
    data = PreprocessData.PreprocessReceivedData(item)
    return data
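# A minimal local smoke test for the handler above, assuming the Lambda event
# carries the store id as a numeric string under 'data'; the context argument
# is unused, so None is enough here (the store id 42 is purely hypothetical):
if __name__ == '__main__':
    sample_event = {'data': '42'}
    print(lambda_handler(sample_event, None))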
import PreprocessData as pp
import TestRun
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

# This file is used to test other machine learning algorithms
if __name__ == '__main__':
    data_trains, data_tests = pp.preprocessing_cross_valid(2012, 2014, 9)
    print("Tests")
    errs = []
    for i in range(9):
        x_train = data_trains[i][:, 1:]
        y_train = data_trains[i][:, 0]
        x_test = data_tests[i][:, 1:]
        y_test = data_tests[i][:, 0]

        # logistic regression
        reg = LogisticRegression()
        reg.fit(x_train, y_train)
        print("Accuracy:", reg.score(x_test, y_test))  # score() returns mean accuracy, not error

        # support vector machine
def crossValidate(net, nb_folds, iterations=1000, learning_rate=0.01, grad_decay=0.9,
                  epsilon=0.000001, adadelta=False):
    # Splits the data into nb_folds batches, using each batch as the testing set
    # in turn and the rest as the training set.
    ######## Need to fix: how to train on multiple years at once?
    data_trains, data_tests = pp.preprocessing_cross_valid(2012, 2014, nb_folds)
    for i in range(nb_folds):
        np.random.shuffle(data_trains[i])  # shuffles training examples
    min_errs = []
    test_errs = []
    train_errs = []
    nb_buckets = 5  # Could make this a parameter
    freq_probs_test = [0] * nb_buckets
    freq_wins_test = [0] * nb_buckets
    freq_probs_train = [0] * nb_buckets
    freq_wins_train = [0] * nb_buckets
    for i in range(nb_folds):
        print("--- Fold " + str(i + 1) + " ---")
        start = time.clock()
        net.reset()
        # Make test and training sets
        x_train = data_trains[i][:, 1:]
        y_train = data_trains[i][:, 0]
        x_test = data_tests[i][:, 1:]
        y_test = data_tests[i][:, 0]
        temp = net.test(x_train, y_train, iterations, learning_rate, grad_decay,
                        epsilon, adadelta, X_test=x_test, y_test=y_test)
        min_errs.append(temp[0])
        test_errs.append(temp[1])
        train_errs.append(temp[2])
        freqs = net.testProbBuckets(x_train, y_train, nb_buckets=nb_buckets,
                                    X_test=x_test, y_test=y_test)
        # Aggregates the prob buckets from each fold together
        freq_probs_test = list(map(add, freq_probs_test, freqs[0]))
        freq_wins_test = list(map(add, freq_wins_test, freqs[1]))
        freq_probs_train = list(map(add, freq_probs_train, freqs[2]))
        freq_wins_train = list(map(add, freq_wins_train, freqs[3]))
        print("Time:", time.clock() - start)
    print("\n----------")
    print(net, "\tNb folds:", nb_folds)
    print("Avg min:", sum(min_errs) / nb_folds, "\t\t\t", min_errs)
    print("Avg final test:", sum(test_errs) / nb_folds, "\t\t\t", test_errs)
    print("Avg final train:", sum(train_errs) / nb_folds, "\t\t\t", train_errs)
    probs_test = [freq_wins_test[i] / freq_probs_test[i] if freq_probs_test[i] != 0
                  else -1 for i in range(nb_buckets)]
    probs_train = [freq_wins_train[i] / freq_probs_train[i] if freq_probs_train[i] != 0
                   else -1 for i in range(nb_buckets)]
    print("Total freq test:")
    print(freq_probs_test)
    print(freq_wins_test)
    print(["{0:.2f}".format(x) for x in probs_test])
    print("Total freq train:")
    print(freq_probs_train)
    print(freq_wins_train)
    print(["{0:.2f}".format(x) for x in probs_train])
    # Returns the average minimum test error
    return sum(min_errs) / nb_folds
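# The per-fold bucket aggregation above is just an element-wise running sum via
# operator.add mapped over parallel lists; a tiny self-contained illustration of
# that pattern with made-up bucket counts:
from operator import add

totals = [0] * 5
fold1_counts = [3, 5, 2, 0, 1]
fold2_counts = [1, 4, 6, 2, 0]
totals = list(map(add, totals, fold1_counts))
totals = list(map(add, totals, fold2_counts))
print(totals)  # [4, 9, 8, 2, 1]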
def classify(class_prec, clf):
    import PreprocessData
    return clf.predict(PreprocessData.prepare_data([class_prec.get_features()]))[0]