def test_q_calculation(self):
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("breastCancer")
    naiveBayes = NaiveBayes(dataRetriever.getDataSet(), dataRetriever.getDataClass())
    # seperatedByClass = naiveBayes.calculateQ()
    # print(seperatedByClass)
    self.assertEqual(dataRetriever.getDataMenu(),
                     ["breastCancer", "glass", "iris", "soybeanSmall", "vote"],
                     "should return list of data sets")
def testKMeans(self):
    data = DataRetriever("../Datasets/metadata.json")
    data.retrieveData("computerHardware")
    kValue = 15

    t = Timer()
    t.start()
    medoids = KMediods(data.getDataSet(), data.getDataClass(), data.getDescreteAttributes(),
                       data.getContinuousAttributes(), data.getPredictionType(), kValue, 100)
    t.stop()
    print(f"Time: {t}")

    print(medoids)
    medoids.to_csv('kmedoids.csv', index=False)
maxIter = 100
kValue = 78

# These are only used for image segmentation and abalone
# frac = .25
# random_state = 69
# kValue = m.floor(frac * kValue)

dataSetUnNormalized = data.getDataSet()
# This is for Forest Fires
# dataSetUnNormalized[data.getDataClass()] = np.log(dataSetUnNormalized[data.getDataClass()] + 0.001)

sn = StandardNormalizer(dataSetUnNormalized[data.getContinuousAttributes()])
dataSetUnNormalized[data.getContinuousAttributes()] = sn.train_fit()
dataSetNormalized = dataSetUnNormalized

# dataSetNormalized = dataSetNormalized.sample(frac=frac, random_state=random_state)
# dataSetNormalized = dataSetNormalized.reset_index()
# dataSetNormalized = dataSetNormalized.drop(["idNumber"], axis=1)  # For Glass

medoids = KMediods(dataSetNormalized, data.getDataClass(), data.getDescreteAttributes(),
                   data.getContinuousAttributes(), data.getPredictionType(), kValue, maxIter)

medoids.to_csv('./CSVOutput/normalized' + dataSetName + 'MedoidsClustered.csv', index=False)
print(f"CSV for {dataSetName} has been created!")
# This line is used to normalize the data for Forest Fires
# dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()] + 0.1)

maxIter = 1
learning_rate = 1e-3
batch_size = 0.01
metrics = []
fold = 0

test_set = test_set.reset_index(drop=True)
train_set = train_set.reset_index(drop=True)
ohe = OneHotEncoder()
discrete_attr = dataRetriever.getDescreteAttributes()
if dataRetriever.getDataClass() in discrete_attr:
    discrete_attr.remove(dataRetriever.getDataClass())

train_set = ohe.train_fit(train_set, discrete_attr)
test_set = ohe.fit(test_set)

# Normalize Data
sn = StandardNormalizer(train_set[dataRetriever.getContinuousAttributes()])
train_set[dataRetriever.getContinuousAttributes()] = sn.train_fit()
test_set[dataRetriever.getContinuousAttributes()] = sn.fit(test_set[dataRetriever.getContinuousAttributes()])

# Train network and change architecture in respect to data set
nn = NeuralNetwork(train_set, 2, [6, 16], dataRetriever.getPredictionType(), dataRetriever.getDataClass())
fitness_matrix, average_fitness = nn._particle_swarm_optimize(70, max_iter=500)
maxIter = 100
kValue = 1330

# These are only used for image segmentation and abalone
frac = .25
random_state = 69
kValue = m.floor(frac * kValue)

dataSetUnNormalized = data.getDataSet()
# This is for Forest Fires
# dataSetUnNormalized[data.getDataClass()] = np.log(dataSetUnNormalized[data.getDataClass()] + 0.001)

sn = StandardNormalizer(dataSetUnNormalized[data.getContinuousAttributes()])
dataSetUnNormalized[data.getContinuousAttributes()] = sn.train_fit()
dataSetNormalized = dataSetUnNormalized

dataSetNormalized = dataSetNormalized.sample(frac=frac, random_state=random_state)
dataSetNormalized = dataSetNormalized.reset_index()
# dataSetNormalized = dataSetNormalized.drop(["idNumber"], axis=1)  # For Glass

centroids = KMeans(dataSetNormalized, data.getDataClass(), data.getDescreteAttributes(),
                   data.getContinuousAttributes(), data.getPredictionType(), kValue, maxIter)

centroids.to_csv('./CSVOutput/normalized' + dataSetName + 'KMeansClustered.csv', index=False)
print(f"CSV for {dataSetName} has been created!")
def run_driver(current_data_set,
               mutation_rate=0.5,
               maxIter=1000,
               batch_size=0.6,
               population_size=110,
               network_architecture=[15],
               pb_actor=None):
    cost_func = {
        "breastCancer": "bin_cross",
        "glass": "log_cosh",
        "soybeanSmall": "log_cosh",
        "abalone": "log_cosh",
        "forestFires": "log_cosh",
        "computerHardware": "log_cosh"
    }

    # Banner printed when training starts; the original source defined this as a
    # large multi-line ASCII-art title.
    title_text = "Genetic Algorithm"

    output_json = {}

    # ====================== Adjustable Variables ==============================
    # current_data_set = "abalone"
    # mutation_rate = .5
    # maxIter = 10
    # batch_size = .6
    # population_size = 110
    # network_architecture = []
    # ===========================================================================

    output_json["parameters"] = {
        "mutation_rate": mutation_rate,
        "population_size": population_size,
        "network_architecture": network_architecture,
        "cost_func": cost_func[current_data_set],
        "maxIter": maxIter,
        "batch_size": batch_size
    }

    # ================ Data pre-processing =================================================
    dataRetriever = DataRetriever("../../Datasets/metadata.json")
    dataRetriever.retrieveData(current_data_set)
    dataset = dataRetriever.getDataSet().dropna()

    discrete_attr = dataRetriever.getDescreteAttributes()
    cont_attributes = dataRetriever.getContinuousAttributes()

    # This block encodes the cyclical date features and log-normalizes the target for Forest Fires
    if current_data_set == "forestFires":
        discrete_attr.remove('month')
        discrete_attr.remove('day')
        dataset['month'] = (pd.to_datetime(dataset.month, format='%b').dt.month) - 1
        dataset["day"] = dataset['day'].apply(lambda x: list(calendar.day_abbr).index(x.capitalize()))
        dataset["month_sin"] = np.sin(dataset['month'])
        dataset["month_cos"] = np.cos(dataset['month'])
        dataset["day_sin"] = np.sin(dataset['day'])
        dataset["day_cos"] = np.cos(dataset['day'])
        dataset = dataset.drop('day', axis=1)
        dataset = dataset.drop('month', axis=1)

        cont_attributes.append('month_sin')
        cont_attributes.append('month_cos')
        cont_attributes.append('day_sin')
        cont_attributes.append('day_cos')

        dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()] + 0.000001)
    elif current_data_set == "computerHardware":
        discrete_attr.remove('venderName')
        discrete_attr.remove('modelName')
        dataset = dataset.drop('venderName', axis=1)
        dataset = dataset.drop('modelName', axis=1)

    dataset = dataset.reset_index(drop=True)

    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    # ======================= Train Neural Network ================
    print(title_text)
    fold = 0
    metrics = []

    for test_set, train_set in KFolds(dataset, 10):
        fold += 1
        fitness_file = f"../DataDump/GA/{current_data_set}_layer{len(network_architecture)}_fold{fold}_fitness.csv"
        output_file = f"../DataDump/GA/{current_data_set}_layer{len(network_architecture)}_fold{fold}_output.csv"
        metrics.append(
            multiprocess_func.remote(test_set, train_set, fold, fitness_file, output_file,
                                     dataRetriever, cost_func[current_data_set], current_data_set,
                                     mutation_rate, maxIter, batch_size, population_size,
                                     network_architecture, pb_actor=None))

    metrics = ray.get(metrics)
    print(metrics)
    print("Average Performance: ", np.asarray(metrics).mean())

    output_json["Metrics"] = metrics
    output_json["Average"] = np.asarray(metrics, dtype=np.float64).mean()
    output_json["Std"] = np.asarray(metrics, dtype=np.float64).std()

    with open(f"../DataDump/GA_{current_data_set}_layer{len(network_architecture)}.json", 'w') as f:
        json.dump(output_json, f, indent=4)
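# A minimal usage sketch (not from the original source): the fold workers are dispatched
# with multiprocess_func.remote and gathered with ray.get, so this assumes ray has been
# initialised in the calling process before run_driver is invoked.
if __name__ == "__main__":
    import ray
    ray.init()
    run_driver("breastCancer", mutation_rate=0.5, maxIter=1000,
               batch_size=0.6, population_size=110, network_architecture=[15])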
from MLAlgorithms.Utils.StandardNormalizer import StandardNormalizer
from MLAlgorithms.Utils.DataRetriever import DataRetriever
from MLAlgorithms.Utils.ClassifierAnalyzer import ClassifierAnalyzer
from MLAlgorithms.Utils.KFolds import KFolds  # used below; import path assumed to mirror the other Utils imports
import numpy as np
import json

dataRetriever = DataRetriever("../Datasets/metadata.json")
dataRetriever.retrieveData("vote")

data = dataRetriever.getDataSet()
data = data.dropna()
data = data.sample(frac=1.0, random_state=93)
data = data.reset_index(drop=True)
# data = data.drop('idNumber', axis=1)

class_col = dataRetriever.getDataClass()
# data[class_col] = np.log(data[class_col] + 0.001)
contAttr = dataRetriever.getContinuousAttributes()
discAttr = dataRetriever.getDescreteAttributes()
predictionType = dataRetriever.getPredictionType()

output_json = {}
iter_num = 0
for test, train in KFolds(data, 5, stratisfied=True, class_col=class_col):
    # KFolds doesn't have the capability of returning a validate set,
    # so K is set to the desired k/2 and the validate set is half of the test set
    sn = StandardNormalizer(train[contAttr])
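    # A minimal sketch (not in the original snippet) of the validate-set idea described in
    # the comment above: split each returned test fold in half, keeping one half for
    # validation and the other for the final test.
    half = len(test) // 2
    validate = test.iloc[:half].reset_index(drop=True)
    test = test.iloc[half:].reset_index(drop=True)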
def network_tuner(*nodes_per_hidden_layer):
    """
    This function is used to calculate the optimal network architecture.

    The user should input the data set they would like to operate with and change the
    performance metric in accordance with the data set type, i.e. regression or classification.
    """
    MSEs = []
    bestNetwork = {}
    learning_rate = 0.0001
    maxIter = 500
    batch_size = .5

    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("glass")

    dataset = dataRetriever.getDataSet().dropna()
    dataset = dataset.reset_index(drop=True)

    # This line is used to normalize the data for Forest Fires
    # dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()] + 0.1)

    dataset[dataRetriever.getContinuousAttributes()] = (
        dataset[dataRetriever.getContinuousAttributes()]
        - dataset[dataRetriever.getContinuousAttributes()].mean()
    ) / dataset[dataRetriever.getContinuousAttributes()].std()

    test_set = dataset.sample(frac=0.1, random_state=69)
    train_set = dataset.drop(test_set.index)
    test_set = test_set.reset_index(drop=True)
    train_set = train_set.reset_index(drop=True)

    ohe = OneHotEncoder()
    discrete_attr = dataRetriever.getDescreteAttributes()

    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    datasetEncoded = ohe.train_fit(train_set, dataRetriever.getDescreteAttributes())
    testEncoded = ohe.fit(test_set)

    # Baseline network with no hidden layers
    output = None
    nn = NeuralNetwork(datasetEncoded, 0, [], dataRetriever.getPredictionType(), dataRetriever.getDataClass())
    for i in range(maxIter):
        # We don't call an initial feedforward because backpropagate starts with a feedforward call
        # batch_size represents the number of data points per batch
        output = nn._back_propagate(learning_rate=learning_rate, batch_size=batch_size)

    final = nn.test(testEncoded.drop(dataRetriever.getDataClass(), axis=1))
    output = nn._feed_forward(testEncoded.drop(dataRetriever.getDataClass(), axis=1), testing=True)
    actual = testEncoded[dataRetriever.getDataClass()]

    # ===================== Classification =================
    correct = 0
    acc = 0
    for i, row in enumerate(final):
        if row == actual.iloc[i]:
            correct += 1
    acc = correct / len(test_set)

    # final = final.reshape(final.shape[0])
    # MSE = ((actual-final)**2).mean()
    # MSEs.append(MSE)

    bestNetwork['network'] = nn
    bestNetwork['acc'] = acc
    bestNetwork['arc'] = [0]

    # ============ Compare Acc to Most Common Class ============
    values = test_set[dataRetriever.getDataClass()].value_counts()

    # USED FOR CLASSIFICATION
    # print(f'Accuracy: {acc}')
    # print(f'Max Class Prior: {values.max()/values.sum()}')
    # print(f"Class Distribution:\n{values}")
    # print("Final: ", final)
    # print("Actual: ", list(actual))
    # print()

    # Grid search over every combination of nodes per hidden layer
    numOfLayer = len(nodes_per_hidden_layer)
    print("Number of Hidden Layers: ", numOfLayer)
    for layer in range(numOfLayer):
        print(f"Layer Number: {layer + 1}")
        combinations = list(itertools.product(*nodes_per_hidden_layer[:layer + 1]))

        for combo in combinations:
            output = None
            print("Node Combination: ", list(combo))

            nn = NeuralNetwork(datasetEncoded, layer + 1, list(combo),
                               dataRetriever.getPredictionType(), dataRetriever.getDataClass())

            for i in range(maxIter):
                # We don't call an initial feedforward because backpropagate starts with a feedforward call
                # batch_size represents the number of data points per batch
                output = nn._back_propagate(learning_rate=learning_rate, batch_size=batch_size)

            final = nn.test(testEncoded.drop(dataRetriever.getDataClass(), axis=1))
            output = nn._feed_forward(testEncoded.drop(dataRetriever.getDataClass(), axis=1), testing=True)
            actual = testEncoded[dataRetriever.getDataClass()]

            # ===================== Classification =================
            correct = 0
            acc = 0
            for i, row in enumerate(final):
                if row == actual.iloc[i]:
                    correct += 1
            acc = correct / len(test_set)

            # ============ Compare Acc to Most Common Class ============
            values = test_set[dataRetriever.getDataClass()].value_counts()

            # USED FOR CLASSIFICATION
            # print(f'Accuracy: {acc}')
            # print(f'Max Class Prior: {values.max()/values.sum()}')
            # print(f"Class Distribution:\n{values}")
            # print("Final: ", final)
            # print("Actual: ", list(actual))
            # print()

            if acc > bestNetwork['acc']:
                bestNetwork['network'] = nn
                bestNetwork['acc'] = acc
                bestNetwork['arc'] = combo

            # final = final.reshape(final.shape[0])
            # MSE = ((actual-final)**2).mean()
            # MSEs.append(MSE)
            # if MSE < bestNetwork['acc']:
            #     bestNetwork['network'] = nn
            #     bestNetwork['acc'] = MSE
            #     bestNetwork['arc'] = combo

    return bestNetwork  # , MSEs
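# A minimal usage sketch (not from the original source): each positional argument is a
# list of candidate node counts for one hidden layer, so this call searches one- and
# two-hidden-layer architectures built from those counts.
best = network_tuner([2, 4, 8], [2, 4])
print("Best architecture: ", best['arc'], " accuracy: ", best['acc'])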
learning_rate = 1e-3
batch_size = 0.01
metrics = []
fold = 0

# Ten-Fold Cross Validation
for test_set, train_set in KFolds(dataset, 10):
    fold += 1
    print("Fold Num: ", fold)

    # Encode Data
    test_set = test_set.reset_index(drop=True)
    train_set = train_set.reset_index(drop=True)
    ohe = OneHotEncoder()
    discrete_attr = dataRetriever.getDescreteAttributes()
    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    train_set = ohe.train_fit(train_set, discrete_attr)
    test_set = ohe.fit(test_set)

    # Normalize Data
    sn = StandardNormalizer(train_set[dataRetriever.getContinuousAttributes()])
    train_set[dataRetriever.getContinuousAttributes()] = sn.train_fit()
    test_set[dataRetriever.getContinuousAttributes()] = sn.fit(test_set[dataRetriever.getContinuousAttributes()])

    # Train network and change architecture in respect to data set
    nn = NeuralNetwork(train_set, 2, [2, 2], dataRetriever.getPredictionType(), dataRetriever.getDataClass())
    nn.train(maxIter, learning_rate, batch_size)
print("The Percent of Correct Predictions is {t}%".format( t=round((t * 100 / len(answers)), 1))) print("The Percent of Incorrect Predictions is {f}%\n".format( f=round((f * 100 / len(answers)), 1))) dataRetriever = DataRetriever("../Datasets/metadata.json") ################################################ Un-Shuffled Data ################################################ # This first for loop performs the NaiveBayes algorithm for un-shuffled data jsonResults1 = {} for dataSet in dataRetriever.getDataMenu(): dataRetriever.retrieveData(dataSet) dataClass = dataRetriever.getDataClass() retrievedData = dataRetriever.getDataSet() numOfClassValues = len( retrievedData[dataRetriever.getDataClass()].unique()) method = "macro" foldNum = 1 jsonResults1[dataSet] = {} print(f"PRINTING RESULTS FOR THE CONTROL DATASET {dataSet}") for train, test in KFolds(retrievedData, 10): trainBin = BinDiscretizer( train[dataRetriever.getContinuousAttributes()], multi=True)
if __name__ == "__main__": dataRetriever = DataRetriever("../Datasets/metadata.json") dataRetriever.retrieveData("computerHardware") data = dataRetriever.getDataSet() data = data.dropna() data = data.reset_index(drop=True)[[ "venderName", "modelName", "myct", "mmin", "mmax", "cach", "chmin", "chmax", "prp", "erp" ]] test = data.sample(frac=0.2) train = data.drop(test.index) test = test.reset_index(drop=True) train = train.reset_index(drop=True) VDM = ValueDifferenceMetric( data, unknown_col=dataRetriever.getDataClass(), prediction_type=dataRetriever.getPredictionType()) start = time.time() VDM.train() print(f"Training took: {time.time() - start} seconds") start = time.time() VDM.calc_distance_matrix(data["mmin"], data["mmin"]) print(f"Matrix took: {time.time() - start} seconds") # print(KNN.get_neighbors([5,10])) # print(KNN.test(augmented=True))
mutation_rate = .5
maxIter = 1000
batch_size = .6
population_size = 110
# ===========================================================================

# ================ Data pre-processing =================================================
dataRetriever = DataRetriever("../Datasets/metadata.json")
dataRetriever.retrieveData(current_data_set)
dataset = dataRetriever.getDataSet().dropna()

discrete_attr = dataRetriever.getDescreteAttributes()
cont_attributes = dataRetriever.getContinuousAttributes()

# This block encodes the cyclical date features and normalizes the data for Forest Fires
if current_data_set == "forestFires":
    zeros = dataset[dataset[dataRetriever.getDataClass()] < 1].index
    print(len(zeros) / len(dataset))
    dataset = dataset.drop(zeros)

    discrete_attr.remove('month')
    discrete_attr.remove('day')
    dataset['month'] = (pd.to_datetime(dataset.month, format='%b').dt.month) - 1
    dataset["day"] = dataset['day'].apply(lambda x: list(calendar.day_abbr).index(x.capitalize()))
    dataset["month_sin"] = np.sin(dataset['month'])
    dataset["month_cos"] = np.cos(dataset['month'])
    dataset["day_sin"] = np.sin(dataset['day'])
    dataset["day_cos"] = np.cos(dataset['day'])
    dataset = dataset.drop('day', axis=1)
learning_rate = 1e-3
batch_size = 0.01
metrics = []
fold = 0

# Ten-Fold Cross Validation
for test_set, train_set in KFolds(dataset, 10):
    fold += 1
    print("Fold Num: ", fold)

    # Encode Data
    test_set = test_set.reset_index(drop=True)
    train_set = train_set.reset_index(drop=True)
    ohe = OneHotEncoder()
    discrete_attr = dataRetriever.getDescreteAttributes()
    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    train_set = ohe.train_fit(train_set, discrete_attr)
    test_set = ohe.fit(test_set)

    # Normalize Data
    sn = StandardNormalizer(train_set[dataRetriever.getContinuousAttributes()])
    train_set[dataRetriever.getContinuousAttributes()] = sn.train_fit()
    test_set[dataRetriever.getContinuousAttributes()] = sn.fit(test_set[dataRetriever.getContinuousAttributes()])

    # Train network and change architecture in respect to data set
    nn = NeuralNetwork(train_set, 0, [], dataRetriever.getPredictionType(), dataRetriever.getDataClass())
    nn.train(maxIter, learning_rate, batch_size)
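    # A minimal per-fold evaluation sketch (not part of the original snippet), assuming a
    # classification data set and the same nn.test interface used in network_tuner above:
    # score the trained network on the held-out fold and record its accuracy.
    final = nn.test(test_set.drop(dataRetriever.getDataClass(), axis=1))
    actual = test_set[dataRetriever.getDataClass()]
    correct = sum(1 for i, row in enumerate(final) if row == actual.iloc[i])
    metrics.append(correct / len(test_set))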