Example no. 1
def multiprocess_func(test_set, train_set, fold, fitness_file, output_file, dataRetriever, cost_func, current_data_set, cross_over_prob=0.7, mutation_rate=0.3, maxIter=1000, batch_size=0.6, population_size=110, network_architecture=[15], pb_actor=None):
    
    print("=========================")
    print("Fold Num: ", fold)
    # Encode Data
    test_set = test_set.reset_index(drop=True)
    train_set = train_set.reset_index(drop=True)
    ohe = OneHotEncoder()
    discrete_attr = dataRetriever.getDescreteAttributes()
    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    train_set = ohe.train_fit(train_set, discrete_attr)
    test_set = ohe.fit(test_set)

    #  Normalize Data
    sn = StandardNormalizer(train_set[dataRetriever.getContinuousAttributes()])
    train_set[dataRetriever.getContinuousAttributes()] = sn.train_fit()
    test_set[dataRetriever.getContinuousAttributes()] = sn.fit(test_set[dataRetriever.getContinuousAttributes()])

    # Train the network; the architecture is adjusted with respect to the data set
    nn = NeuralNetwork(train_set, len(network_architecture), network_architecture, dataRetriever.getPredictionType(), dataRetriever.getDataClass())
    
    fitnesses = nn.differential_evolution(population_size, maxIter, batch_size, mutation_rate, cross_over_prob, cost_func)
    final = nn.test(test_set.drop(dataRetriever.getDataClass(), axis=1))
    output = nn._feed_forward(test_set.drop(dataRetriever.getDataClass(), axis=1), testing=True)
    actual = test_set[dataRetriever.getDataClass()]

    fitness_pd = pd.DataFrame(fitnesses, columns=["Max_Weight", "Min_Weight", "Mean_Fitness"])
    fitness_pd.to_csv(fitness_file, index=False)

    print("Fold Performance:")
    if dataRetriever.getPredictionType() == "classification":
        # ===================== Classification =====================
        correct = 0
        for i, row in enumerate(final):
            if row == actual.iloc[i]:
                correct += 1

        acc = correct/len(test_set)

        print(f"Accuracy: {acc}")
        output_pd = pd.DataFrame({'Truth':actual.to_list(), 'Predicted':final})
    
        output_pd.to_csv(output_file, index=False)
        return acc
    else:
        # ====================== Regression ======================
        output = output.reshape(output.shape[0])
        
        # R^2 = 1 - SS_res / SS_tot
        res = actual - output
        r2 = 1 - ((res**2).sum() / (((actual - actual.mean())**2).sum()))
        print(f"R2: {r2}")
        output_pd = pd.DataFrame({'Truth':actual.to_list(), 'Predicted':output})
    
        output_pd.to_csv(output_file, index=False)
        return float(r2)
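
# multiprocess_func handles a single fold, so a driver has to map it over
# all folds. A minimal sketch of such a driver, assuming the KFolds iterator
# shown in Example no. 5; the multiprocessing.Pool usage and per-fold file
# names are illustrative assumptions, not the authors' actual harness.
from multiprocessing import Pool

jobs = []
for fold, (test_set, train_set) in enumerate(KFolds(data, 10)):
    jobs.append((test_set, train_set, fold,
                 f"fitness_fold{fold}.csv", f"output_fold{fold}.csv",
                 dataRetriever, cost_func, current_data_set))

with Pool() as pool:
    fold_scores = pool.starmap(multiprocess_func, jobs)

print("Mean fold score:", sum(fold_scores) / len(fold_scores))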
Example no. 2
    def test_untrained(self):
        # Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        breastCancer = dataRetriever.retrieveData("breastCancer")
        continuousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        rn = RangeNormalizer(breastCancer[continuousAttributes])
        sn = StandardNormalizer(breastCancer[continuousAttributes])

        # Test range normalizer: calling fit() before train_fit() must raise
        with self.assertRaises(UntrainedUtilityError):
            rn.fit(breastCancer[continuousAttributes])

        # Test standard normalizer: same failure mode
        with self.assertRaises(UntrainedUtilityError):
            sn.fit(breastCancer[continuousAttributes])
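
    # The test above only exercises the failure path; the intended order is
    # train_fit() first, then fit(). A minimal sketch of the passing
    # counterpart, using the same normalizer API seen throughout these
    # examples (the assertion is illustrative).
    def test_trained(self):
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        breastCancer = dataRetriever.retrieveData("breastCancer")
        cols = dataRetriever.getContinuousAttributes()

        sn = StandardNormalizer(breastCancer[cols])
        sn.train_fit()                                # learn mean/std
        normalized = sn.fit(breastCancer[cols])       # reuse learned params
        self.assertEqual(normalized.shape, breastCancer[cols].shape)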
print(f"Creating CSV for {dataSetName}")
data.retrieveData(dataSetName)

maxIter = 100
kValue = 78

# These are only used for image segmentation and abalone
# frac = .25
# random_state = 69
# kValue = m.floor(frac * kValue)

dataSetUnNormalized = data.getDataSet()
# dataSetUnNormalized[data.getDataClass()] = np.log(dataSetUnNormalized[data.getDataClass()] + 0.001)  # Forest Fires only

sn = StandardNormalizer(dataSetUnNormalized[data.getContinuousAttributes()])
dataSetUnNormalized[data.getContinuousAttributes()] = sn.train_fit()

# Note: train_fit() above already normalized dataSetUnNormalized in place.
dataSetNormalized = dataSetUnNormalized

# dataSetNormalized = dataSetNormalized.sample(frac=frac, random_state=random_state)
# dataSetNormalized = dataSetNormalized.reset_index()

# dataSetNormalized = dataSetNormalized.drop(["idNumber"], axis=1)  # Glass only

medoids = KMediods(dataSetNormalized, data.getDataClass(),
                   data.getDescreteAttributes(),
                   data.getContinuousAttributes(), data.getPredictionType(),
                   kValue, maxIter)

medoids.to_csv('./CSVOutput/' + "normalized" + dataSetName +
               "MedoidsClustered.csv", index=False)
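
# The CSV written above is the same kind of file Example no. 6 reads back as
# a condensed training set. A short round-trip sketch, assuming
# dataSetName = "abalone" as in that example.
import pandas as pd

medoidsTrain = pd.read_csv('./CSVOutput/' + "normalized" + dataSetName +
                           "MedoidsClustered.csv")
print(medoidsTrain.shape)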
Example no. 4
metrics = []
fold = 0


test_set = test_set.reset_index(drop=True)
train_set = train_set.reset_index(drop=True)
ohe = OneHotEncoder()
discrete_attr = dataRetriever.getDescreteAttributes()
if dataRetriever.getDataClass() in discrete_attr:
    discrete_attr.remove(dataRetriever.getDataClass())

train_set = ohe.train_fit(train_set, discrete_attr)
test_set = ohe.fit(test_set)

#  Normalize Data
sn = StandardNormalizer(train_set[dataRetriever.getContinuousAttributes()])
train_set[dataRetriever.getContinuousAttributes()] = sn.train_fit()
test_set[dataRetriever.getContinuousAttributes()] = sn.fit(test_set[dataRetriever.getContinuousAttributes()])

# Train the network; the architecture is adjusted with respect to the data set
nn = NeuralNetwork(train_set, 2, [6, 16], dataRetriever.getPredictionType(), dataRetriever.getDataClass())
fitness_matrix, average_fitness = nn._particle_swarm_optimize(70, max_iter=500)


predictions = nn._feed_forward(test_set.drop(dataRetriever.getDataClass(), axis=1), testing=True)

actual = test_set[dataRetriever.getDataClass()]
metrics = np.asarray(metrics)

fig, ax = plt.subplots(3)
ax[0].plot(fitness_matrix[:,0], label="1")
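
# The plotting code is truncated above. A plausible continuation, assuming
# the three columns of fitness_matrix carry per-iteration max/min/mean
# fitness as in the fitness DataFrame of Example no. 1 (that column layout
# is an assumption).
ax[1].plot(fitness_matrix[:, 1], label="2")
ax[2].plot(fitness_matrix[:, 2], label="3")
for a in ax:
    a.legend()
ax[2].set_xlabel("Iteration")
plt.show()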
Example no. 5
class_col = dataRetriever.getDataClass()
# data[class_col] = np.log(data[class_col] + 0.001)

contAttr = dataRetriever.getContinuousAttributes()
discAttr = dataRetriever.getDescreteAttributes()
predictionType = dataRetriever.getPredictionType()

output_json = {}
iter_num = 0

for test, train in KFolds(data, 5, stratisfied=True, class_col=class_col):

    # KFolds cannot return a validation set directly, so k is set to half
    # the desired value and each test fold is split in two below: one half
    # for validation, one half for testing.

    sn = StandardNormalizer(train[contAttr])
    train[contAttr] = sn.train_fit()

    test1 = test.sample(frac=0.5, random_state=13)
    test2 = test.drop(test1.index)

    train = train.reset_index(drop=True)
    test1 = test1.reset_index(drop=True)
    test2 = test2.reset_index(drop=True)

    test1[contAttr] = sn.fit(test1[contAttr])
    test2[contAttr] = sn.fit(test2[contAttr])

    k_vals = [1, 3, 5, 7, int(np.floor(np.sqrt(len(train))))]

    print(f"Fold {iter_num}")
Example no. 6
centroidsTrain = pd.read_csv("CSVOutput/normalizedabaloneKMeansClustered.csv")
medoidsTrain = pd.read_csv("CSVOutput/normalizedabaloneMedoidsClustered.csv")

contAttr = dataRetriever.getContinuousAttributes()
discAttr = dataRetriever.getDescreteAttributes()
predictionType = dataRetriever.getPredictionType()

output_json = {}
iter_num = 0

for test, train in tqdm(KFolds(data, 10), total=10):
    k_vals = [1, 3, 5, 7, int(np.floor(np.sqrt(len(train))))]

    #Normalize data
    sn = StandardNormalizer(train[contAttr + [class_col]])
    train[contAttr + [class_col]] = sn.train_fit()
    test[contAttr + [class_col]] = sn.fit(test[contAttr + [class_col]])

    # print("KNN")
    KNN = KNearestNeighbor(test.drop(class_col, axis=1),
                           train,
                           k_vals,
                           contAttr,
                           discAttr,
                           unknown_col=class_col,
                           predictionType=predictionType)
    # print("Cent")
    centKNN = KNearestNeighbor(test.drop(class_col, axis=1),
                               centroidsTrain, [1, 3, 5, 7, 10],
                               contAttr,
                               discAttr,
                               unknown_col=class_col,
                               predictionType=predictionType)
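
    # medoidsTrain is loaded above but unused in the visible excerpt;
    # presumably a third, parallel call follows for the medoid-reduced set.
    # A sketch mirroring the two constructors above.
    medKNN = KNearestNeighbor(test.drop(class_col, axis=1),
                              medoidsTrain, [1, 3, 5, 7, 10],
                              contAttr,
                              discAttr,
                              unknown_col=class_col,
                              predictionType=predictionType)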