Example 1
    def get_result(self, problem, pr_number, skillName, use_hints=True):
        effects = self.get_prereq_effects(skillName)

        problem = 1-problem

        knowledge_p = self.knowledge + effects[0]
        # knowledge_p = problem + effects[0]
        speed_p = self.speed + effects[1]
        hint_p = self.hint + effects[2]

        # print [knowledge_p, self.knowledge_std]

        answer = du.clamp(np.random.normal(knowledge_p, self.knowledge_std), 0, 1)
        pr_hint = hint_p / pr_number

        hint = int(du.diceRoll(1000) < (pr_hint*1000))

        cor = 0
        if answer > problem:
            answer = du.clamp(np.random.normal(0.9, self.knowledge_std), 0, 1)
        else:
            answer *= ((0.3-(problem-0.3))/0.3)

        cor = int(du.diceRoll(1000) < answer*1000) * (1-(hint*int(use_hints)))
        time = du.clamp(np.random.normal(speed_p, self.speed_std), 0, 10000) * problem
        time += du.MAX(0, np.random.normal(Student.hint_time_offset,
                                           Student.hint_time_offset_std)) * hint

        return [cor, time, hint]
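
The du helper module never appears in these examples; below is a minimal sketch of the three helpers this method leans on (clamp, diceRoll, MAX), under the assumption that diceRoll(n) returns a uniform integer in [0, n):

import random


def clamp(value, lo, hi):
    # Constrain value to the closed interval [lo, hi].
    return max(lo, min(hi, value))


def diceRoll(sides):
    # Assumed semantics: uniform integer in [0, sides), so
    # diceRoll(1000) < p * 1000 is True with probability roughly p.
    return random.randrange(sides)


def MAX(a, b):
    # Thin wrapper over the built-in max(), matching how du.MAX is called above.
    return max(a, b)
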
Example 2
def generate_data_package(fold: int, tenfolds: list, regression: bool,
                          du: DataUtility):
    test_data, test_labels = copy.deepcopy(tenfolds[fold])
    remaining_data = [
        x[0] for i, x in enumerate(copy.deepcopy(tenfolds)) if i != fold
    ]
    remaining_labels = [
        y[1] for i, y in enumerate(copy.deepcopy(tenfolds)) if i != fold
    ]
    #Store off a set of the remaining dataset
    training_data = np.concatenate(remaining_data, axis=1)
    #Store the remaining data set labels
    training_labels = np.concatenate(remaining_labels, axis=1)

    if regression == True:
        #The number of output nodes is 1
        output_size = 1
    #else it is a classification data set
    else:
        #Count the number of classes in the label data set
        output_size = du.CountClasses(training_labels)
        #Get the test data labels in one hot encoding
        test_labels = du.ConvertLabels(test_labels, output_size)
        #Get the Labels into a One hot encoding
        training_labels = du.ConvertLabels(training_labels, output_size)

    input_size = training_data.shape[0]
    return [
        test_data, test_labels, training_data, training_labels, output_size,
        input_size
    ]
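
A hedged usage sketch for generate_data_package, assuming the DataUtility configuration and the Dataset_and_Labels fold loader that appear in Example 19 (the "glass" data set name is only illustrative):

du = DataUtility.DataUtility(categorical_attribute_indices, regression_data_set)
tenfolds = du.Dataset_and_Labels("glass")

for fold in range(10):
    (test_data, test_labels, training_data, training_labels,
     output_size, input_size) = generate_data_package(
         fold, tenfolds, regression=regression_data_set["glass"], du=du)
    # train and evaluate a network on this fold here
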
Example 3
def compress_json_files():
    print("Compressing JSON-files")

    for data_set_type in [DataSetType.TRAINING, DataSetType.TEST, DataSetType.RECORDED]:
        path = DataUtility.get_data_set_path(DataSetFormat.RAW, data_set_type)
        raw_filelist = DataUtility.generate_file_list(path)

        for file in raw_filelist:
            if Utility.is_file_already_compressed(file, data_set_type):
                continue

            Utility.compress_json_file(file, data_set_type)

    print("Finshed compressing!")
Example 4
def split_for_autoencoding(samples):
    ini_input = []
    ini_output = []
    rem_input = []
    rem_output = []

    for samp in samples:
        ini_input.append(samp[0])
        ini_output.append(samp[1])
        rem_input.append(samp[1])
        rem_output.append(samp[2])

    return du.convert_to_floats(ini_input), du.convert_to_floats(ini_output),\
           du.convert_to_floats(rem_input), du.convert_to_floats(rem_output)
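
Each element of samples is expected to expose a [pre, post, rem, ...] layout, as built by load_skill_data in Example 24; a small illustrative call with made-up feature values, assuming du.convert_to_floats maps nested lists to floats:

samples = [
    [[1, 0, 0.2, 30, 1], [1, 0, 0.1, 25, 0], [1, 0, 0.0, 20, 0], ['A', 'B']],
    [[0, 1, 0.5, 60, 2], [1, 0, 0.3, 40, 1], [1, 0, 0.1, 35, 0], ['B', 'C']],
]
# ini_* pairs the "pre" vectors with the "post" vectors as autoencoder
# input/output; rem_* does the same for "post" -> "rem".
ini_in, ini_out, rem_in, rem_out = split_for_autoencoding(samples)
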
Example 5
def compress_json_file(file, data_set_type):
    print("Compressing file: " + file.filename)
    raw_data = get_json_data_from_file(file)

    compressed_data = {}

    json_array_name_list = [
        Constant.JSON_EMG_ARRAY_NAME, Constant.JSON_ACC_ARRAY_NAME,
        Constant.JSON_GYR_ARRAY_NAME, Constant.JSON_ORI_ARRAY_NAME
    ]
    data_length_list = [
        Constant.DATA_LENGTH_EMG, Constant.DATA_LENGTH_ACC,
        Constant.DATA_LENGTH_GYR, Constant.DATA_LENGTH_ORI
    ]

    for json_array_name, data_length in zip(json_array_name_list,
                                            data_length_list):
        compressed_data[json_array_name] = {}
        # if file.is_recorded:
        #     transposed_raw_data = numpy.transpose(raw_data[json_array_name][Constant.JSON_ARRAY_DATA_TABLE_NAME][:data_length]).tolist()
        # else:
        #     transposed_raw_data = raw_data[json_array_name][Constant.JSON_ARRAY_DATA_TABLE_NAME][:data_length]
        transposed_raw_data = raw_data[json_array_name][
            Constant.JSON_ARRAY_DATA_TABLE_NAME][:data_length]
        compressed_data[json_array_name][
            Constant.JSON_ARRAY_DATA_TABLE_NAME] = transposed_raw_data

    compressed_file_path = DataUtility.get_data_set_path(
        DataSetFormat.COMPRESSED, data_set_type) + file.filename
    with open(compressed_file_path, 'w') as outfile:
        json.dump(compressed_data, outfile)
Example 6
    def load_unlabeled_data(filename,
                            primary_column,
                            secondary_column,
                            covariate_columns,
                            load_from_file=False):
        # load from file or rebuild dataset
        load = load_from_file

        data = None
        if not load:
            data, headers = du.loadCSVwithHeaders(filename)

            for i in range(0, len(headers)):
                print '{:>2}:  {:<18} {:<12}'.format(str(i), headers[i],
                                                     data[0][i])
        else:
            print 'Skipping dataset loading - using cached data instead'

        print '\ntransforming data to time series...'
        pdata, labels, grouping = RNN.build_sequences(data, primary_column,
                                                      secondary_column,
                                                      covariate_columns,
                                                      [1, 2])

        print '\nDataset Info:'
        print 'number of samples:', len(pdata)
        print 'sequence length of first sample:', len(pdata[0])
        print 'input nodes: ', len(pdata[0][0])

        return pdata, labels, grouping
Example 7
    def get_label_distribution(labels):
        flat_labels = RNN.flatten_sequence(labels)

        labels = du.transpose(flat_labels)

        dist = []
        for i in range(0, len(labels)):
            dist.append(
                (float(np.nansum(np.array(labels[i]))) / len(labels[i])))
        return dist
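
get_label_distribution simply reports the per-column positive rate of the flattened label matrix; a self-contained equivalent without the RNN and du helpers, assuming binary label columns:

import numpy as np


def label_distribution(flat_labels):
    arr = np.array(flat_labels, dtype=float)
    # Fraction of positive entries in each label column.
    return (np.nansum(arr, axis=0) / arr.shape[0]).tolist()


print(label_distribution([[1, 0, 0], [0, 1, 0], [1, 0, 0]]))
# -> [0.666..., 0.333..., 0.0]
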
Example 8
    def create_json_file(self, filename):
        print("Creating file:", filename)

        json_data = {}
        for sensor in range(Sensor.NUMBER_OF_SENSORS):
            json_array_name = Utility.get_json_array_name_for_sensor(sensor)
            json_data_table_name = Constant.JSON_ARRAY_DATA_TABLE_NAME

            json_data[json_array_name] = {}
            json_data[json_array_name][
                json_data_table_name] = self.get_sensor_data(sensor)

        folder_path = DataUtility.get_data_set_path(DataSetFormat.RAW,
                                                    DataSetType.RECORDED)
        with open(folder_path + filename, 'w') as outfile:
            json.dump(json_data, outfile)

        o_file = DataUtility.File(folder_path, filename, None)

        return o_file
Example 9
    def get_prereq_effects(self, skillName):
        knowledge_effect = 0
        speed_effect = 0
        hint_effect = 0

        for sk in SkillLink.list:
            if sk.postreq == skillName and du.exists(sk.prereq,self.completed_assignments):
                knowledge_effect += sk.get_knowledge_effect()
                speed_effect += sk.get_speed_effect()
                hint_effect += sk.get_hint_effect()

        return [knowledge_effect, speed_effect, hint_effect]
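
The SkillLink class itself is not included in these snippets; below is a minimal sketch of the interface this loop assumes (a class-level registry plus the effect samplers shown in Example 25; the constructor parameters and default values are guesses):

import numpy as np


class SkillLink(object):
    list = []  # class-level registry that get_prereq_effects iterates

    def __init__(self, prereq, postreq,
                 knowledge_effect=0.1, knowledge_effect_std=0.02,
                 speed_effect=0.0, speed_effect_std=0.0,
                 hint_effect=0.0, hint_effect_std=0.0):
        self.prereq = prereq
        self.postreq = postreq
        self.knowledge_effect = knowledge_effect
        self.knowledge_effect_std = knowledge_effect_std
        self.speed_effect = speed_effect
        self.speed_effect_std = speed_effect_std
        self.hint_effect = hint_effect
        self.hint_effect_std = hint_effect_std
        SkillLink.list.append(self)

    def get_knowledge_effect(self):
        return max(np.random.normal(self.knowledge_effect,
                                    self.knowledge_effect_std), 0)

    def get_speed_effect(self):
        return max(np.random.normal(self.speed_effect, self.speed_effect_std), 0)

    def get_hint_effect(self):
        return max(np.random.normal(self.hint_effect, self.hint_effect_std), 0)
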
Example 10
    def get_result(self, problem, pr_number, skillName, use_hints=True):
        effects = self.get_prereq_effects(skillName)

        problem = 1 - problem

        knowledge_p = self.knowledge + effects[0]
        # knowledge_p = problem + effects[0]
        speed_p = self.speed + effects[1]
        hint_p = self.hint + effects[2]

        # print [knowledge_p, self.knowledge_std]

        answer = du.clamp(np.random.normal(knowledge_p, self.knowledge_std), 0,
                          1)
        pr_hint = hint_p / pr_number

        hint = int(du.diceRoll(1000) < (pr_hint * 1000))

        cor = 0
        if answer > problem:
            answer = du.clamp(np.random.normal(0.9, self.knowledge_std), 0, 1)
        else:
            answer *= ((0.3 - (problem - 0.3)) / 0.3)

        cor = int(
            du.diceRoll(1000) < answer * 1000) * (1 - (hint * int(use_hints)))
        time = du.clamp(np.random.normal(speed_p, self.speed_std), 0,
                        10000) * problem
        time += du.MAX(
            0,
            np.random.normal(Student.hint_time_offset,
                             Student.hint_time_offset_std)) * hint

        return [cor, time, hint]
Example 11
    def get_prereq_effects(self, skillName):
        knowledge_effect = 0
        speed_effect = 0
        hint_effect = 0

        for sk in SkillLink.list:
            if sk.postreq == skillName and du.exists(
                    sk.prereq, self.completed_assignments):
                knowledge_effect += sk.get_knowledge_effect()
                speed_effect += sk.get_speed_effect()
                hint_effect += sk.get_hint_effect()

        return [knowledge_effect, speed_effect, hint_effect]
Example 12
def generate_data_package(fold: int, tenfolds: list, regression: bool, du: DataUtility):
    # get the fold we are going to use for testing 
    test_data, test_labels = copy.deepcopy(tenfolds[fold])
    # squish the rest of the data and ground truth labels into one numpy array, respectively
    remaining_data = [x[0] for i, x in enumerate(copy.deepcopy(tenfolds)) if i!=fold]
    remaining_labels = [y[1] for i, y in enumerate(copy.deepcopy(tenfolds)) if i!=fold]
    training_data = np.concatenate(remaining_data, axis=1) 
    training_labels = np.concatenate(remaining_labels, axis=1)
    # determine how many output nodes the network has (1 if regression)
    if regression == True:
        #The number of output nodes is 1 
        output_size = 1
    #else it is a classification data set 
    else:
        #Count the number of classes in the label data set 
        output_size = du.CountClasses(training_labels)
        #Get the test data labels in one hot encoding 
        test_labels = du.ConvertLabels(test_labels, output_size)
        #Get the Labels into a One hot encoding 
        training_labels = du.ConvertLabels(training_labels, output_size)

    input_size = training_data.shape[0]
    return [test_data, test_labels, training_data, training_labels, output_size, input_size]
Example 13
    def print_label_distribution(labels, label_names=None):
        print "\nLabel Distribution:"

        flat_labels = RNN.flatten_sequence(labels)
        labels = du.transpose(flat_labels)

        if label_names is not None:
            assert len(label_names) == len(labels)
        else:
            label_names = []
            for i in range(0, len(labels)):
                label_names[i] = "Label_" + str(i)

        for i in range(0, len(labels)):
            print "   " + label_names[i] + ":", "{:<6}".format(np.nansum(np.array(labels[i]))), \
                "({0:.0f}%)".format((float(np.nansum(np.array(labels[i]))) / len(labels[i])) * 100)
Example 14
def add_representation(data,labels,label_column,duplicate=10,threshold=0.0):
    assert len(data) == len(labels)
    print "Adding Representation to label:",label_column
    ndata = []
    nlabel = []
    for i in range(0,len(data)):
        represent = 1
        if isinstance(labels[i], list):
            if np.nanmean(labels[i], 0)[label_column] > threshold:
                represent = duplicate
        else:
            if labels[i][label_column] > threshold:
                represent = duplicate

        for j in range(0,represent):
            ndata.append(data[i])
            nlabel.append(labels[i])

    ndata,nlabel = du.shuffle(ndata,nlabel)
    return np.array(ndata),np.array(nlabel)
Example 15
    def test(self, samples, test_labels,label_names=None):
        # test each using held-out data
        test = samples

        # if test_labels is None:
        #     return self.predict(test_samples)

        label_test = test_labels
        print("\nTesting...")
        print "Test Samples:", len(test)

        classes = []
        p_count = 0

        avg_class_err = []
        avg_err = self.test_network(test, label_test)

        predictions = self.predict_network(test)

        for i in range(0, len(label_test)):
            p_count += 1
            classes.append(label_test[i].tolist())


        predictions = np.round(predictions, 3).tolist()

        actual = []
        pred = []
        cor = []

        # get the percent correct for the predictions
        # how often the prediction is right when it is made
        for i in range(0, len(predictions)):
            c = classes[i].index(max(classes[i]))
            actual.append(c)

            p = predictions[i].index(max(predictions[i]))
            pred.append(p)
            cor.append(int(c == p))

        # calculate a naive unfair baseline using averages
        avg_class_pred = np.mean(label_test, 0)

        print "Predicting:", avg_class_pred, "for baseline*"
        for i in range(0, len(label_test)):
            res = FFNNet.AverageCrossEntropy(np.array(avg_class_pred), np.array(classes[i]))
            avg_class_err.append(res)
            # res = RNN_GRU.AverageCrossEntropy(np.array(predictions_GRU[i]), np.array(classes[i]))
            # avg_err_GRU.append(res)
        print "*This is calculated from the TEST labels"

        from sklearn.metrics import roc_auc_score, f1_score
        from skll.metrics import kappa

        kpa = []
        auc = []
        f1s = []
        t_pred = du.transpose(predictions)
        t_lab = du.transpose(label_test)

        for i in range(0, len(t_lab)):
            # if i == 0 or i == 3:
            #    t_pred[i] = du.normalize(t_pred[i],method='max')
            kpa.append(kappa(t_lab[i], t_pred[i]))
            auc.append(roc_auc_score(t_lab[i], t_pred[i]))
            temp_p = [round(j) for j in t_pred[i]]
            if np.nanmax(temp_p) == 0:
                f1s.append(0)
            else:
                f1s.append(f1_score(t_lab[i], temp_p))

        print "\nBaseline Average Cross-Entropy:", "{0:.4f}".format(np.nanmean(avg_class_err))
        print "\nNetwork Performance:"
        print "Average Cross-Entropy:", "{0:.4f}".format(np.nanmean(avg_err))
        print "AUC:", "{0:.4f}".format(np.nanmean(auc))
        print "Kappa:", "{0:.4f}".format(np.nanmean(kpa))
        print "F1 Score:", "{0:.4f}".format(np.nanmean(f1s))
        print "Percent Correct:", "{0:.2f}%".format(np.nanmean(cor) * 100)

        print "\n{:<15}".format("  Label"), \
            "{:<9}".format("  AUC"), \
            "{:<9}".format("  Kappa"), \
            "{:<9}".format("  F Stat"), \
            "\n=============================================="

        if label_names is None or len(label_names) != len(t_lab):
            label_names = []
            for i in range(0, len(t_lab)):
                label_names.append("Label " + str(i + 1))

        for i in range(0, len(t_lab)):
            print "{:<15}".format(label_names[i]), \
                "{:<9}".format("  {0:.4f}".format(auc[i])), \
                "{:<9}".format("  {0:.4f}".format(kpa[i])), \
                "{:<9}".format("  {0:.4f}".format(f1s[i]))
        print "\n=============================================="
        actual = []
        predicted = []
        for i in range(0, len(predictions)):
            actual.append(label_test[i].tolist().index(max(label_test[i])))
            predicted.append(predictions[i].index(max(predictions[i])))

        from sklearn.metrics import confusion_matrix
        print confusion_matrix(actual, predicted)

        return predictions
Example 16
    def __init__(self, name, difficulty=0.5, difficulty_std=0.1):
        self.problems = []
        self.name = name
        for i in range(0, 10000):
            self.problems.append(
                du.clamp(np.random.normal(difficulty, difficulty_std), 0, 1))
Example 17
    def next_problem(self):
        return self.problems[du.rand(0, len(self.problems))]
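
A runnable sketch combining Examples 16 and 17, with a stand-in for the unshown du module (the clamp and rand semantics are assumptions):

import numpy as np


class _du(object):  # stand-in for the missing du module
    @staticmethod
    def clamp(v, lo, hi):
        return max(lo, min(hi, v))

    @staticmethod
    def rand(lo, hi):
        return np.random.randint(lo, hi)  # uniform integer in [lo, hi)


du = _du()


class Skill(object):
    def __init__(self, name, difficulty=0.5, difficulty_std=0.1):
        self.name = name
        self.problems = [
            du.clamp(np.random.normal(difficulty, difficulty_std), 0, 1)
            for _ in range(10000)
        ]

    def next_problem(self):
        return self.problems[du.rand(0, len(self.problems))]


A = Skill('A', 0.6, 0.05)   # difficulty = probability of correctness (Example 25)
print(A.next_problem())
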
Example 18
    return du.convert_to_floats(ini_input), du.convert_to_floats(ini_output),\
           du.convert_to_floats(rem_input), du.convert_to_floats(rem_output)


if __name__ == "__main__":

    # load training and test data

    training = []
    tr_label = []
    testing = []
    test_label = []

    samples,labels = load_skill_data('simulated_data.csv','simulated_hierarchy.csv','simulated_hierarchy_nonlink.csv')
    tr_samples, t_samples, tr_labels,t_labels = du.split_training_test(samples,labels)

    t_tr_labels = du.transpose(tr_labels)
    import math
    pre_rep = int(math.floor((len(t_tr_labels[0]) / np.nansum(t_tr_labels[0])) + 1))
    non_rep = int(math.floor((len(t_tr_labels[1]) / np.nansum(t_tr_labels[1])) + 1))
    rev_rep = int(math.floor((len(t_tr_labels[2]) / np.nansum(t_tr_labels[2])) + 1))

    print pre_rep, non_rep, rev_rep

    re_tr_samples, re_tr_labels = add_representation(tr_samples, tr_labels, 0, pre_rep)
    re_tr_samples, re_tr_labels = add_representation(re_tr_samples, re_tr_labels, 1, non_rep)
    re_tr_samples, re_tr_labels = add_representation(re_tr_samples, re_tr_labels, 2, rev_rep)

    re_tr_samples, re_tr_labels = du.sample(re_tr_samples,re_tr_labels,p=0.2)
Example 19
def main():
    print("Program Start")
    headers = [
        "Data set", "layers", "pop", "Beta", "CR", "generations", "loss1",
        "loss2"
    ]
    filename = 'DE_experimental_resultsFINAL.csv'

    Per = Performance.Results()
    Per.PipeToFile([], headers, filename)

    data_sets = [
        "soybean", "glass", "abalone", "Cancer", "forestfires", "machine"
    ]

    regression_data_set = {
        "soybean": False,
        "Cancer": False,
        "glass": False,
        "forestfires": True,
        "machine": True,
        "abalone": True
    }
    categorical_attribute_indices = {
        "soybean": [],
        "Cancer": [],
        "glass": [],
        "forestfires": [],
        "machine": [],
        "abalone": []
    }

    tuned_0_hl = {
        "soybean": {
            "omega": .5,
            "c1": .1,
            "c2": 5,
            "hidden_layer": []
        },
        "Cancer": {
            "omega": .5,
            "c1": .5,
            "c2": 5,
            "hidden_layer": []
        },
        "glass": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": []
        },
        "forestfires": {
            "omega": .2,
            "c1": 5,
            "c2": .5,
            "hidden_layer": []
        },
        "machine": {
            "omega": .5,
            "c1": .9,
            "c2": 5,
            "hidden_layer": []
        },
        "abalone": {
            "omega": .2,
            "c1": 5,
            "c2": .9,
            "hidden_layer": []
        }
    }

    tuned_1_hl = {
        "soybean": {
            "omega": .5,
            "c1": .5,
            "c2": 1,
            "hidden_layer": [7]
        },
        "Cancer": {
            "omega": .2,
            "c1": .5,
            "c2": 5,
            "hidden_layer": [4]
        },
        "glass": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": [8]
        },
        "forestfires": {
            "omega": .2,
            "c1": 5,
            "c2": 5,
            "hidden_layer": [8]
        },
        "machine": {
            "omega": .5,
            "c1": 5,
            "c2": .5,
            "hidden_layer": [4]
        },
        "abalone": {
            "omega": .2,
            "c1": .1,
            "c2": 5,
            "hidden_layer": [8]
        }
    }

    tuned_2_hl = {
        "soybean": {
            "omega": .5,
            "c1": .9,
            "c2": .1,
            "hidden_layer": [7, 12]
        },
        "Cancer": {
            "omega": .2,
            "c1": .5,
            "c2": 5,
            "hidden_layer": [4, 4]
        },
        "glass": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": [8, 6]
        },
        "forestfires": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": [8, 8]
        },
        "machine": {
            "omega": .2,
            "c1": .9,
            "c2": .1,
            "hidden_layer": [7, 2]
        },
        "abalone": {
            "omega": .2,
            "c1": 5,
            "c2": 5,
            "hidden_layer": [6, 8]
        }
    }
    du = DataUtility.DataUtility(categorical_attribute_indices,
                                 regression_data_set)
    total_counter = 0
    for data_set in data_sets:
        data_set_counter = 0
        # ten fold data and labels is a list of [data, labels] pairs, where
        # data and labels are numpy arrays:
        tenfold_data_and_labels = du.Dataset_and_Labels(data_set)

        for j in range(10):
            test_data, test_labels = copy.deepcopy(tenfold_data_and_labels[j])
            #Append all data folds to the training data set
            remaining_data = [
                x[0] for i, x in enumerate(tenfold_data_and_labels) if i != j
            ]
            remaining_labels = [
                y[1] for i, y in enumerate(tenfold_data_and_labels) if i != j
            ]
            #Store off a set of the remaining dataset
            X = np.concatenate(remaining_data, axis=1)
            #Store the remaining data set labels
            labels = np.concatenate(remaining_labels, axis=1)
            print(data_set, "training data prepared")
            regression = regression_data_set[data_set]
            #If the data set is a regression dataset
            if regression == True:
                #The number of output nodes is 1
                output_size = 1
            #else it is a classification data set
            else:
                #Count the number of classes in the label data set
                output_size = du.CountClasses(labels)
                #Get the test data labels in one hot encoding
                test_labels = du.ConvertLabels(test_labels, output_size)
                #Get the Labels into a One hot encoding
                labels = du.ConvertLabels(labels, output_size)

            input_size = X.shape[0]

            data_set_size = X.shape[1] + test_data.shape[1]

            tuned_parameters = [
                tuned_0_hl[data_set], tuned_1_hl[data_set],
                tuned_2_hl[data_set]
            ]
            for z in range(3):
                hidden_layers = tuned_parameters[z]["hidden_layer"]

                layers = [input_size] + hidden_layers + [output_size]

                nn = NeuralNetwork(input_size, hidden_layers, regression,
                                   output_size)
                nn.set_input_data(X, labels)

                total_weights = 0
                for i in range(len(layers) - 1):
                    total_weights += layers[i] * layers[i + 1]

                hyperparameters = {
                    "population_size": 10 * total_weights,
                    "beta": .5,
                    "crossover_rate": .6,
                    "max_gen": 100
                }
                hyperparameterss = {
                    "maxGen": 100,
                    "pop_size": 100,
                    "mutation_rate": .5,
                    "mutation_range": 10,
                    "crossover_rate": .5
                }
                hyperparametersss = {
                    "position_range": 10,
                    "velocity_range": 1,
                    "omega": .1,
                    # tuned_parameters[z]["omega"],
                    "c1": .9,
                    # tuned_parameters[z]["c1"],
                    "c2": .1,
                    # tuned_parameters[z]["c2"],
                    "vmax": 1,
                    "pop_size": 1000,
                    "max_t": 50
                }
                de = DE.DE(hyperparameters, total_weights, nn)
                ga = GA.GA(hyperparameterss, total_weights, nn)
                pso = PSO.PSO(layers, hyperparametersss, nn)
                learning_rate = 3
                momentum = 0
                VNN = VideoNN.NeuralNetworks(input_size, hidden_layers,
                                             regression, output_size,
                                             learning_rate, momentum)

                counter = 0
                print("DE OPERATIONS ")
                for gen in range(de.maxgens):
                    if counter == 1:
                        break
                    print("MUTATE AND CROSS OVER ")
                    de.Pmutate_and_crossover()
                    counter = counter + 1
                time.sleep(200)
                counter = 0
                print("GA OPERATIONS")
                for gen in range(ga.maxGen):
                    if counter == 1:
                        break
                    print()
                    ga.pfitness()
                    ga.Pselection()
                    ga.Pcrossover()
                    counter = counter + 1
                time.sleep(200)
                counter = 0
                print("PSO OPERATIONS")
                for epoch in range(pso.max_t):
                    if counter == 1:
                        break
                    pso.Pupdate_fitness()
                    pso.Pupdate_position_and_velocity()
                    counter = counter + 1
                time.sleep(200)
                # plt.plot(list(range(len(de.globalbest))), de.globalbest)
                # plt.draw()
                # plt.pause(0.00001)
                #plt.clf()
                # get the best overall solution and set the NN to those weights
                #DE
                bestSolution = de.bestChromie.getchromie()
                bestWeights = de.nn.weight_transform(bestSolution)
                de.nn.weights = bestWeights
                #GA

                #PS

                #   ################################ new code for de end ###################################
                # plt.ioff()
                # plt.plot(list(range(len(de.globalbest))), de.globalbest)
                # plt.show()
                # img_name = data_set + '_l' + str(len(hidden_layers)) + '_pr' + str(a) + '_vr' + str(b) + '_w' + str(c) + '_c' + str(d) + '_cc' + str(e) + '_v' + str(f) + '_ps' + str(g) + '.png'
                # plt.savefig('tuning_plots/' + img_name)
                # plt.clf()
                Estimation_Values = de.nn.classify(test_data, test_labels)
                if regression == False:
                    #Decode the One Hot encoding Value
                    Estimation_Values = de.nn.PickLargest(Estimation_Values)
                    test_labels_list = de.nn.PickLargest(test_labels)
                    # print("ESTiMATION VALUES BY GIVEN INDEX (CLASS GUESS) ")
                    # print(Estimation_Values)
                else:
                    Estimation_Values = Estimation_Values.tolist()
                    test_labels_list = test_labels.tolist()[0]
                    Estimation_Values = Estimation_Values[0]

                Estimat = Estimation_Values
                groun = test_labels_list

                Nice = Per.ConvertResultsDataStructure(groun, Estimat)
                # print("THE GROUND VERSUS ESTIMATION:")
                # print(Nice)

                # headers = ["Data set", "layers", "pop", "Beta", "CR", "generations", "loss1", "loss2"]
                Meta = [
                    data_set,
                    len(hidden_layers), hyperparameters["population_size"],
                    hyperparameters["beta"], hyperparameters["crossover_rate"],
                    hyperparameters["max_gen"]
                ]

                Per.StartLossFunction(regression, Nice, Meta, filename)
                print(f"{data_set_counter}/30 {data_set}. {total_counter}/180")
                data_set_counter += 1
                total_counter += 1
                print("DEMO FINISHED")
                time.sleep(10000)

    print("Program End ")
Example 20
def is_file_already_compressed(file, data_set_type):
    compressed_file_path = DataUtility.get_data_set_path(
        DataSetFormat.COMPRESSED, data_set_type) + file.filename
    return os.path.exists(compressed_file_path)
Example 21
    def test(self, test, test_labels=None, label_names=None):
        if test_labels is None:
            return self.predict(test)
        test_cpy = list(test)
        if not du.len_deepest(test_cpy) == self.num_input:
            if self.covariates is not None:
                for a in range(0, len(test_cpy)):
                    if type(test_cpy[a]) is not list:
                        test_cpy[a] = test_cpy[a].tolist()
                    for e in range(0, len(test_cpy[a])):
                        c = []
                        for i in range(0, len(self.covariates)):
                            c.append(test_cpy[a][e][self.covariates[i]])
                        test_cpy[a][e] = c

        if len(self.cov_mean) == 0 or len(self.cov_stdev) == 0:
            print "Scaling factors have not been generated: calculating using test sample"
            t_tr = du.transpose(RNN.flatten_sequence(test_cpy))
            self.cov_mean = []
            self.cov_stdev = []

            for a in range(0, len(t_tr)):
                mn = np.nanmean(t_tr[a])
                sd = np.nanstd(t_tr[a])
                self.cov_mean.append(mn)
                self.cov_stdev.append(sd)

        test_samples = []

        import math
        for a in range(0, len(test_cpy)):
            sample = []
            for e in range(0, len(test_cpy[a])):
                covariates = []
                for i in range(0, len(test_cpy[a][e])):
                    cov = 0
                    if self.cov_stdev[i] == 0:
                        cov = 0
                    else:
                        cov = (test_cpy[a][e][i] -
                               self.cov_mean[i]) / self.cov_stdev[i]

                    if math.isnan(cov) or math.isinf(cov):
                        cov = 0

                    covariates.append(cov)
                sample.append(covariates)
            test_samples.append(sample)

        label_test = test_labels
        print("\nTesting...")
        print "Test Samples:", len(test_samples)

        classes = []
        p_count = 0

        avg_class_err = []
        avg_err_RNN = []

        if self.scale_output:
            print "Scaling output..."

        predictions_RNN = []
        for i in range(0, len(test_samples)):
            # get the prediction and calculate cost
            prediction_RNN = self.pred_RNN([test_samples[i]])
            #prediction_RNN += .5-self.avg_preds
            if self.scale_output:
                prediction_RNN -= self.min_preds
                prediction_RNN /= (self.max_preds - self.min_preds)
                prediction_RNN = np.clip(prediction_RNN, 0, 1)
                prediction_RNN = [(x * [
                    1 if c == self.majorityclass else 0.9999
                    for c in range(0, self.num_output)
                ]) if np.sum(x) == 4 else x for x in prediction_RNN]
            avg_err_RNN.append(
                self.compute_cost_RNN([test_samples[i]], label_test[i]))

            for j in range(0, len(label_test[i])):
                p_count += 1

                classes.append(label_test[i][j].tolist())
                predictions_RNN.append(prediction_RNN[j].tolist())

        predictions_RNN = np.round(predictions_RNN, 3).tolist()

        actual = []
        pred_RNN = []
        cor_RNN = []

        # get the percent correct for the predictions
        # how often the prediction is right when it is made
        for i in range(0, len(predictions_RNN)):
            c = classes[i].index(max(classes[i]))
            actual.append(c)

            p_RNN = predictions_RNN[i].index(max(predictions_RNN[i]))
            pred_RNN.append(p_RNN)
            cor_RNN.append(int(c == p_RNN))

        # calculate a naive baseline using averages
        flattened_label = []
        for i in range(0, len(label_test)):
            for j in range(0, len(label_test[i])):
                flattened_label.append(label_test[i][j])
        flattened_label = np.array(flattened_label)
        avg_class_pred = np.mean(flattened_label, 0)

        print "Predicting:", avg_class_pred, "for baseline*"
        for i in range(0, len(flattened_label)):
            res = RNN.AverageCrossEntropy(np.array(avg_class_pred),
                                          np.array(classes[i]))
            avg_class_err.append(res)
            # res = RNN.AverageCrossEntropy(np.array(predictions_RNN[i]), np.array(classes[i]))
            # avg_err_RNN.append(res)
        print "*This is calculated from the TEST labels"

        from sklearn.metrics import roc_auc_score, f1_score
        from skll.metrics import kappa

        kpa = []
        auc = []
        f1s = []
        apr = []
        t_pred = du.transpose(predictions_RNN)
        t_lab = du.transpose(flattened_label)

        for i in range(0, len(t_lab)):
            #if i == 0 or i == 3:
            #    t_pred[i] = du.normalize(t_pred[i],method='max')
            temp_p = [round(j) for j in t_pred[i]]

            kpa.append(kappa(t_lab[i], t_pred[i]))
            apr.append(du.Aprime(t_lab[i], t_pred[i]))
            auc.append(roc_auc_score(t_lab[i], t_pred[i]))

            if np.nanmax(temp_p) == 0:
                f1s.append(0)
            else:
                f1s.append(f1_score(t_lab[i], temp_p))

        if label_names is None or len(label_names) != len(t_lab):
            label_names = []
            for i in range(0, len(t_lab)):
                label_names.append("Label " + str(i + 1))

        RNN.print_label_distribution(label_test, label_names)

        self.eval_metrics = [
            np.nanmean(avg_err_RNN),
            np.nanmean(auc),
            np.nanmean(kpa),
            np.nanmean(f1s),
            np.nanmean(cor_RNN) * 100
        ]

        print "\nBaseline Average Cross-Entropy:", "{0:.4f}".format(
            np.nanmean(avg_class_err))
        print "\nNetwork Performance:"
        print "Average Cross-Entropy:", "{0:.4f}".format(
            np.nanmean(avg_err_RNN))
        print "AUC:", "{0:.4f}".format(np.nanmean(auc))
        print "A':", "{0:.4f}".format(np.nanmean(apr))
        print "Kappa:", "{0:.4f}".format(np.nanmean(kpa))
        print "F1 Score:", "{0:.4f}".format(np.nanmean(f1s))
        print "Percent Correct:", "{0:.2f}%".format(np.nanmean(cor_RNN) * 100)

        print "\n{:<15}".format("  Label"), \
            "{:<9}".format("  AUC"), \
            "{:<9}".format("  A'"), \
            "{:<9}".format("  Kappa"), \
            "{:<9}".format("  F Stat"), \
            "\n=============================================="

        for i in range(0, len(t_lab)):
            print "{:<15}".format(label_names[i]), \
                "{:<9}".format("  {0:.4f}".format(auc[i])), \
                "{:<9}".format("  {0:.4f}".format(apr[i])), \
                "{:<9}".format("  {0:.4f}".format(kpa[i])), \
                "{:<9}".format("  {0:.4f}".format(f1s[i]))
        print "\n=============================================="

        print "Confusion Matrix:"
        actual = []
        predicted = []
        flattened_label = flattened_label.tolist()
        for i in range(0, len(predictions_RNN)):
            actual.append(flattened_label[i].index(max(flattened_label[i])))
            predicted.append(predictions_RNN[i].index(max(predictions_RNN[i])))

        from sklearn.metrics import confusion_matrix
        conf_mat = confusion_matrix(actual, predicted)
        for cm in conf_mat:
            cm_row = "\t"
            for element in cm:
                cm_row += "{:<6}".format(element)
            print cm_row
        print "\n=============================================="

        return predictions_RNN
Example 22
    def predict(self, test):
        test_cpy = list(test)
        if not du.len_deepest(test_cpy) == self.num_input:
            if self.covariates is not None:
                for a in range(0, len(test_cpy)):
                    if type(test_cpy[a]) is not list:
                        test_cpy[a] = test_cpy[a].tolist()
                    for e in range(0, len(test[a])):
                        c = []
                        for i in range(0, len(self.covariates)):
                            c.append(test_cpy[a][e][self.covariates[i]])
                        test_cpy[a][e] = c

        if len(self.cov_mean) == 0 or len(self.cov_stdev) == 0:
            print "Scaling factors have not been generated: calculating using test sample"
            t_tr = du.transpose(RNN.flatten_sequence(test_cpy))
            self.cov_mean = []
            self.cov_stdev = []

            for a in range(0, len(t_tr)):
                mn = np.nanmean(t_tr[a])
                sd = np.nanstd(t_tr[a])
                self.cov_mean.append(mn)
                self.cov_stdev.append(sd)

        test_samples = []

        import math
        for a in range(0, len(test_cpy)):
            sample = []
            for e in range(0, len(test_cpy[a])):
                covariates = []
                for i in range(0, len(test_cpy[a][e])):
                    cov = 0
                    if self.cov_stdev[i] == 0:
                        cov = 0
                    else:
                        cov = (test_cpy[a][e][i] -
                               self.cov_mean[i]) / self.cov_stdev[i]

                    if math.isnan(cov) or math.isinf(cov):
                        cov = 0

                    covariates.append(cov)
                sample.append(covariates)
            test_samples.append(sample)

        if self.scale_output:
            print "Scaling output..."

        predictions_RNN = []
        for i in range(0, len(test_samples)):
            # get the prediction and calculate cost
            prediction_RNN = self.pred_RNN([test_samples[i]])
            if self.scale_output:
                prediction_RNN -= self.min_preds
                prediction_RNN /= (self.max_preds - self.min_preds)
                prediction_RNN = np.clip(prediction_RNN, 0, 1)

                prediction_RNN = [(x * [
                    1 if c == self.majorityclass else 0.9999
                    for c in range(0, self.num_output)
                ]) if np.sum(x) == 4 else x for x in prediction_RNN]

            for j in range(0, len(prediction_RNN)):
                predictions_RNN.append(prediction_RNN[j].tolist())

        predictions_RNN = np.round(predictions_RNN, 3).tolist()

        return predictions_RNN
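
The scale_output branch rescales each raw prediction by per-class extrema recorded elsewhere in the model; below is a self-contained sketch of just that min-max step, with min_preds and max_preds values invented for illustration:

import numpy as np

min_preds = np.array([0.05, 0.10, 0.02, 0.08])   # hypothetical per-class minima
max_preds = np.array([0.90, 0.85, 0.95, 0.80])   # hypothetical per-class maxima

raw = np.array([[0.50, 0.20, 0.10, 0.40]])       # one raw network output row
scaled = np.clip((raw - min_preds) / (max_preds - min_preds), 0, 1)
print(np.round(scaled, 3))
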
Example 23
def run():
    # Obtain current date
    current_date = datetime.date.today().strftime('%Y-%m-%d')

    # Get top level directory
    top_level_directory = get_top_level_directory_path()

    # Load configuration files for program and Twitter authentication
    main_config = cu.Config(top_level_directory +
                            '/src/twitter_updates/TwitterUpdateConfig.dat')
    auth_config = cu.Config(top_level_directory +
                            main_config.get_value('TwitterAuth'))

    # Remove any old files from /res/raw_images
    clean_dir(main_config.get_value('RawImages'))

    # Authenticate Twitter API session
    twitter_session = TwitterAPISession(auth_config)

    # Store values read by OCR algorithm in a dictionary
    input_data = {
        'Date' : current_date,
        'Cases' : 0,
        'Deaths' : 0,
        'Tests' : 0,
        'Recovered' : 0,
        'Hospitalized' : 0,
        'Cases24H' : 0
    }

    # Remove any old files from /res/raw_images
    clean_dir(main_config.get_value('RawImages'))

    # Open temporary command line to check if data is correct
    check_data_menu(input_data)

    # Load simple Peru data set
    PER_data = du.Table('l',
                        filename=top_level_directory +
                        main_config.get_value('PeruSimpleData'))

    # Aggregate new data entry
    PER_data.append_entry({
        'Fecha': input_data['Date'],
        'Casos': int(input_data['Cases']),
        'Fallecidos': int(input_data['Deaths']),
        'Pruebas': int(input_data['Tests']),
        'Recuperados': int(input_data['Recovered']),
        'Hospitalizados': int(input_data['Hospitalized'])
    })

    # Save simple Peru data set
    PER_data.save_as_csv(top_level_directory +
                         main_config.get_value('PeruSimpleData'))

    # Create copy of simple Peru data set to perform extrapolation
    PER_full_data = du.Table('c', table=PER_data)

    # Compute new derived statistics
    PER_full_data.compute_add_column(['Casos'], compute_new_cases,
                                     'NuevosCasos')
    PER_full_data.compute_add_column(['Casos'], compute_cases_growth_factor,
                                     '%DifCasos')
    PER_full_data.compute_add_column(['Casos', 'Recuperados', 'Fallecidos'],
                                     compute_active_cases, 'CasosActivos')
    PER_full_data.compute_add_column(['CasosActivos'],
                                     compute_new_active_cases,
                                     'NuevosCasosActivos')
    PER_full_data.compute_add_column(['Fallecidos'], compute_new_deaths,
                                     'NuevosFallecidos')
    PER_full_data.compute_add_column(['Fallecidos'],
                                     compute_deaths_growth_factor,
                                     '%DifFallecidos')
    PER_full_data.compute_add_column(['Casos', 'Fallecidos'],
                                     compute_case_fatality_rate,
                                     'TasaLetalidad')
    PER_full_data.compute_add_column(['Pruebas'], compute_new_tests,
                                     'NuevasPruebas')
    PER_full_data.compute_add_column(['Pruebas'], compute_tests_growth_factor,
                                     '%DifPruebas')
    PER_full_data.compute_add_column(['NuevasPruebas', 'NuevosCasos'],
                                     compute_daily_positivity_rate,
                                     '%PruebasPositivasDiarias')
    PER_full_data.compute_add_column(['Recuperados'], compute_new_recovered,
                                     'NuevosRecuperados')
    PER_full_data.compute_add_column(['Recuperados'],
                                     compute_tests_growth_factor,
                                     '%DifRecuperados')
    PER_full_data.compute_add_column(['Hospitalizados'],
                                     compute_new_hospitalized,
                                     'NuevosHospitalizados')
    PER_full_data.compute_add_column(['Hospitalizados'],
                                     compute_hospitalized_growth_factor,
                                     '%DifHospitalizados')
    PER_full_data.compute_add_column([], compute_days, 'Dia')

    # Reorganize header index before saving
    new_header = {
        0: 'Fecha',
        1: 'Dia',
        2: 'Casos',
        3: 'NuevosCasos',
        4: '%DifCasos',
        5: 'CasosActivos',
        6: 'NuevosCasosActivos',
        7: 'Fallecidos',
        8: 'NuevosFallecidos',
        9: '%DifFallecidos',
        10: 'TasaLetalidad',
        11: 'Pruebas',
        12: 'NuevasPruebas',
        13: '%DifPruebas',
        14: '%PruebasPositivasDiarias',
        15: 'Recuperados',
        16: 'NuevosRecuperados',
        17: '%DifRecuperados',
        18: 'Hospitalizados',
        19: 'NuevosHospitalizados',
        20: '%DifHospitalizados'
    }

    # Rearrange header index in Peru full data
    PER_full_data.rearrange_header_index(new_header)

    # Save full Peru data set
    PER_full_data.save_as_csv(top_level_directory +
                              main_config.get_value('PeruFullData'))

    # Create quadplot object for first tweet
    quadplot_1 = pu.QuadPlot(
        [
            main_config.get_value('CasesColor'),
            main_config.get_value('CasesColor'),
            main_config.get_value('RecoveredColor'),
            main_config.get_value('HospitalizedColor')
        ], [
            'Casos Confirmados (ultimos 30 dias)',
            'Nuevos Casos Confirmados (ultimos 30 dias)',
            'Nuevos Recuperados (ultimos 30 dias)',
            'Hospitalizados (ultimos 30 dias)'
        ], [False, True, True, True], ['bar', 'bar', 'bar', 'bar'], [
            'Fecha (YYYY-MM-DD)', 'Fecha (YYYY-MM-DD)', 'Fecha (YYYY-MM-DD)',
            'Fecha (YYYY-MM-DD)'
        ], [
            'Casos Confirmados (acumulado por dia)',
            'Nuevos Casos Confirmados (por dia)',
            'Nuevos Recuperados (por dia)', 'Hospitalizados (por dia)'
        ], [
            PER_full_data.get_column('Fecha')[-30:],
            PER_full_data.get_column('Fecha')[-30:],
            PER_full_data.get_column('Fecha')[-30:],
            PER_full_data.get_column('Fecha')[-30:]
        ], [
            PER_full_data.get_column('Casos')[-30:],
            PER_full_data.get_column('NuevosCasos')[-30:],
            PER_full_data.get_column('NuevosRecuperados')[-30:],
            PER_full_data.get_column('Hospitalizados')[-30:]
        ],
        current_date +
        ' | Elaborado por Kurt Manrique-Nino | Datos del Ministerio de Salud del Peru (@Minsa_Peru)',
        top_level_directory + main_config.get_value('TwitterGraph1'),
        ravg_days=[7, 7, 7, 7],
        ravg_labels=[
            'Promedio ultimos 7 dias', 'Promedio ultimos 7 dias',
            'Promedio ultimos 7 dias', 'Promedio ultimos 7 dias'
        ],
        ravg_ydata=[
            None,
            PER_full_data.get_column('NuevosCasos'),
            PER_full_data.get_column('NuevosRecuperados'),
            PER_full_data.get_column('Hospitalizados')
        ])

    # Create quadplot object for second tweet
    quadplot_2 = pu.QuadPlot(
        [
            main_config.get_value('DeathsColor'),
            main_config.get_value('DeathsColor'),
            main_config.get_value('TestsColor'),
            main_config.get_value('TestsColor')
        ], [
            'Nuevos Fallecidos (ultimos 30 dias)',
            'Tasa de Letalidad (ultimos 30 dias)',
            'Nuevas Pruebas (PM+PR+AG) (ultimos 30 dias)',
            'Positividad Diaria (PM+PR+AG) (ultimos 30 dias)'
        ], [True, True, True, True], ['bar', 'scatter', 'bar', 'scatter'], [
            'Fecha (YYYY-MM-DD)', 'Fecha (YYYY-MM-DD)', 'Fecha (YYYY-MM-DD)',
            'Fecha (YYYY-MM-DD)'
        ], [
            'Nuevos Fallecidos (por dia)',
            'Tasa de Letalidad (acumulado por dia)',
            'Nuevas Pruebas (por dia)', 'Positividad Diaria * 100% (PM+PR+AG)'
        ], [
            PER_full_data.get_column('Fecha')[-30:],
            PER_full_data.get_column('Fecha')[-30:],
            PER_full_data.get_column('Fecha')[-30:],
            PER_full_data.get_column('Fecha')[-30:]
        ], [
            PER_full_data.get_column('NuevosFallecidos')[-30:],
            PER_full_data.get_column('TasaLetalidad')[-30:],
            PER_full_data.get_column('NuevasPruebas')[-30:],
            PER_full_data.get_column('%PruebasPositivasDiarias')[-30:]
        ],
        current_date +
        ' | Elaborado por Kurt Manrique-Nino | Datos del Ministerio de Salud del Peru (@Minsa_Peru)',
        top_level_directory + main_config.get_value('TwitterGraph2'),
        ravg_days=[7, 7, 7, 7],
        ravg_labels=[
            'Promedio ultimos 7 dias', 'Promedio ultimos 7 dias',
            'Promedio ultimos 7 dias', 'Promedio ultimos 7 dias'
        ],
        ravg_ydata=[
            PER_full_data.get_column('NuevosFallecidos'),
            PER_full_data.get_column('TasaLetalidad'),
            PER_full_data.get_column('NuevasPruebas'),
            PER_full_data.get_column('%PruebasPositivasDiarias')
        ])

    # Generate and store quadplot
    quadplot_1.export()

    # Generate and store quadplot
    quadplot_2.export()

    # Obtain the last entry of Peru full data
    latest_entry = PER_full_data.get_latest_entry()

    # Create instances of tweets to store text and image paths
    tweet1 = Tweet()
    tweet2 = Tweet()

    # Create and add tweet body for first tweet
    tweet1.set_message(
        generate_first_tweet_text(
            top_level_directory + main_config.get_value('TwTemplate1'),
            latest_entry, int(input_data['Cases24H'])))

    # Create and add tweet body for second tweet
    tweet2.set_message(
        generate_second_tweet_text(
            top_level_directory + main_config.get_value('TwTemplate2'),
            latest_entry,
            PER_full_data.col_row_query('TasaLetalidad',
                                        PER_full_data.rows - 2),
            PER_full_data.col_row_query('%PruebasPositivasDiarias',
                                        PER_full_data.rows - 2)))

    # Add paths to graph images
    tweet1.add_image(top_level_directory +
                     main_config.get_value('TwitterGraph1'))
    tweet2.add_image(top_level_directory +
                     main_config.get_value('TwitterGraph2'))

    # Export tweet messages into a file
    export_tweets_to_file(
        top_level_directory + main_config.get_value('TweetExport'),
        [tweet1, tweet2])

    # Reply to @Minsa_Peru with tweet thread
    twitter_session.send_thread([tweet1, tweet2])

    # Update GitHub repository with new data
    if (sys.platform == 'win32'):
        update_git_repo_win32(input_data['Date'])
    else:
        update_git_repo_linux(input_data['Date'])
Example 24
def load_skill_data(data_filename, prereq_file, nolink_file):
    print "Loading Data..."
    data, headers = du.loadCSVwithHeaders(data_filename)
    prereqs = du.loadCSV(prereq_file)
    nolink = du.loadCSV(nolink_file)

    samples = []
    labels = []

    for i in range(0, len(headers)):
        print '{:>2}:  {:<18} {:<12}'.format(str(i), headers[i], data[0][i])

    print "Hierarchy Structure:"
    for p in prereqs:
        print p[0], '->', p[1]

    students = du.unique(du.transpose(data)[2])
    for i in range(0, len(students)):
        student_set = du.select(data, students[i], '==', 2)

        for p in prereqs:
            post = du.select(student_set, p[1], '==', 0)
            if not len(post) == 0:

                post = post[0]
                pre = du.select(du.select(student_set, p[0], '==', 0), post[1], '<', 1)
                rem = du.select(du.select(student_set, p[0], '==', 0), post[1], '>', 1)

                if not (len(pre) == 0 or len(rem) == 0):
                    pre = pre[0]
                    rem = rem[0]

                    samp_pre = []
                    samp_post = []
                    samp_rem = []
                    for j in range(3, 8):
                        samp_pre.append(pre[j])
                        samp_post.append(post[j])
                        samp_rem.append(rem[j])

                    samples.append([samp_pre, samp_post, samp_rem, [p[0], p[1]]])
                    labels.append([1, 0, 0])

                    # print pre
                    # print post
                    # print rem
                    # print ' '

            post = du.select(student_set, p[0], '==', 0)
            if not len(post) == 0:
                post = post[-1]
                pre = du.select(du.select(student_set, p[1], '==', 0), post[1], '<', 1)
                rem = du.select(du.select(student_set, p[1], '==', 0), post[1], '>', 1)

                if not (len(pre) == 0 or len(rem) == 0):
                    pre = pre[0]
                    rem = rem[0]

                    samp_pre = []
                    samp_post = []
                    samp_rem = []
                    for j in range(3, 8):
                        samp_pre.append(pre[j])
                        samp_post.append(post[j])
                        samp_rem.append(rem[j])

                    samples.append([samp_pre, samp_post, samp_rem, [p[1], p[0]]])
                    labels.append([0, 0, 1])

                    # print pre
                    # print post
                    # print rem
                    # print ' '
        for p in nolink:
            post = du.select(student_set, p[1], '==', 0)
            if not len(post) == 0:
                post = post[0]
                pre = du.select(du.select(student_set, p[0], '==', 0), post[1], '<', 1)
                rem = du.select(du.select(student_set, p[0], '==', 0), post[1], '>', 1)

                if not (len(pre) == 0 or len(rem) == 0):
                    pre = pre[0]
                    rem = rem[0]

                    samp_pre = []
                    samp_post = []
                    samp_rem = []
                    for j in range(3, 8):
                        samp_pre.append(pre[j])
                        samp_post.append(post[j])
                        samp_rem.append(rem[j])

                    samples.append([samp_pre, samp_post, samp_rem, [p[0], p[1]]])
                    labels.append([0, 1, 0])

                    # print pre
                    # print post
                    # print rem
                    # print ' '

# =================================================================

    if len(labels) == 0:
        print "\nNO USABLE SAMPLES EXIST"
        exit()

    du.print_label_distribution(labels, ['Prerequisite','Non-Link','Reversed'])
    samples,labels = du.shuffle(samples, labels)
    return samples,labels
Example 25
    def get_knowledge_effect(self):
        return du.MAX(
            np.random.normal(self.knowledge_effect, self.knowledge_effect_std),
            0)

    def get_speed_effect(self):
        return du.MAX(
            np.random.normal(self.speed_effect, self.speed_effect_std), 0)

    def get_hint_effect(self):
        return du.MAX(np.random.normal(self.hint_effect, self.hint_effect_std),
                      0)


if __name__ == "__main__":
    data, headers = du.loadCSVwithHeaders('filtered_data.csv')

    students = []
    num_students = 1000

    print "Generating data for", num_students, "students..."

    for i in range(0, num_students):
        index = du.rand(0, len(data))
        #self, hint, speed, knowledge, h_sd, s_sd, k_sd):
        students.append(
            Student(data[index][7], data[index][1], data[index][4],
                    data[index][8], data[index][2], data[index][5]))

    # difficulty is probability of correctness (higher is easier)
    A = Skill('A', 0.6, 0.05)
Example 26
def main():
    print("Program Start")
    headers = [
        "Data set", "layers", "pop", "Beta", "CR", "generations", "loss1",
        "loss2"
    ]
    filename = 'VIDEORESULTS.csv'

    Per = Performance.Results()
    Per.PipeToFile([], headers, filename)

    data_sets = [
        "soybean", "glass", "abalone", "Cancer", "forestfires", "machine"
    ]

    regression_data_set = {
        "soybean": False,
        "Cancer": False,
        "glass": False,
        "forestfires": True,
        "machine": True,
        "abalone": True
    }
    categorical_attribute_indices = {
        "soybean": [],
        "Cancer": [],
        "glass": [],
        "forestfires": [],
        "machine": [],
        "abalone": []
    }

    tuned_0_hl = {
        "soybean": {
            "omega": .5,
            "c1": .1,
            "c2": 5,
            "hidden_layer": []
        },
        "Cancer": {
            "omega": .5,
            "c1": .5,
            "c2": 5,
            "hidden_layer": []
        },
        "glass": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": []
        },
        "forestfires": {
            "omega": .2,
            "c1": 5,
            "c2": .5,
            "hidden_layer": []
        },
        "machine": {
            "omega": .5,
            "c1": .9,
            "c2": 5,
            "hidden_layer": []
        },
        "abalone": {
            "omega": .2,
            "c1": 5,
            "c2": .9,
            "hidden_layer": []
        }
    }

    tuned_1_hl = {
        "soybean": {
            "omega": .5,
            "c1": .5,
            "c2": 1,
            "hidden_layer": [7]
        },
        "Cancer": {
            "omega": .2,
            "c1": .5,
            "c2": 5,
            "hidden_layer": [4]
        },
        "glass": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": [8]
        },
        "forestfires": {
            "omega": .2,
            "c1": 5,
            "c2": 5,
            "hidden_layer": [8]
        },
        "machine": {
            "omega": .5,
            "c1": 5,
            "c2": .5,
            "hidden_layer": [4]
        },
        "abalone": {
            "omega": .2,
            "c1": .1,
            "c2": 5,
            "hidden_layer": [8]
        }
    }

    tuned_2_hl = {
        "soybean": {
            "omega": .5,
            "c1": .9,
            "c2": .1,
            "hidden_layer": [7, 12]
        },
        "Cancer": {
            "omega": .2,
            "c1": .5,
            "c2": 5,
            "hidden_layer": [4, 4]
        },
        "glass": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": [8, 6]
        },
        "forestfires": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": [8, 8]
        },
        "machine": {
            "omega": .2,
            "c1": .9,
            "c2": .1,
            "hidden_layer": [7, 2]
        },
        "abalone": {
            "omega": .2,
            "c1": 5,
            "c2": 5,
            "hidden_layer": [6, 8]
        }
    }
    du = DataUtility.DataUtility(categorical_attribute_indices,
                                 regression_data_set)
    total_counter = 0
    for data_set in data_sets:
        if data_set != 'Cancer':
            continue
        data_set_counter = 0
        # ten fold data and labels is a list of [data, labels] pairs, where
        # data and labels are numpy arrays:
        tenfold_data_and_labels = du.Dataset_and_Labels(data_set)

        for j in range(10):
            test_data, test_labels = copy.deepcopy(tenfold_data_and_labels[j])
            #Append all data folds to the training data set
            remaining_data = [
                x[0] for i, x in enumerate(tenfold_data_and_labels) if i != j
            ]
            remaining_labels = [
                y[1] for i, y in enumerate(tenfold_data_and_labels) if i != j
            ]
            #Store off a set of the remaining dataset
            X = np.concatenate(remaining_data, axis=1)
            #Store the remaining data set labels
            labels = np.concatenate(remaining_labels, axis=1)
            print(data_set, "training data prepared")
            regression = regression_data_set[data_set]
            #If the data set is a regression dataset
            if regression == True:
                #The number of output nodes is 1
                output_size = 1
            #else it is a classification data set
            else:
                #Count the number of classes in the label data set
                output_size = du.CountClasses(labels)
                #Get the test data labels in one hot encoding
                test_labels = du.ConvertLabels(test_labels, output_size)
                #Get the Labels into a One hot encoding
                labels = du.ConvertLabels(labels, output_size)

            input_size = X.shape[0]

            data_set_size = X.shape[1] + test_data.shape[1]

            tuned_parameters = [
                tuned_0_hl[data_set], tuned_1_hl[data_set],
                tuned_2_hl[data_set]
            ]
            for z in range(1):
                hidden_layers = tuned_parameters[z]["hidden_layer"]

                layers = [input_size] + hidden_layers + [output_size]

                nn = NeuralNetwork(input_size, hidden_layers, regression,
                                   output_size)
                nn.set_input_data(X, labels)
                nn1 = NeuralNetwork(input_size, hidden_layers, regression,
                                    output_size)
                nn1.set_input_data(X, labels)
                nn2 = NeuralNetwork(input_size, hidden_layers, regression,
                                    output_size)
                nn2.set_input_data(X, labels)

                total_weights = 0
                for i in range(len(layers) - 1):
                    total_weights += layers[i] * layers[i + 1]

                hyperparameters = {
                    "population_size": 10 * total_weights,
                    "beta": .5,
                    "crossover_rate": .6,
                    "max_gen": 100
                }
                hyperparameterss = {
                    "maxGen": 100,
                    "pop_size": 100,
                    "mutation_rate": .5,
                    "mutation_range": 10,
                    "crossover_rate": .5
                }
                hyperparametersss = {
                    "position_range": 10,
                    "velocity_range": 1,
                    "omega": .1,
                    # tuned_parameters[z]["omega"],
                    "c1": .9,
                    # tuned_parameters[z]["c1"],
                    "c2": .1,
                    # tuned_parameters[z]["c2"],
                    "vmax": 1,
                    "pop_size": 1000,
                    "max_t": 50
                }
                de = DE.DE(hyperparameters, total_weights, nn)
                ga = GA.GA(hyperparameterss, total_weights, nn1)
                pso = PSO.PSO(layers, hyperparametersss, nn2)
                learning_rate = 3
                momentum = 0
                VNN = VideoNN.NeuralNetworks(input_size, hidden_layers,
                                             regression, output_size,
                                             learning_rate, momentum)
                VNN.set_input_data(X, labels)

                for gen in range(de.maxgens):
                    de.mutate_and_crossover()

                for gen in range(ga.maxGen):
                    ga.fitness()
                    ga.selection()
                    ga.crossover()

                counter = 0
                for epoch in range(pso.max_t):
                    pso.update_fitness()
                    pso.update_position_and_velocity()

                for epoch in range(100):
                    VNN.forward_pass()
                    VNN.backpropagation_pass()

                bestSolution = de.bestChromie.getchromie()
                bestWeights = de.nn.weight_transform(bestSolution)
                de.nn.weights = bestWeights

                Estimation_Values = de.nn.classify(test_data, test_labels)
                Estimation_Values1 = ga.nn.classify(test_data, test_labels)
                Estimation_Values2 = pso.NN.classify(test_data, test_labels)
                Estimation_Values3 = VNN.classify(test_data, test_labels)
                if regression == False:
                    #Decode the One Hot encoding Value
                    Estimation_Values = de.nn.PickLargest(Estimation_Values)
                    test_labels_list = de.nn.PickLargest(test_labels)
                    Estimation_Values1 = ga.nn.PickLargest(Estimation_Values1)
                    Tll = ga.nn.PickLargest(test_labels)
                    Estimation_Values2 = pso.NN.PickLargest(Estimation_Values2)
                    tll1 = pso.NN.PickLargest(test_labels)
                    Estimation_Values3 = VNN.PickLargest(Estimation_Values3)
                    tll = VNN.PickLargest(test_labels)

                    # print("ESTiMATION VALUES BY GIVEN INDEX (CLASS GUESS) ")
                    # print(Estimation_Values)
                else:
                    Estimation_Values = Estimation_Values.tolist()
                    test_labels_list = test_labels.tolist()[0]
                    Estimation_Values = Estimation_Values[0]

                Estimat = Estimation_Values
                groun = test_labels_list

                meta = list()
                Nice = Per.ConvertResultsDataStructure(groun, Estimat)
                Nice1 = Per.ConvertResultsDataStructure(
                    Tll, Estimation_Values1)
                Nice2 = Per.ConvertResultsDataStructure(
                    tll1, Estimation_Values2)
                Nice3 = Per.ConvertResultsDataStructure(
                    tll, Estimation_Values3)
                DEss = Per.StartLossFunction(regression, Nice, meta)
                GAss = Per.StartLossFunction(regression, Nice1, meta)
                PSOSS = Per.StartLossFunction(regression, Nice2, meta)
                VNNS = Per.StartLossFunction(regression, Nice3, meta)
                print("DE")
                print(DEss)
                print("GA")
                print(GAss)
                print("PSO")
                print(PSOSS)
                print("NN Back prop.")
                print(VNNS)

                # print("THE GROUND VERSUS ESTIMATION:")
                # print(Nice)

                # headers = ["Data set", "layers", "pop", "Beta", "CR", "generations", "loss1", "loss2"]
                Meta = [
                    data_set,
                    len(hidden_layers), hyperparameters["population_size"],
                    hyperparameters["beta"], hyperparameters["crossover_rate"],
                    hyperparameters["max_gen"]
                ]

                Per.StartLossFunction(regression, Nice, Meta, filename)
                data_set_counter += 1
                total_counter += 1

    print("Program End ")
Esempio n. 27
0
    def train(self, training, output=None):
        if output is None:
            output = training

        assert len(training) == len(output)

        self.num_input = du.len_deepest(training)
        self.num_output = du.len_deepest(output)

        training = du.transpose(training)
        output = du.transpose(output)

        for i in range(0,len(training)):
            training[i] = du.normalize(training[i])
            output[i] = du.normalize(output[i])

        training = du.transpose(training)
        output = du.transpose(output)

        if not self.isBuilt:
            self.build_network()

        print "Input Nodes:", self.num_input
        print "Output Nodes:", self.num_output

        # introduce cross-validation
        from sklearn.cross_validation import StratifiedKFold

        strat_label = []
        for i in range(0, len(training)):
            strat_label.append(1)

        skf = StratifiedKFold(strat_label, n_folds=self.num_folds)

        print"Number of Folds:", len(skf)

        print "Training Samples:", len(training)

        print("\nTraining AutoEncoder...")
        print "{:<9}".format("  Epoch"), \
            "{:<9}".format("  Train"), \
            "{:<9}".format("  Valid"), \
            "{:<9}".format("  Time"), \
            "\n======================================"
        start_time = time.clock()
        train_err = []
        val_err = []
        # for each epoch...
        for e in range(0, self.num_epochs):
            epoch_time = time.clock()
            epoch = 0
            eval = 0
            n_train = 0
            n_test = 0

            # train and test
            for ktrain, ktest in skf:
                for i in range(0, len(ktrain), self.batch_size):
                    batch_sample = []
                    batch_label = []
                    # create a batch of training samples
                    for j in range(i, min(len(ktrain), i + self.batch_size)):
                        #print training[ktrain[j]]
                        batch_sample.append(training[ktrain[j]])
                        batch_label.append(output[ktrain[j]])

                    # update and get the cost
                    #print batch_sample
                    #print self.get_output(batch_sample)
                    #print batch_label
                    epoch += self.train_network(batch_sample, batch_label)

                    n_train += 1

                sample = []
                label = []
                for i in range(0, len(ktest)):
                    sample.append(training[ktest[i]])
                    label.append(output[ktest[i]])
                n_test += 1
                eval += self.test_network(sample, label)

            train_err.append(epoch / n_train)
            val_err.append(eval / n_test)
            print "{:<11}".format("Epoch " + str(e + 1) + ":"), \
                "{:<9}".format("{0:.4f}".format(epoch / n_train)), \
                "{:<9}".format("{0:.4f}".format(eval / n_test)), \
                "{:<9}".format("{0:.1f}s".format(time.clock() - epoch_time))
        print "Total Training Time:", "{0:.1f}s".format(time.clock() - start_time)
Esempio n. 28
0
 def __init__(self, name, difficulty=0.5, difficulty_std=0.1):
     self.problems = []
     self.name = name
     for i in range(0, 10000):
         self.problems.append(
             du.clamp(np.random.normal(difficulty, difficulty_std), 0, 1))
Esempio n. 29
0
 def get_hint_effect(self):
     return du.MAX(np.random.normal(self.hint_effect, self.hint_effect_std),
                   0)
Esempio n. 30
0
 def next_problem(self):
     return self.problems[du.rand(0, len(self.problems))]
Esempio n. 31
0
 def get_knowledge_effect(self):
     return du.MAX(
         np.random.normal(self.knowledge_effect, self.knowledge_effect_std),
         0)
Esempio n. 32
0
            "cr": .8,
            "hidden_layer": [6, 8]
        }
    }
    ##############################################
    # START MULTIPROCESS JOB POOL
    ##############################################
    manager = multiprocessing.Manager()
    q = manager.Queue()
    writer = multiprocessing.Process(target=data_writer, args=(q, filename))
    writer.start()

    pool = multiprocessing.Pool()
    ##############################################

    du = DataUtility.DataUtility(categorical_attribute_indices,
                                 regression_data_set)
    total_counter = 0
    for data_set in data_sets:

        regression = regression_data_set[data_set]
        tuned_parameters = [
            tuned_0_hl[data_set], tuned_1_hl[data_set], tuned_2_hl[data_set]
        ]

        data_set_counter = 0
        # ten fold data and labels is a list of [data, labels] pairs, where
        # data and labels are numpy arrays:
        tenfold_data_and_labels = du.Dataset_and_Labels(data_set)

        for j in range(10):
            data_package = generate_data_package(
Esempio n. 33
0
 def get_speed_effect(self):
     return du.MAX(
         np.random.normal(self.speed_effect, self.speed_effect_std), 0)
Esempio n. 34
0
                if len(item_both_rated) == 0:
                    continue
                # Get the ratings of the items that both user a and user b rated
                user_a_ = [user_a[i] for i in item_both_rated]
                user_b_ = [user_b[i] for i in item_both_rated]
                # Compute the Pearson correlation coefficient over those co-rated items
                sim = np.corrcoef(user_a_, user_b_)[0, 1]
                # If the similarity exceeds the threshold, include this neighbour in the prediction
                if sim >= self.limit:
                    predict_up += sim * (user_b[testItem] - np.mean(user_b_))
                    predict_down += sim
        return 1 if predict_down == 0 else predict_up / predict_down + np.sum(
            user_a) / np.count_nonzero(user_a)


df_train = DataUtility.getTrainData()
# df_train = df_train.iloc[0:int(df_train.shape[0] / 10), 0:int(df_train.shape[1])]
# df_test = DataUtility.getTestData()
CF = collaborativeFiltering(df_train, 0.5)
test_data = []
user = list(df_train.index)
item = list(df_train.columns)
for i in range(int(df_train.shape[0] / 10)):
    for j in range(df_train.shape[1]):
        if df_train.iloc[i, j] == 0:
            continue
        else:
            test_data.append([user[i], item[j], df_train.iloc[i, j]])
            break
test_data = [[t[0], t[1], t[2]] for t in test_data if t[2] != 0]
p = CF.predict(test_data)
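
The predict method above follows the usual user-based collaborative filtering recipe: find items both users rated, score neighbours by Pearson correlation, and combine mean-centred neighbour deviations with user a's own average. Below is a self-contained sketch of that recipe on NumPy arrays; the function name and the handling of the no-neighbour case are illustrative and differ slightly from the class above.

import numpy as np

def predict_rating(user_a, neighbours, item, sim_threshold=0.5):
    # user_a: 1-D array of user a's ratings (0 = unrated)
    # neighbours: iterable of 1-D arrays with the other users' ratings
    num, den = 0.0, 0.0
    for user_b in neighbours:
        both = np.nonzero((user_a != 0) & (user_b != 0))[0]
        if len(both) < 2:
            continue  # need at least two co-rated items for a correlation
        sim = np.corrcoef(user_a[both], user_b[both])[0, 1]
        if np.isnan(sim) or sim < sim_threshold:
            continue
        num += sim * (user_b[item] - user_b[both].mean())
        den += sim
    baseline = user_a[user_a != 0].mean() if np.count_nonzero(user_a) else 0.0
    return baseline if den == 0 else baseline + num / den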
Esempio n. 35
0
        self.hint_effect_std = hint_std

        SkillLink.list.append(self)

    def get_knowledge_effect(self):
        return du.MAX(np.random.normal(self.knowledge_effect, self.knowledge_effect_std), 0)

    def get_speed_effect(self):
        return du.MAX(np.random.normal(self.speed_effect, self.speed_effect_std), 0)

    def get_hint_effect(self):
        return du.MAX(np.random.normal(self.hint_effect, self.hint_effect_std), 0)


if __name__ == "__main__":
    data, headers = du.loadCSVwithHeaders('filtered_data.csv')

    students = []
    num_students = 1000

    print "Generating data for", num_students, "students..."

    for i in range(0,num_students):
        index = du.rand(0,len(data))
        #self, hint, speed, knowledge, h_sd, s_sd, k_sd):
        students.append(Student(data[index][7],
                                data[index][1],
                                data[index][4],
                                data[index][8],
                                data[index][2],
                                data[index][5]))
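
Both driver blocks above start from du.loadCSVwithHeaders('filtered_data.csv'). That helper is not shown in these examples either; the sketch below is hypothetical, assuming an all-numeric CSV with a single header row.

import csv

def loadCSVwithHeaders(path):
    # Return (data, headers): the numeric rows and the header row of a CSV file.
    with open(path) as f:
        reader = csv.reader(f)
        headers = next(reader)
        data = [[float(cell) for cell in row] for row in reader]
    return data, headers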