def get_result(self, problem, pr_number, skillName, use_hints=True):
    """Simulate one attempt at a problem by this simulated student.

    Args:
        problem: problem difficulty in [0, 1]; inverted below (1-problem),
            so after inversion a larger value reads as "easier" --
            TODO confirm intended scale.
        pr_number: 1-based problem number; the hint probability decays
            with it (hint_p / pr_number).
        skillName: skill being practiced; completed prerequisite skills
            boost the knowledge/speed/hint parameters.
        use_hints: when True, taking a hint forces correctness to 0.

    Returns:
        [cor, time, hint] -- correctness (0/1), time spent (float),
        whether a hint was used (0/1).
    """
    # Additive boosts earned from completed prerequisite skill links.
    effects = self.get_prereq_effects(skillName)
    problem = 1-problem
    knowledge_p = self.knowledge + effects[0]
    # knowledge_p = problem + effects[0]
    speed_p = self.speed + effects[1]
    hint_p = self.hint + effects[2]
    # print [knowledge_p, self.knowledge_std]
    # Draw the student's effective answer quality, clamped to [0, 1].
    answer = du.clamp(np.random.normal(knowledge_p, self.knowledge_std), 0, 1)
    # Hint probability decays over consecutive problems; roll a die
    # out of 1000 to decide whether a hint is requested.
    pr_hint = hint_p / pr_number
    hint = int(du.diceRoll(1000) < (pr_hint*1000))
    cor = 0
    if answer > problem:
        # Ability beats difficulty: redraw near-certain correctness (mean 0.9).
        answer = du.clamp(np.random.normal(0.9, self.knowledge_std), 0, 1)
    else:
        # Scale correctness down -- the 0.3 constants look hand-tuned;
        # TODO confirm intended penalty curve.
        answer *= ((0.3-(problem-0.3))/0.3)
    # Roll for correctness; a hint (when hints are enabled) zeroes it out.
    cor = int(du.diceRoll(1000) < answer*1000) * (1-(hint*int(use_hints)))
    # Solve time scales with the (inverted) difficulty.
    time = du.clamp(np.random.normal(speed_p, self.speed_std), 0, 10000) * problem
    # Add hint-reading overhead only when a hint was taken.
    time += du.MAX(0, np.random.normal(Student.hint_time_offset, Student.hint_time_offset_std)) * hint
    return [cor, time, hint]
def generate_data_package(fold: int, tenfolds: list, regression: bool, du: DataUtility):
    """Assemble training/test arrays for one cross-validation fold.

    Fix: replaced the non-idiomatic `if regression == True:` comparison
    with a plain truth test; behavior is unchanged.

    Args:
        fold: index of the held-out test fold.
        tenfolds: list of [data, labels] pairs (numpy arrays, column-major:
            one sample per column).
        regression: True for regression data sets (single output node).
        du: DataUtility instance used for class counting / label encoding.

    Returns:
        [test_data, test_labels, training_data, training_labels,
         output_size, input_size]
    """
    # Held-out fold used for testing; deep-copied so callers' folds stay intact.
    test_data, test_labels = copy.deepcopy(tenfolds[fold])
    # Every other fold becomes training material.
    remaining_data = [x[0] for i, x in enumerate(copy.deepcopy(tenfolds)) if i != fold]
    remaining_labels = [y[1] for i, y in enumerate(copy.deepcopy(tenfolds)) if i != fold]
    # Samples are stored column-wise, so folds are joined along axis 1.
    training_data = np.concatenate(remaining_data, axis=1)
    training_labels = np.concatenate(remaining_labels, axis=1)
    if regression:
        # A regression network has a single output node.
        output_size = 1
    else:
        # One output node per class in the label set.
        output_size = du.CountClasses(training_labels)
    # One-hot encode the test and training labels.
    test_labels = du.ConvertLabels(test_labels, output_size)
    training_labels = du.ConvertLabels(training_labels, output_size)
    # Number of input nodes = number of feature rows.
    input_size = training_data.shape[0]
    return [test_data, test_labels, training_data, training_labels, output_size, input_size]
def compress_json_files():
    """Compress every raw JSON data file for all data-set types.

    Iterates training, test, and recorded raw files, skipping any file
    whose compressed counterpart already exists.

    Fix: corrected the typo "Finshed" in the completion message.
    """
    print("Compressing JSON-files")
    for data_set_type in [DataSetType.TRAINING, DataSetType.TEST, DataSetType.RECORDED]:
        path = DataUtility.get_data_set_path(DataSetFormat.RAW, data_set_type)
        raw_filelist = DataUtility.generate_file_list(path)
        for file in raw_filelist:
            # Skip work already done on a previous run.
            if Utility.is_file_already_compressed(file, data_set_type):
                continue
            Utility.compress_json_file(file, data_set_type)
    print("Finished compressing!")
def split_for_autoencoding(samples):
    """Split 3-part samples into (initial, remaining) input/output pairs.

    Each sample is indexed [0], [1], [2]; the middle element serves both
    as the initial-stage output and the remaining-stage input. All four
    collections are converted to floats before being returned.
    """
    initial_inputs = [sample[0] for sample in samples]
    initial_outputs = [sample[1] for sample in samples]
    remaining_inputs = [sample[1] for sample in samples]
    remaining_outputs = [sample[2] for sample in samples]
    return (du.convert_to_floats(initial_inputs),
            du.convert_to_floats(initial_outputs),
            du.convert_to_floats(remaining_inputs),
            du.convert_to_floats(remaining_outputs))
def compress_json_file(file, data_set_type):
    """Write a "compressed" copy of one raw JSON sensor file.

    Compression here means truncating each sensor's data table to a fixed
    per-sensor length; the result is written to the compressed data-set
    folder under the same filename.
    """
    print("Compressing file: " + file.filename)
    raw_data = get_json_data_from_file(file)
    compressed_data = {}
    # Sensor array names and their truncation lengths, kept in lockstep.
    json_array_name_list = [
        Constant.JSON_EMG_ARRAY_NAME, Constant.JSON_ACC_ARRAY_NAME,
        Constant.JSON_GYR_ARRAY_NAME, Constant.JSON_ORI_ARRAY_NAME
    ]
    data_length_list = [
        Constant.DATA_LENGTH_EMG, Constant.DATA_LENGTH_ACC,
        Constant.DATA_LENGTH_GYR, Constant.DATA_LENGTH_ORI
    ]
    for json_array_name, data_length in zip(json_array_name_list, data_length_list):
        compressed_data[json_array_name] = {}
        # if file.is_recorded:
        #     transposed_raw_data = numpy.transpose(raw_data[json_array_name][Constant.JSON_ARRAY_DATA_TABLE_NAME][:data_length]).tolist()
        # else:
        #     transposed_raw_data = raw_data[json_array_name][Constant.JSON_ARRAY_DATA_TABLE_NAME][:data_length]
        # NOTE(review): despite the variable name, no transpose happens on
        # this live path -- it is a plain truncation to data_length rows.
        transposed_raw_data = raw_data[json_array_name][
            Constant.JSON_ARRAY_DATA_TABLE_NAME][:data_length]
        compressed_data[json_array_name][
            Constant.JSON_ARRAY_DATA_TABLE_NAME] = transposed_raw_data
    # Mirror the original filename into the compressed data-set folder.
    compressed_file_path = DataUtility.get_data_set_path(
        DataSetFormat.COMPRESSED, data_set_type) + file.filename
    with open(compressed_file_path, 'w') as outfile:
        json.dump(compressed_data, outfile)
def load_unlabeled_data(filename, primary_column, secondary_column, covariate_columns, load_from_file=False):
    """Load a CSV and convert it into time-series sequences (Python 2).

    Args:
        filename: CSV path with a header row.
        primary_column / secondary_column: columns used by
            RNN.build_sequences to group rows into sequences.
        covariate_columns: feature columns to keep.
        load_from_file: when True, skips the CSV load.

    Returns:
        (pdata, labels, grouping) from RNN.build_sequences.

    NOTE(review): when load_from_file is True, `data` stays None and is
    passed to RNN.build_sequences -- presumably that function handles a
    cached/None input; confirm before relying on this path.
    """
    # load from file or rebuild dataset
    load = load_from_file
    data = None
    if not load:
        data, headers = du.loadCSVwithHeaders(filename)
        # Echo each header with a sample value from the first row.
        for i in range(0, len(headers)):
            print '{:>2}: {:<18} {:<12}'.format(str(i), headers[i], data[0][i])
    else:
        print 'Skipping dataset loading - using cached data instead'
    print '\ntransforming data to time series...'
    # [1, 2] selects the label columns -- TODO confirm against RNN.build_sequences.
    pdata, labels, grouping = RNN.build_sequences(data, primary_column,
                                                  secondary_column,
                                                  covariate_columns, [1, 2])
    print '\nDataset Info:'
    print 'number of samples:', len(pdata)
    print 'sequence length of first sample:', len(pdata[0])
    print 'input nodes: ', len(pdata[0][0])
    return pdata, labels, grouping
def get_label_distribution(labels):
    """Return the positive rate of each label column.

    Sequences are flattened to one row per time step, transposed into
    per-label columns, and each column's nan-aware sum is divided by its
    length.
    """
    columns = du.transpose(RNN.flatten_sequence(labels))
    return [float(np.nansum(np.array(column))) / len(column)
            for column in columns]
def create_json_file(self, filename):
    """Dump this object's sensor data into a raw recorded JSON file.

    Builds one JSON object per sensor (keyed by its array name, each
    holding a data table), writes it into the RAW/RECORDED folder, and
    returns a DataUtility.File handle for the new file.
    """
    print("Creating file:", filename)
    json_data = {}
    for sensor in range(Sensor.NUMBER_OF_SENSORS):
        json_array_name = Utility.get_json_array_name_for_sensor(sensor)
        json_data_table_name = Constant.JSON_ARRAY_DATA_TABLE_NAME
        json_data[json_array_name] = {}
        json_data[json_array_name][
            json_data_table_name] = self.get_sensor_data(sensor)
    folder_path = DataUtility.get_data_set_path(DataSetFormat.RAW,
                                                DataSetType.RECORDED)
    with open(folder_path + filename, 'w') as outfile:
        json.dump(json_data, outfile)
    # Wrap the new file in the project's File abstraction for callers.
    o_file = DataUtility.File(folder_path, filename, None)
    return o_file
def get_prereq_effects(self, skillName):
    """Sum knowledge/speed/hint boosts from completed prerequisite links.

    Scans every SkillLink whose postrequisite is skillName and whose
    prerequisite this student has already completed, accumulating the
    three effect values.

    Returns:
        [knowledge_effect, speed_effect, hint_effect]
    """
    totals = [0, 0, 0]
    for link in SkillLink.list:
        if link.postreq != skillName:
            continue
        if not du.exists(link.prereq, self.completed_assignments):
            continue
        totals[0] += link.get_knowledge_effect()
        totals[1] += link.get_speed_effect()
        totals[2] += link.get_hint_effect()
    return totals
def get_result(self, problem, pr_number, skillName, use_hints=True):
    """Simulate one attempt at a problem by this simulated student.

    (Duplicate of the earlier get_result; kept byte-identical in code.)

    Args:
        problem: problem difficulty in [0, 1]; inverted below (1 - problem).
        pr_number: 1-based problem number; hint probability decays with it.
        skillName: skill being practiced; completed prerequisites boost
            knowledge/speed/hint parameters.
        use_hints: when True, taking a hint forces correctness to 0.

    Returns:
        [cor, time, hint] -- correctness (0/1), time spent, hint used (0/1).
    """
    # Additive boosts earned from completed prerequisite skill links.
    effects = self.get_prereq_effects(skillName)
    problem = 1 - problem
    knowledge_p = self.knowledge + effects[0]
    # knowledge_p = problem + effects[0]
    speed_p = self.speed + effects[1]
    hint_p = self.hint + effects[2]
    # print [knowledge_p, self.knowledge_std]
    # Draw the student's effective answer quality, clamped to [0, 1].
    answer = du.clamp(np.random.normal(knowledge_p, self.knowledge_std), 0,
                      1)
    # Hint probability decays over consecutive problems (die roll of 1000).
    pr_hint = hint_p / pr_number
    hint = int(du.diceRoll(1000) < (pr_hint * 1000))
    cor = 0
    if answer > problem:
        # Ability beats difficulty: redraw near-certain correctness.
        answer = du.clamp(np.random.normal(0.9, self.knowledge_std), 0, 1)
    else:
        # Hand-tuned penalty curve -- TODO confirm the 0.3 constants.
        answer *= ((0.3 - (problem - 0.3)) / 0.3)
    # Roll for correctness; a hint (when enabled) zeroes it out.
    cor = int(
        du.diceRoll(1000) < answer * 1000) * (1 - (hint * int(use_hints)))
    # Solve time scales with the (inverted) difficulty.
    time = du.clamp(np.random.normal(speed_p, self.speed_std), 0,
                    10000) * problem
    # Add hint-reading overhead only when a hint was taken.
    time += du.MAX(
        0,
        np.random.normal(Student.hint_time_offset,
                         Student.hint_time_offset_std)) * hint
    return [cor, time, hint]
def get_prereq_effects(self, skillName):
    """Sum knowledge/speed/hint boosts from completed prerequisite links.

    A link counts when its postrequisite is skillName and its
    prerequisite appears in this student's completed assignments.

    Returns:
        [knowledge_effect, speed_effect, hint_effect]
    """
    relevant_links = [
        sk for sk in SkillLink.list
        if sk.postreq == skillName
        and du.exists(sk.prereq, self.completed_assignments)
    ]
    knowledge_effect = sum(sk.get_knowledge_effect() for sk in relevant_links)
    speed_effect = sum(sk.get_speed_effect() for sk in relevant_links)
    hint_effect = sum(sk.get_hint_effect() for sk in relevant_links)
    return [knowledge_effect, speed_effect, hint_effect]
def generate_data_package(fold: int, tenfolds: list, regression: bool, du: DataUtility):
    """Assemble training/test arrays for one cross-validation fold.

    Fix: replaced the non-idiomatic `if regression == True:` comparison
    with a plain truth test; behavior is unchanged.

    Args:
        fold: index of the held-out test fold.
        tenfolds: list of [data, labels] pairs (numpy arrays, one sample
            per column).
        regression: True for regression data sets (single output node).
        du: DataUtility instance used for class counting / label encoding.

    Returns:
        [test_data, test_labels, training_data, training_labels,
         output_size, input_size]
    """
    # get the fold we are going to use for testing
    test_data, test_labels = copy.deepcopy(tenfolds[fold])
    # squish the rest of the data and ground truth labels into one numpy array, respectively
    remaining_data = [x[0] for i, x in enumerate(copy.deepcopy(tenfolds)) if i != fold]
    remaining_labels = [y[1] for i, y in enumerate(copy.deepcopy(tenfolds)) if i != fold]
    # Samples are stored column-wise, so folds are joined along axis 1.
    training_data = np.concatenate(remaining_data, axis=1)
    training_labels = np.concatenate(remaining_labels, axis=1)
    # determine how many output nodes the network has (1 if regression)
    if regression:
        # The number of output nodes is 1.
        output_size = 1
    else:
        # Count the number of classes in the label data set.
        output_size = du.CountClasses(training_labels)
    # Get the test data labels in one-hot encoding.
    test_labels = du.ConvertLabels(test_labels, output_size)
    # Get the training labels into a one-hot encoding.
    training_labels = du.ConvertLabels(training_labels, output_size)
    input_size = training_data.shape[0]
    return [test_data, test_labels, training_data, training_labels, output_size, input_size]
def print_label_distribution(labels, label_names=None): print "\nLabel Distribution:" flat_labels = RNN.flatten_sequence(labels) labels = du.transpose(flat_labels) if label_names is not None: assert len(label_names) == len(labels) else: label_names = [] for i in range(0, len(labels)): label_names[i] = "Label_" + str(i) for i in range(0, len(labels)): print " " + label_names[i] + ":", "{:<6}".format(np.nansum(np.array(labels[i]))), \ "({0:.0f}%)".format((float(np.nansum(np.array(labels[i]))) / len(labels[i])) * 100)
def add_representation(data,labels,label_column,duplicate=10,threshold=0.0): assert len(data) == len(labels) print "Adding Representation to label:",label_column ndata = [] nlabel = [] for i in range(0,len(data)): represent = 1 if labels[i] is list: if np.nanmean(labels[i], 0)[label_column] > threshold: represent = duplicate else: if labels[i][label_column] > threshold: represent = duplicate for j in range(0,represent): ndata.append(data[i]) nlabel.append(labels[i]) ndata,nlabel = du.shuffle(ndata,nlabel) return np.array(ndata),np.array(nlabel)
def test(self, samples, test_labels,label_names=None):
    """Evaluate the feed-forward network on held-out data (Python 2).

    Computes cross-entropy, AUC, kappa, F1, and percent correct per
    label column; prints a naive baseline built from the TEST label
    means; prints a per-label table and a confusion matrix.

    Returns:
        The network's rounded predictions (list of lists).
    """
    # test each using held-out data
    test = samples
    # if test_labels is None:
    #     return self.predict(test_samples)
    label_test = test_labels
    print("\nTesting...")
    print "Test Samples:", len(test)
    classes = []
    p_count = 0
    avg_class_err = []
    # Network-reported cost and raw predictions over the whole test set.
    avg_err = self.test_network(test, label_test)
    predictions = self.predict_network(test)
    for i in range(0, len(label_test)):
        p_count += 1
        classes.append(label_test[i].tolist())
    predictions = np.round(predictions, 3).tolist()
    actual = []
    pred = []
    cor = []
    # get the percent correct for the predictions
    # how often the prediction is right when it is made
    for i in range(0, len(predictions)):
        c = classes[i].index(max(classes[i]))
        actual.append(c)
        p = predictions[i].index(max(predictions[i]))
        pred.append(p)
        cor.append(int(c == p))
    # calculate a naive unfair baseline using averages
    # ("unfair" because it is computed from the test labels themselves).
    avg_class_pred = np.mean(label_test, 0)
    print "Predicting:", avg_class_pred, "for baseline*"
    for i in range(0, len(label_test)):
        res = FFNNet.AverageCrossEntropy(np.array(avg_class_pred),
                                         np.array(classes[i]))
        avg_class_err.append(res)
        # res = RNN_GRU.AverageCrossEntropy(np.array(predictions_GRU[i]), np.array(classes[i]))
        # avg_err_GRU.append(res)
    print "*This is calculated from the TEST labels"
    from sklearn.metrics import roc_auc_score, f1_score
    from skll.metrics import kappa
    kpa = []
    auc = []
    f1s = []
    # Transpose to per-label columns for column-wise metrics.
    t_pred = du.transpose(predictions)
    t_lab = du.transpose(label_test)
    for i in range(0, len(t_lab)):
        # if i == 0 or i == 3:
        #     t_pred[i] = du.normalize(t_pred[i],method='max')
        kpa.append(kappa(t_lab[i], t_pred[i]))
        auc.append(roc_auc_score(t_lab[i], t_pred[i]))
        # F1 needs hard 0/1 predictions; guard the all-zero case.
        temp_p = [round(j) for j in t_pred[i]]
        if np.nanmax(temp_p) == 0:
            f1s.append(0)
        else:
            f1s.append(f1_score(t_lab[i], temp_p))
    print "\nBaseline Average Cross-Entropy:", "{0:.4f}".format(np.nanmean(avg_class_err))
    print "\nNetwork Performance:"
    print "Average Cross-Entropy:", "{0:.4f}".format(np.nanmean(avg_err))
    print "AUC:", "{0:.4f}".format(np.nanmean(auc))
    print "Kappa:", "{0:.4f}".format(np.nanmean(kpa))
    print "F1 Score:", "{0:.4f}".format(np.nanmean(f1s))
    print "Percent Correct:", "{0:.2f}%".format(np.nanmean(cor) * 100)
    # Per-label metric table.
    print "\n{:<15}".format(" Label"), \
        "{:<9}".format(" AUC"), \
        "{:<9}".format(" Kappa"), \
        "{:<9}".format(" F Stat"), \
        "\n=============================================="
    if label_names is None or len(label_names) != len(t_lab):
        label_names = []
        for i in range(0, len(t_lab)):
            label_names.append("Label " + str(i + 1))
    for i in range(0, len(t_lab)):
        print "{:<15}".format(label_names[i]), \
            "{:<9}".format(" {0:.4f}".format(auc[i])), \
            "{:<9}".format(" {0:.4f}".format(kpa[i])), \
            "{:<9}".format(" {0:.4f}".format(f1s[i]))
    print "\n=============================================="
    # Confusion matrix over argmax class indices.
    actual = []
    predicted = []
    for i in range(0, len(predictions)):
        actual.append(label_test[i].tolist().index(max(label_test[i])))
        predicted.append(predictions[i].index(max(predictions[i])))
    from sklearn.metrics import confusion_matrix
    print confusion_matrix(actual, predicted)
    return predictions
def __init__(self, name, difficulty=0.5, difficulty_std=0.1):
    """Create an assignment with a pool of pre-generated problems.

    Draws 10000 difficulty values from a normal distribution centered on
    `difficulty` with spread `difficulty_std`, each clamped into [0, 1].
    """
    self.name = name
    self.problems = [
        du.clamp(np.random.normal(difficulty, difficulty_std), 0, 1)
        for _ in range(10000)
    ]
def next_problem(self):
    """Return a randomly chosen problem difficulty from this assignment's pool."""
    index = du.rand(0, len(self.problems))
    return self.problems[index]
    # (tail of split_for_autoencoding -- defined above this chunk)
    return du.convert_to_floats(ini_input), du.convert_to_floats(ini_output),\
        du.convert_to_floats(rem_input), du.convert_to_floats(rem_output)


if __name__ == "__main__":
    # load training and test data
    training = []
    tr_label = []
    testing = []
    test_label = []
    samples,labels = load_skill_data('simulated_data.csv','simulated_hierarchy.csv','simulated_hierarchy_nonlink.csv')
    tr_samples, t_samples, tr_labels,t_labels = du.split_training_test(samples,labels)
    # Transpose training labels so each row is one label column.
    t_tr_labels = du.transpose(tr_labels)
    import math
    # Oversampling factor per label column: roughly the inverse of its
    # positive rate (+1), so rare positives get duplicated more often.
    pre_rep = int(math.floor((len(t_tr_labels[0]) / np.nansum(t_tr_labels[0])) + 1))
    non_rep = int(math.floor((len(t_tr_labels[1]) / np.nansum(t_tr_labels[1])) + 1))
    rev_rep = int(math.floor((len(t_tr_labels[2]) / np.nansum(t_tr_labels[2])) + 1))
    print pre_rep, non_rep, rev_rep
    # Oversample each label column in turn, then subsample 20% of the result.
    re_tr_samples, re_tr_labels = add_representation(tr_samples, tr_labels, 0, pre_rep)
    re_tr_samples, re_tr_labels = add_representation(re_tr_samples, re_tr_labels, 1, non_rep)
    re_tr_samples, re_tr_labels = add_representation(re_tr_samples, re_tr_labels, 2, rev_rep)
    re_tr_samples, re_tr_labels = du.sample(re_tr_samples,re_tr_labels,p=0.2)
def main():
    """Run the DE/GA/PSO neural-network training experiment sweep.

    For each data set and each of 10 cross-validation folds, trains
    networks with 0/1/2 hidden layers using tuned hyperparameters, runs
    (truncated, single-iteration) DE, GA, and PSO optimization loops,
    evaluates the DE-best network, and pipes loss results to a CSV.
    """
    print("Program Start")
    headers = [
        "Data set", "layers", "pop", "Beta", "CR", "generations", "loss1",
        "loss2"
    ]
    filename = 'DE_experimental_resultsFINAL.csv'
    Per = Performance.Results()
    Per.PipeToFile([], headers, filename)
    data_sets = [
        "soybean", "glass", "abalone", "Cancer", "forestfires", "machine"
    ]
    # Which data sets are regression problems (single output node).
    regression_data_set = {
        "soybean": False,
        "Cancer": False,
        "glass": False,
        "forestfires": True,
        "machine": True,
        "abalone": True
    }
    categorical_attribute_indices = {
        "soybean": [],
        "Cancer": [],
        "glass": [],
        "forestfires": [],
        "machine": [],
        "abalone": []
    }
    # Tuned PSO hyperparameters per data set for 0/1/2 hidden layers.
    tuned_0_hl = {
        "soybean": {
            "omega": .5,
            "c1": .1,
            "c2": 5,
            "hidden_layer": []
        },
        "Cancer": {
            "omega": .5,
            "c1": .5,
            "c2": 5,
            "hidden_layer": []
        },
        "glass": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": []
        },
        "forestfires": {
            "omega": .2,
            "c1": 5,
            "c2": .5,
            "hidden_layer": []
        },
        "machine": {
            "omega": .5,
            "c1": .9,
            "c2": 5,
            "hidden_layer": []
        },
        "abalone": {
            "omega": .2,
            "c1": 5,
            "c2": .9,
            "hidden_layer": []
        }
    }
    tuned_1_hl = {
        "soybean": {
            "omega": .5,
            "c1": .5,
            "c2": 1,
            "hidden_layer": [7]
        },
        "Cancer": {
            "omega": .2,
            "c1": .5,
            "c2": 5,
            "hidden_layer": [4]
        },
        "glass": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": [8]
        },
        "forestfires": {
            "omega": .2,
            "c1": 5,
            "c2": 5,
            "hidden_layer": [8]
        },
        "machine": {
            "omega": .5,
            "c1": 5,
            "c2": .5,
            "hidden_layer": [4]
        },
        "abalone": {
            "omega": .2,
            "c1": .1,
            "c2": 5,
            "hidden_layer": [8]
        }
    }
    tuned_2_hl = {
        "soybean": {
            "omega": .5,
            "c1": .9,
            "c2": .1,
            "hidden_layer": [7, 12]
        },
        "Cancer": {
            "omega": .2,
            "c1": .5,
            "c2": 5,
            "hidden_layer": [4, 4]
        },
        "glass": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": [8, 6]
        },
        "forestfires": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": [8, 8]
        },
        "machine": {
            "omega": .2,
            "c1": .9,
            "c2": .1,
            "hidden_layer": [7, 2]
        },
        "abalone": {
            "omega": .2,
            "c1": 5,
            "c2": 5,
            "hidden_layer": [6, 8]
        }
    }
    du = DataUtility.DataUtility(categorical_attribute_indices,
                                 regression_data_set)
    total_counter = 0
    for data_set in data_sets:
        data_set_counter = 0
        # ten fold data and labels is a list of [data, labels] pairs, where
        # data and labels are numpy arrays:
        tenfold_data_and_labels = du.Dataset_and_Labels(data_set)
        for j in range(10):
            test_data, test_labels = copy.deepcopy(tenfold_data_and_labels[j])
            #Append all data folds to the training data set
            remaining_data = [
                x[0] for i, x in enumerate(tenfold_data_and_labels) if i != j
            ]
            remaining_labels = [
                y[1] for i, y in enumerate(tenfold_data_and_labels) if i != j
            ]
            #Store off a set of the remaining dataset
            X = np.concatenate(remaining_data, axis=1)
            #Store the remaining data set labels
            labels = np.concatenate(remaining_labels, axis=1)
            print(data_set, "training data prepared")
            regression = regression_data_set[data_set]
            #If the data set is a regression dataset
            if regression == True:
                #The number of output nodes is 1
                output_size = 1
            #else it is a classification data set
            else:
                #Count the number of classes in the label data set
                output_size = du.CountClasses(labels)
            #Get the test data labels in one hot encoding
            test_labels = du.ConvertLabels(test_labels, output_size)
            #Get the Labels into a One hot encoding
            labels = du.ConvertLabels(labels, output_size)
            input_size = X.shape[0]
            # NOTE(review): data_set_size is computed but never used below.
            data_set_size = X.shape[1] + test_data.shape[1]
            tuned_parameters = [
                tuned_0_hl[data_set], tuned_1_hl[data_set],
                tuned_2_hl[data_set]
            ]
            # Try 0, 1, and 2 hidden-layer architectures.
            for z in range(3):
                hidden_layers = tuned_parameters[z]["hidden_layer"]
                layers = [input_size] + hidden_layers + [output_size]
                nn = NeuralNetwork(input_size, hidden_layers, regression,
                                   output_size)
                nn.set_input_data(X, labels)
                # Total weight count = sum of products of adjacent layer sizes.
                total_weights = 0
                for i in range(len(layers) - 1):
                    total_weights += layers[i] * layers[i + 1]
                hyperparameters = {
                    "population_size": 10 * total_weights,
                    "beta": .5,
                    "crossover_rate": .6,
                    "max_gen": 100
                }
                hyperparameterss = {
                    "maxGen": 100,
                    "pop_size": 100,
                    "mutation_rate": .5,
                    "mutation_range": 10,
                    "crossover_rate": .5
                }
                hyperparametersss = {
                    "position_range": 10,
                    "velocity_range": 1,
                    "omega": .1,
                    # tuned_parameters[z]["omega"],
                    "c1": .9,
                    # tuned_parameters[z]["c1"],
                    "c2": .1,
                    # tuned_parameters[z]["c2"],
                    "vmax": 1,
                    "pop_size": 1000,
                    "max_t": 50
                }
                de = DE.DE(hyperparameters, total_weights, nn)
                ga = GA.GA(hyperparameterss, total_weights, nn)
                pso = PSO.PSO(layers, hyperparametersss, nn)
                learning_rate = 3
                momentum = 0
                VNN = VideoNN.NeuralNetworks(input_size, hidden_layers,
                                             regression, output_size,
                                             learning_rate, momentum)
                # NOTE(review): each optimizer loop is cut off after a single
                # iteration by the counter check below -- demo behavior.
                counter = 0
                print("DE OPERATIONS ")
                for gen in range(de.maxgens):
                    if counter == 1:
                        break
                    print("MUTATE AND CROSS OVER ")
                    de.Pmutate_and_crossover()
                    counter = counter + 1
                time.sleep(200)
                counter = 0
                print("GA OPERATIONS")
                for gen in range(ga.maxGen):
                    if counter == 1:
                        break
                    print()
                    ga.pfitness()
                    ga.Pselection()
                    ga.Pcrossover()
                    counter = counter + 1
                time.sleep(200)
                counter = 0
                print("PSO OPERATIONS")
                for epoch in range(pso.max_t):
                    if counter == 1:
                        break
                    pso.Pupdate_fitness()
                    pso.Pupdate_position_and_velocity()
                    counter = counter + 1
                time.sleep(200)
                # plt.plot(list(range(len(de.globalbest))), de.globalbest)
                # plt.draw()
                # plt.pause(0.00001)
                #plt.clf()
                # get the best overall solution and set the NN to those weights
                #DE
                bestSolution = de.bestChromie.getchromie()
                bestWeights = de.nn.weight_transform(bestSolution)
                de.nn.weights = bestWeights
                #GA
                #PS
                # ################################ new code for de end ###################################
                # plt.ioff()
                # plt.plot(list(range(len(de.globalbest))), de.globalbest)
                # plt.show()
                # img_name = data_set + '_l' + str(len(hidden_layers)) + '_pr' + str(a) + '_vr' + str(b) + '_w' + str(c) + '_c' + str(d) + '_cc' + str(e) + '_v' + str(f) + '_ps' + str(g) + '.png'
                # plt.savefig('tuning_plots/' + img_name)
                # plt.clf()
                # Evaluate the DE-optimized network on the held-out fold.
                Estimation_Values = de.nn.classify(test_data, test_labels)
                if regression == False:
                    #Decode the One Hot encoding Value
                    Estimation_Values = de.nn.PickLargest(Estimation_Values)
                    test_labels_list = de.nn.PickLargest(test_labels)
                    # print("ESTiMATION VALUES BY GIVEN INDEX (CLASS GUESS) ")
                    # print(Estimation_Values)
                else:
                    Estimation_Values = Estimation_Values.tolist()
                    test_labels_list = test_labels.tolist()[0]
                    Estimation_Values = Estimation_Values[0]
                Estimat = Estimation_Values
                groun = test_labels_list
                Nice = Per.ConvertResultsDataStructure(groun, Estimat)
                # print("THE GROUND VERSUS ESTIMATION:")
                # print(Nice)
                # headers = ["Data set", "layers", "pop", "Beta", "CR", "generations", "loss1", "loss2"]
                Meta = [
                    data_set,
                    len(hidden_layers), hyperparameters["population_size"],
                    hyperparameters["beta"], hyperparameters["crossover_rate"],
                    hyperparameters["max_gen"]
                ]
                Per.StartLossFunction(regression, Nice, Meta, filename)
                print(f"{data_set_counter}/30 {data_set}. {total_counter}/180")
                data_set_counter += 1
                total_counter += 1
    print("DEMO FINISHED")
    time.sleep(10000)


print("Program End ")
def is_file_already_compressed(file, data_set_type):
    """Return True when a compressed counterpart of *file* already exists."""
    compressed_dir = DataUtility.get_data_set_path(DataSetFormat.COMPRESSED,
                                                   data_set_type)
    return os.path.exists(compressed_dir + file.filename)
def test(self, test, test_labels=None, label_names=None):
    """Evaluate the RNN on sequences; without labels, delegate to predict (Python 2).

    Preprocesses inputs (covariate selection and z-scaling, generating
    scaling factors from the test set itself when absent -- see the
    warning print), runs the RNN per sequence, then reports
    cross-entropy, AUC, A', kappa, F1, percent correct, a per-label
    table, and a confusion matrix. Also stores summary metrics in
    self.eval_metrics.

    Returns:
        Rounded per-timestep predictions (list of lists).
    """
    if test_labels is None:
        return self.predict(test)
    test_cpy = list(test)
    # Preprocess only when the input width doesn't already match the network.
    if not du.len_deepest(test_cpy) == self.num_input:
        if self.covariates is not None:
            # Keep only the configured covariate columns per time step.
            for a in range(0, len(test_cpy)):
                if type(test_cpy[a]) is not list:
                    test_cpy[a] = test_cpy[a].tolist()
                for e in range(0, len(test_cpy[a])):
                    c = []
                    for i in range(0, len(self.covariates)):
                        c.append(test_cpy[a][e][self.covariates[i]])
                    test_cpy[a][e] = c
        if len(self.cov_mean) == 0 or len(self.cov_stdev) == 0:
            # No scaling factors from training: derive them from the test
            # sample (and cache them on self as a side effect).
            print "Scaling factors have not been generated: calculating using test sample"
            t_tr = du.transpose(RNN.flatten_sequence(test_cpy))
            self.cov_mean = []
            self.cov_stdev = []
            for a in range(0, len(t_tr)):
                mn = np.nanmean(t_tr[a])
                sd = np.nanstd(t_tr[a])
                self.cov_mean.append(mn)
                self.cov_stdev.append(sd)
    # Z-score each covariate; NaN/inf (and zero-stdev columns) become 0.
    test_samples = []
    import math
    for a in range(0, len(test_cpy)):
        sample = []
        for e in range(0, len(test_cpy[a])):
            covariates = []
            for i in range(0, len(test_cpy[a][e])):
                cov = 0
                if self.cov_stdev[i] == 0:
                    cov = 0
                else:
                    cov = (test_cpy[a][e][i] -
                           self.cov_mean[i]) / self.cov_stdev[i]
                if math.isnan(cov) or math.isinf(cov):
                    cov = 0
                covariates.append(cov)
            sample.append(covariates)
        test_samples.append(sample)
    label_test = test_labels
    print("\nTesting...")
    print "Test Samples:", len(test_samples)
    classes = []
    p_count = 0
    avg_class_err = []
    avg_err_RNN = []
    if self.scale_output:
        print "Scaling output..."
    predictions_RNN = []
    for i in range(0, len(test_samples)):
        # get the prediction and calculate cost
        prediction_RNN = self.pred_RNN([test_samples[i]])
        #prediction_RNN += .5-self.avg_preds
        if self.scale_output:
            # Min/max-rescale predictions into [0, 1].
            prediction_RNN -= self.min_preds
            prediction_RNN /= (self.max_preds - self.min_preds)
            prediction_RNN = np.clip(prediction_RNN, 0, 1)
        # Break ties for all-ones rows by slightly damping every class
        # except the majority class -- TODO confirm the np.sum(x) == 4
        # condition assumes 4 output classes.
        prediction_RNN = [(x * [
            1 if c == self.majorityclass else 0.9999
            for c in range(0, self.num_output)
        ]) if np.sum(x) == 4 else x for x in prediction_RNN]
        avg_err_RNN.append(
            self.compute_cost_RNN([test_samples[i]], label_test[i]))
        for j in range(0, len(label_test[i])):
            p_count += 1
            classes.append(label_test[i][j].tolist())
            predictions_RNN.append(prediction_RNN[j].tolist())
    predictions_RNN = np.round(predictions_RNN, 3).tolist()
    actual = []
    pred_RNN = []
    cor_RNN = []
    # get the percent correct for the predictions
    # how often the prediction is right when it is made
    for i in range(0, len(predictions_RNN)):
        c = classes[i].index(max(classes[i]))
        actual.append(c)
        p_RNN = predictions_RNN[i].index(max(predictions_RNN[i]))
        pred_RNN.append(p_RNN)
        cor_RNN.append(int(c == p_RNN))
    # calculate a naive baseline using averages
    flattened_label = []
    for i in range(0, len(label_test)):
        for j in range(0, len(label_test[i])):
            flattened_label.append(label_test[i][j])
    flattened_label = np.array(flattened_label)
    avg_class_pred = np.mean(flattened_label, 0)
    print "Predicting:", avg_class_pred, "for baseline*"
    for i in range(0, len(flattened_label)):
        res = RNN.AverageCrossEntropy(np.array(avg_class_pred),
                                      np.array(classes[i]))
        avg_class_err.append(res)
        # res = RNN.AverageCrossEntropy(np.array(predictions_RNN[i]), np.array(classes[i]))
        # avg_err_RNN.append(res)
    print "*This is calculated from the TEST labels"
    from sklearn.metrics import roc_auc_score, f1_score
    from skll.metrics import kappa
    kpa = []
    auc = []
    f1s = []
    apr = []
    # Transpose to per-label columns for column-wise metrics.
    t_pred = du.transpose(predictions_RNN)
    t_lab = du.transpose(flattened_label)
    for i in range(0, len(t_lab)):
        #if i == 0 or i == 3:
        #    t_pred[i] = du.normalize(t_pred[i],method='max')
        # F1 needs hard 0/1 predictions; guard the all-zero case below.
        temp_p = [round(j) for j in t_pred[i]]
        kpa.append(kappa(t_lab[i], t_pred[i]))
        apr.append(du.Aprime(t_lab[i], t_pred[i]))
        auc.append(roc_auc_score(t_lab[i], t_pred[i]))
        if np.nanmax(temp_p) == 0:
            f1s.append(0)
        else:
            f1s.append(f1_score(t_lab[i], temp_p))
    if label_names is None or len(label_names) != len(t_lab):
        label_names = []
        for i in range(0, len(t_lab)):
            label_names.append("Label " + str(i + 1))
    RNN.print_label_distribution(label_test, label_names)
    # Cache summary metrics for later inspection by callers.
    self.eval_metrics = [
        np.nanmean(avg_err_RNN),
        np.nanmean(auc),
        np.nanmean(kpa),
        np.nanmean(f1s),
        np.nanmean(cor_RNN) * 100
    ]
    print "\nBaseline Average Cross-Entropy:", "{0:.4f}".format(
        np.nanmean(avg_class_err))
    print "\nNetwork Performance:"
    print "Average Cross-Entropy:", "{0:.4f}".format(
        np.nanmean(avg_err_RNN))
    print "AUC:", "{0:.4f}".format(np.nanmean(auc))
    print "A':", "{0:.4f}".format(np.nanmean(apr))
    print "Kappa:", "{0:.4f}".format(np.nanmean(kpa))
    print "F1 Score:", "{0:.4f}".format(np.nanmean(f1s))
    print "Percent Correct:", "{0:.2f}%".format(np.nanmean(cor_RNN) * 100)
    # Per-label metric table.
    print "\n{:<15}".format(" Label"), \
        "{:<9}".format(" AUC"), \
        "{:<9}".format(" A'"), \
        "{:<9}".format(" Kappa"), \
        "{:<9}".format(" F Stat"), \
        "\n=============================================="
    for i in range(0, len(t_lab)):
        print "{:<15}".format(label_names[i]), \
            "{:<9}".format(" {0:.4f}".format(auc[i])), \
            "{:<9}".format(" {0:.4f}".format(apr[i])), \
            "{:<9}".format(" {0:.4f}".format(kpa[i])), \
            "{:<9}".format(" {0:.4f}".format(f1s[i]))
    print "\n=============================================="
    print "Confusion Matrix:"
    # Confusion matrix over argmax class indices.
    actual = []
    predicted = []
    flattened_label = flattened_label.tolist()
    for i in range(0, len(predictions_RNN)):
        actual.append(flattened_label[i].index(max(flattened_label[i])))
        predicted.append(predictions_RNN[i].index(max(predictions_RNN[i])))
    from sklearn.metrics import confusion_matrix
    conf_mat = confusion_matrix(actual, predicted)
    for cm in conf_mat:
        cm_row = "\t"
        for element in cm:
            cm_row += "{:<6}".format(element)
        print cm_row
    print "\n=============================================="
    return predictions_RNN
def predict(self, test): test_cpy = list(test) if not du.len_deepest(test_cpy) == self.num_input: if self.covariates is not None: for a in range(0, len(test_cpy)): if type(test_cpy[a]) is not list: test_cpy[a] = test_cpy[a].tolist() for e in range(0, len(test[a])): c = [] for i in range(0, len(self.covariates)): c.append(test_cpy[a][e][self.covariates[i]]) test_cpy[a][e] = c if len(self.cov_mean) == 0 or len(self.cov_stdev) == 0: print "Scaling factors have not been generated: calculating using test sample" t_tr = du.transpose(RNN.flatten_sequence(test_cpy)) self.cov_mean = [] self.cov_stdev = [] for a in range(0, len(t_tr)): mn = np.nanmean(t_tr[a]) sd = np.nanstd(t_tr[a]) self.cov_mean.append(mn) self.cov_stdev.append(sd) test_samples = [] import math for a in range(0, len(test_cpy)): sample = [] for e in range(0, len(test_cpy[a])): covariates = [] for i in range(0, len(test_cpy[a][e])): cov = 0 if self.cov_stdev[i] == 0: cov = 0 else: cov = (test_cpy[a][e][i] - self.cov_mean[i]) / self.cov_stdev[i] if math.isnan(cov) or math.isinf(cov): cov = 0 covariates.append(cov) sample.append(covariates) test_samples.append(sample) if self.scale_output: print "Scaling output..." predictions_RNN = [] for i in range(0, len(test_samples)): # get the prediction and calculate cost prediction_RNN = self.pred_RNN([test_samples[i]]) if self.scale_output: prediction_RNN -= self.min_preds prediction_RNN /= (self.max_preds - self.min_preds) prediction_RNN = np.clip(prediction_RNN, 0, 1) prediction_RNN = [(x * [ 1 if c == self.majorityclass else 0.9999 for c in range(0, self.num_output) ]) if np.sum(x) == 4 else x for x in prediction_RNN] for j in range(0, len(prediction_RNN)): predictions_RNN.append(prediction_RNN[j].tolist()) predictions_RNN = np.round(predictions_RNN, 3).tolist() return predictions_RNN
def run():
    """Daily update pipeline: append today's Peru COVID-19 figures, derive
    statistics, render two quad-plot images, and tweet them as a thread.

    Side effects: reads/writes CSV data files, writes plot images, posts to
    Twitter, and commits to a git repository.  No return value.
    """
    # Obtain current date
    current_date = datetime.date.today().strftime('%Y-%m-%d')
    # Get top level directory
    top_level_directory = get_top_level_directory_path()
    # Load configuration files for program and Twitter authentication
    main_config = cu.Config(top_level_directory +
                            '/src/twitter_updates/TwitterUpdateConfig.dat')
    auth_config = cu.Config(top_level_directory +
                            main_config.get_value('TwitterAuth'))
    # Remove any old files from /res/raw_images
    clean_dir(main_config.get_value('RawImages'))
    # Authenticate Twitter API session
    twitter_session = TwitterAPISession(auth_config)
    # Store values read by OCR algorithm in a dictionary
    input_data = {\
        'Date' : current_date,
        'Cases' : 0,
        'Deaths' : 0,
        'Tests' : 0,
        'Recovered' : 0,
        'Hospitalized' : 0,
        'Cases24H' : 0
    }
    # Remove any old files from /res/raw_images
    # NOTE(review): duplicate of the clean_dir call above -- harmless but
    # redundant; confirm whether one can be removed.
    clean_dir(main_config.get_value('RawImages'))
    # Open temporary command line to check if data is correct
    check_data_menu(input_data)
    # Load simple Peru data set
    PER_data = du.Table('l', filename=top_level_directory +
                        main_config.get_value('PeruSimpleData'))
    # Agregate new data entry
    PER_data.append_entry({
        'Fecha': input_data['Date'],
        'Casos': int(input_data['Cases']),
        'Fallecidos': int(input_data['Deaths']),
        'Pruebas': int(input_data['Tests']),
        'Recuperados': int(input_data['Recovered']),
        'Hospitalizados': int(input_data['Hospitalized'])
    })
    # Save simple Peru data set
    PER_data.save_as_csv(top_level_directory +
                         main_config.get_value('PeruSimpleData'))
    # Create copy of simple Peru data set to perform extrapolation
    PER_full_data = du.Table('c', table=PER_data)
    # Compute new derived statistics (each call appends a derived column
    # computed from the listed source columns)
    PER_full_data.compute_add_column(['Casos'], compute_new_cases,
                                     'NuevosCasos')
    PER_full_data.compute_add_column(['Casos'], compute_cases_growth_factor,
                                     '%DifCasos')
    PER_full_data.compute_add_column(['Casos', 'Recuperados', 'Fallecidos'],
                                     compute_active_cases, 'CasosActivos')
    PER_full_data.compute_add_column(['CasosActivos'],
                                     compute_new_active_cases,
                                     'NuevosCasosActivos')
    PER_full_data.compute_add_column(['Fallecidos'], compute_new_deaths,
                                     'NuevosFallecidos')
    PER_full_data.compute_add_column(['Fallecidos'],
                                     compute_deaths_growth_factor,
                                     '%DifFallecidos')
    PER_full_data.compute_add_column(['Casos', 'Fallecidos'],
                                     compute_case_fatality_rate,
                                     'TasaLetalidad')
    PER_full_data.compute_add_column(['Pruebas'], compute_new_tests,
                                     'NuevasPruebas')
    PER_full_data.compute_add_column(['Pruebas'],
                                     compute_tests_growth_factor,
                                     '%DifPruebas')
    PER_full_data.compute_add_column(['NuevasPruebas', 'NuevosCasos'],
                                     compute_daily_positivity_rate,
                                     '%PruebasPositivasDiarias')
    PER_full_data.compute_add_column(['Recuperados'], compute_new_recovered,
                                     'NuevosRecuperados')
    # NOTE(review): this reuses compute_tests_growth_factor for the
    # recovered-growth column -- looks like it should be a recovered-specific
    # growth function; confirm before changing.
    PER_full_data.compute_add_column(['Recuperados'],
                                     compute_tests_growth_factor,
                                     '%DifRecuperados')
    PER_full_data.compute_add_column(['Hospitalizados'],
                                     compute_new_hospitalized,
                                     'NuevosHospitalizados')
    PER_full_data.compute_add_column(['Hospitalizados'],
                                     compute_hospitalized_growth_factor,
                                     '%DifHospitalizados')
    PER_full_data.compute_add_column([], compute_days, 'Dia')
    # Reorganize header index before saving
    new_header = {
        0: 'Fecha',
        1: 'Dia',
        2: 'Casos',
        3: 'NuevosCasos',
        4: '%DifCasos',
        5: 'CasosActivos',
        6: 'NuevosCasosActivos',
        7: 'Fallecidos',
        8: 'NuevosFallecidos',
        9: '%DifFallecidos',
        10: 'TasaLetalidad',
        11: 'Pruebas',
        12: 'NuevasPruebas',
        13: '%DifPruebas',
        14: '%PruebasPositivasDiarias',
        15: 'Recuperados',
        16: 'NuevosRecuperados',
        17: '%DifRecuperados',
        18: 'Hospitalizados',
        19: 'NuevosHospitalizados',
        20: '%DifHospitalizados'
    }
    # Rearrange header index in Peru full data
    PER_full_data.rearrange_header_index(new_header)
    # Save full Peru data set
    PER_full_data.save_as_csv(top_level_directory +
                              main_config.get_value('PeruFullData'))
    # Create quadplot object for first tweet (cases / recovered /
    # hospitalized, last 30 days)
    quadplot_1 = pu.QuadPlot(
        [
            main_config.get_value('CasesColor'),
            main_config.get_value('CasesColor'),
            main_config.get_value('RecoveredColor'),
            main_config.get_value('HospitalizedColor')
        ], [
            'Casos Confirmados (ultimos 30 dias)',
            'Nuevos Casos Confirmados (ultimos 30 dias)',
            'Nuevos Recuperados (ultimos 30 dias)',
            'Hospitalizados (ultimos 30 dias)'
        ], [False, True, True, True], ['bar', 'bar', 'bar', 'bar'], [
            'Fecha (YYYY-MM-DD)', 'Fecha (YYYY-MM-DD)', 'Fecha (YYYY-MM-DD)',
            'Fecha (YYYY-MM-DD)'
        ], [
            'Casos Confirmados (acumulado por dia)',
            'Nuevos Casos Confirmados (por dia)',
            'Nuevos Recuperados (por dia)', 'Hospitalizados (por dia)'
        ], [
            PER_full_data.get_column('Fecha')[-30:],
            PER_full_data.get_column('Fecha')[-30:],
            PER_full_data.get_column('Fecha')[-30:],
            PER_full_data.get_column('Fecha')[-30:]
        ], [
            PER_full_data.get_column('Casos')[-30:],
            PER_full_data.get_column('NuevosCasos')[-30:],
            PER_full_data.get_column('NuevosRecuperados')[-30:],
            PER_full_data.get_column('Hospitalizados')[-30:]
        ], current_date +
        ' | Elaborado por Kurt Manrique-Nino | Datos del Ministerio de Salud del Peru (@Minsa_Peru)',
        top_level_directory + main_config.get_value('TwitterGraph1'),
        ravg_days=[7, 7, 7, 7],
        ravg_labels=[
            'Promedio ultimos 7 dias', 'Promedio ultimos 7 dias',
            'Promedio ultimos 7 dias', 'Promedio ultimos 7 dias'
        ],
        ravg_ydata=[
            None,
            PER_full_data.get_column('NuevosCasos'),
            PER_full_data.get_column('NuevosRecuperados'),
            PER_full_data.get_column('Hospitalizados')
        ])
    # Create quadplot object for second tweet (deaths / fatality rate /
    # tests / positivity, last 30 days)
    quadplot_2 = pu.QuadPlot(
        [
            main_config.get_value('DeathsColor'),
            main_config.get_value('DeathsColor'),
            main_config.get_value('TestsColor'),
            main_config.get_value('TestsColor')
        ], [
            'Nuevos Fallecidos (ultimos 30 dias)',
            'Tasa de Letalidad (ultimos 30 dias)',
            'Nuevas Pruebas (PM+PR+AG) (ultimos 30 dias)',
            'Positividad Diaria (PM+PR+AG) (ultimos 30 dias)'
        ], [True, True, True, True], ['bar', 'scatter', 'bar', 'scatter'], [
            'Fecha (YYYY-MM-DD)', 'Fecha (YYYY-MM-DD)', 'Fecha (YYYY-MM-DD)',
            'Fecha (YYYY-MM-DD)'
        ], [
            'Nuevos Fallecidos (por dia)',
            'Tasa de Letalidad (acumulado por dia)',
            'Nuevas Pruebas (por dia)',
            'Positividad Diaria * 100% (PM+PR+AG)'
        ], [
            PER_full_data.get_column('Fecha')[-30:],
            PER_full_data.get_column('Fecha')[-30:],
            PER_full_data.get_column('Fecha')[-30:],
            PER_full_data.get_column('Fecha')[-30:]
        ], [
            PER_full_data.get_column('NuevosFallecidos')[-30:],
            PER_full_data.get_column('TasaLetalidad')[-30:],
            PER_full_data.get_column('NuevasPruebas')[-30:],
            PER_full_data.get_column('%PruebasPositivasDiarias')[-30:]
        ], current_date +
        ' | Elaborado por Kurt Manrique-Nino | Datos del Ministerio de Salud del Peru (@Minsa_Peru)',
        top_level_directory + main_config.get_value('TwitterGraph2'),
        ravg_days=[7, 7, 7, 7],
        ravg_labels=[
            'Promedio ultimos 7 dias', 'Promedio ultimos 7 dias',
            'Promedio ultimos 7 dias', 'Promedio ultimos 7 dias'
        ],
        ravg_ydata=[
            PER_full_data.get_column('NuevosFallecidos'),
            PER_full_data.get_column('TasaLetalidad'),
            PER_full_data.get_column('NuevasPruebas'),
            PER_full_data.get_column('%PruebasPositivasDiarias')
        ])
    # Generate and store quadplot
    quadplot_1.export()
    # Generate and store quadplot
    quadplot_2.export()
    # Obtain the last entry of Peru full data
    latest_entry = PER_full_data.get_latest_entry()
    # Create instances of tweets to store text and image paths
    tweet1 = Tweet()
    tweet2 = Tweet()
    # Create and add tweet body for first tweet
    tweet1.set_message(
        generate_first_tweet_text(
            top_level_directory + main_config.get_value('TwTemplate1'),
            latest_entry, int(input_data['Cases24H'])))
    # Create and add tweet body for second tweet; the rows-2 queries fetch
    # yesterday's values for day-over-day comparison
    tweet2.set_message(
        generate_second_tweet_text(
            top_level_directory + main_config.get_value('TwTemplate2'),
            latest_entry,
            PER_full_data.col_row_query('TasaLetalidad',
                                        PER_full_data.rows - 2),
            PER_full_data.col_row_query('%PruebasPositivasDiarias',
                                        PER_full_data.rows - 2)))
    # Add paths to graph images
    tweet1.add_image(top_level_directory +
                     main_config.get_value('TwitterGraph1'))
    tweet2.add_image(top_level_directory +
                     main_config.get_value('TwitterGraph2'))
    # Export tweet messages into a file
    export_tweets_to_file(
        top_level_directory + main_config.get_value('TweetExport'),
        [tweet1, tweet2])
    # Reply to @Minsa_Peru with tweet thread
    twitter_session.send_thread([tweet1, tweet2])
    # Update GitHub repository with new data
    if (sys.platform == 'win32'):
        update_git_repo_win32(input_data['Date'])
    else:
        update_git_repo_linux(input_data['Date'])
def load_skill_data(data_filename, prereq_file, nolink_file):
    """Build prerequisite-classification samples from per-student skill logs.

    For every student and every skill pair, extracts three aligned feature
    rows (before / at / after the linking observation, columns 3..7) and
    labels the triple as one of three one-hot classes:
    [1,0,0] prerequisite direction, [0,0,1] reversed direction,
    [0,1,0] no link.

    :param data_filename: CSV with headers; column 2 is assumed to be the
        student id (selection key) -- TODO confirm.
    :param prereq_file: CSV of known (pre, post) skill pairs.
    :param nolink_file: CSV of skill pairs known to be unlinked.
    :return: (samples, labels), shuffled together; exits the process if no
        usable samples were found.
    """
    print "Loading Data..."
    data, headers = du.loadCSVwithHeaders(data_filename)
    prereqs = du.loadCSV(prereq_file)
    nolink = du.loadCSV(nolink_file)
    samples = []
    labels = []
    # Echo the column layout (header name + first row's value per column).
    for i in range(0, len(headers)):
        print '{:>2}: {:<18} {:<12}'.format(str(i), headers[i], data[0][i])
    print "Hierarchy Structure:"
    for p in prereqs:
        print p[0], '->', p[1]
    # Unique student ids from column 2.
    students = du.unique(du.transpose(data)[2])
    for i in range(0, len(students)):
        # All rows belonging to this student.
        student_set = du.select(data, students[i], '==', 2)
        for p in prereqs:
            # Forward direction: p[0] is prerequisite of p[1].
            post = du.select(student_set, p[1], '==', 0)
            if not len(post) == 0:
                post = post[0]
                # Rows on the prerequisite skill before/after the post row.
                pre = du.select(du.select(student_set, p[0], '==', 0),
                                post[1], '<', 1)
                rem = du.select(du.select(student_set, p[0], '==', 0),
                                post[1], '>', 1)
                if not (len(pre) == 0 or len(rem) == 0):
                    pre = pre[0]
                    rem = rem[0]
                    samp_pre = []
                    samp_post = []
                    samp_rem = []
                    # Feature columns 3..7 of each of the three rows.
                    for j in range(3, 8):
                        samp_pre.append(pre[j])
                        samp_post.append(post[j])
                        samp_rem.append(rem[j])
                    samples.append(
                        [samp_pre, samp_post, samp_rem, [p[0], p[1]]])
                    labels.append([1, 0, 0])
            # print pre
            # print post
            # print rem
            # print ' '
            # Reversed direction: same construction with p[0]/p[1] swapped
            # (and taking the LAST matching post row).
            post = du.select(student_set, p[0], '==', 0)
            if not len(post) == 0:
                post = post[-1]
                pre = du.select(du.select(student_set, p[1], '==', 0),
                                post[1], '<', 1)
                rem = du.select(du.select(student_set, p[1], '==', 0),
                                post[1], '>', 1)
                if not (len(pre) == 0 or len(rem) == 0):
                    pre = pre[0]
                    rem = rem[0]
                    samp_pre = []
                    samp_post = []
                    samp_rem = []
                    for j in range(3, 8):
                        samp_pre.append(pre[j])
                        samp_post.append(post[j])
                        samp_rem.append(rem[j])
                    samples.append(
                        [samp_pre, samp_post, samp_rem, [p[1], p[0]]])
                    labels.append([0, 0, 1])
            # print pre
            # print post
            # print rem
            # print ' '
        # Known non-links produce the middle class.
        for p in nolink:
            post = du.select(student_set, p[1], '==', 0)
            if not len(post) == 0:
                post = post[0]
                pre = du.select(du.select(student_set, p[0], '==', 0),
                                post[1], '<', 1)
                rem = du.select(du.select(student_set, p[0], '==', 0),
                                post[1], '>', 1)
                if not (len(pre) == 0 or len(rem) == 0):
                    pre = pre[0]
                    rem = rem[0]
                    samp_pre = []
                    samp_post = []
                    samp_rem = []
                    for j in range(3, 8):
                        samp_pre.append(pre[j])
                        samp_post.append(post[j])
                        samp_rem.append(rem[j])
                    samples.append(
                        [samp_pre, samp_post, samp_rem, [p[0], p[1]]])
                    labels.append([0, 1, 0])
                    # print pre
                    # print post
                    # print rem
                    # print ' '
    # =================================================================
    if len(labels) == 0:
        print "\nNO USABLE SAMPLES EXIST"
        exit()
    du.print_label_distribution(labels, ['Prerequisite','Non-Link','Reversed'])
    samples,labels = du.shuffle(samples, labels)
    return samples,labels
def get_knowledge_effect(self):
    # Non-negative sample of this link's knowledge effect
    # (normal draw, floored at 0 via du.MAX).
    return du.MAX(
        np.random.normal(self.knowledge_effect, self.knowledge_effect_std),
        0)


def get_speed_effect(self):
    # Non-negative sample of this link's speed effect.
    return du.MAX(
        np.random.normal(self.speed_effect, self.speed_effect_std), 0)


def get_hint_effect(self):
    # Non-negative sample of this link's hint effect.
    return du.MAX(np.random.normal(self.hint_effect, self.hint_effect_std),
                  0)


if __name__ == "__main__":
    # Seed a population of synthetic students from rows of real,
    # pre-filtered data.
    data, headers = du.loadCSVwithHeaders('filtered_data.csv')
    students = []
    num_students = 1000
    print "Generating data for", num_students, "students..."
    for i in range(0, num_students):
        # Pick a random source row; its columns parameterize the student.
        index = du.rand(0, len(data))
        #self, hint, speed, knowledge, h_sd, s_sd, k_sd):
        students.append(
            Student(data[index][7], data[index][1], data[index][4],
                    data[index][8], data[index][2], data[index][5]))
    # difficulty is probability of correctness (higher is easier)
    A = Skill('A', 0.6, 0.05)
def main():
    """Compare DE, GA, PSO, and backprop training of a neural network on
    ten folds of each configured data set, writing losses to a CSV.

    Results are appended to 'VIDEORESULTS.csv' via Performance.Results.
    NOTE(review): the loop currently skips every data set except 'Cancer'
    (hard-coded filter below) -- confirm whether that is intentional.
    """
    print("Program Start")
    headers = [
        "Data set", "layers", "pop", "Beta", "CR", "generations", "loss1",
        "loss2"
    ]
    filename = 'VIDEORESULTS.csv'
    Per = Performance.Results()
    Per.PipeToFile([], headers, filename)
    data_sets = [
        "soybean", "glass", "abalone", "Cancer", "forestfires", "machine"
    ]
    # Whether each data set is a regression (True) or classification task.
    regression_data_set = {
        "soybean": False,
        "Cancer": False,
        "glass": False,
        "forestfires": True,
        "machine": True,
        "abalone": True
    }
    categorical_attribute_indices = {
        "soybean": [],
        "Cancer": [],
        "glass": [],
        "forestfires": [],
        "machine": [],
        "abalone": []
    }
    # Tuned PSO hyperparameters for 0 / 1 / 2 hidden-layer topologies.
    tuned_0_hl = {
        "soybean": {
            "omega": .5,
            "c1": .1,
            "c2": 5,
            "hidden_layer": []
        },
        "Cancer": {
            "omega": .5,
            "c1": .5,
            "c2": 5,
            "hidden_layer": []
        },
        "glass": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": []
        },
        "forestfires": {
            "omega": .2,
            "c1": 5,
            "c2": .5,
            "hidden_layer": []
        },
        "machine": {
            "omega": .5,
            "c1": .9,
            "c2": 5,
            "hidden_layer": []
        },
        "abalone": {
            "omega": .2,
            "c1": 5,
            "c2": .9,
            "hidden_layer": []
        }
    }
    tuned_1_hl = {
        "soybean": {
            "omega": .5,
            "c1": .5,
            "c2": 1,
            "hidden_layer": [7]
        },
        "Cancer": {
            "omega": .2,
            "c1": .5,
            "c2": 5,
            "hidden_layer": [4]
        },
        "glass": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": [8]
        },
        "forestfires": {
            "omega": .2,
            "c1": 5,
            "c2": 5,
            "hidden_layer": [8]
        },
        "machine": {
            "omega": .5,
            "c1": 5,
            "c2": .5,
            "hidden_layer": [4]
        },
        "abalone": {
            "omega": .2,
            "c1": .1,
            "c2": 5,
            "hidden_layer": [8]
        }
    }
    tuned_2_hl = {
        "soybean": {
            "omega": .5,
            "c1": .9,
            "c2": .1,
            "hidden_layer": [7, 12]
        },
        "Cancer": {
            "omega": .2,
            "c1": .5,
            "c2": 5,
            "hidden_layer": [4, 4]
        },
        "glass": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": [8, 6]
        },
        "forestfires": {
            "omega": .2,
            "c1": .9,
            "c2": 5,
            "hidden_layer": [8, 8]
        },
        "machine": {
            "omega": .2,
            "c1": .9,
            "c2": .1,
            "hidden_layer": [7, 2]
        },
        "abalone": {
            "omega": .2,
            "c1": 5,
            "c2": 5,
            "hidden_layer": [6, 8]
        }
    }
    du = DataUtility.DataUtility(categorical_attribute_indices,
                                 regression_data_set)
    total_counter = 0
    for data_set in data_sets:
        # NOTE(review): only the Cancer data set is actually processed.
        if data_set != 'Cancer':
            continue
        data_set_counter = 0
        # ten fold data and labels is a list of [data, labels] pairs, where
        # data and labels are numpy arrays:
        tenfold_data_and_labels = du.Dataset_and_Labels(data_set)
        for j in range(10):
            test_data, test_labels = copy.deepcopy(tenfold_data_and_labels[j])
            #Append all data folds to the training data set
            remaining_data = [
                x[0] for i, x in enumerate(tenfold_data_and_labels) if i != j
            ]
            remaining_labels = [
                y[1] for i, y in enumerate(tenfold_data_and_labels) if i != j
            ]
            #Store off a set of the remaining dataset
            X = np.concatenate(remaining_data, axis=1)
            #Store the remaining data set labels
            labels = np.concatenate(remaining_labels, axis=1)
            print(data_set, "training data prepared")
            regression = regression_data_set[data_set]
            #If the data set is a regression dataset
            if regression == True:
                #The number of output nodes is 1
                output_size = 1
            #else it is a classification data set
            else:
                #Count the number of classes in the label data set
                output_size = du.CountClasses(labels)
                #Get the test data labels in one hot encoding
                test_labels = du.ConvertLabels(test_labels, output_size)
                #Get the Labels into a One hot encoding
                labels = du.ConvertLabels(labels, output_size)
            input_size = X.shape[0]
            data_set_size = X.shape[1] + test_data.shape[1]
            tuned_parameters = [
                tuned_0_hl[data_set], tuned_1_hl[data_set],
                tuned_2_hl[data_set]
            ]
            # NOTE(review): range(1) means only the 0-hidden-layer tuning is
            # used despite three being prepared; confirm.
            for z in range(1):
                hidden_layers = tuned_parameters[z]["hidden_layer"]
                layers = [input_size] + hidden_layers + [output_size]
                # One identical network per optimizer (DE, GA, PSO).
                nn = NeuralNetwork(input_size, hidden_layers, regression,
                                   output_size)
                nn.set_input_data(X, labels)
                nn1 = NeuralNetwork(input_size, hidden_layers, regression,
                                    output_size)
                nn1.set_input_data(X, labels)
                nn2 = NeuralNetwork(input_size, hidden_layers, regression,
                                    output_size)
                nn2.set_input_data(X, labels)
                # Total weight count across consecutive layer pairs.
                total_weights = 0
                for i in range(len(layers) - 1):
                    total_weights += layers[i] * layers[i + 1]
                # DE hyperparameters.
                hyperparameters = {
                    "population_size": 10 * total_weights,
                    "beta": .5,
                    "crossover_rate": .6,
                    "max_gen": 100
                }
                # GA hyperparameters.
                hyperparameterss = {
                    "maxGen": 100,
                    "pop_size": 100,
                    "mutation_rate": .5,
                    "mutation_range": 10,
                    "crossover_rate": .5
                }
                # PSO hyperparameters (tuned values left commented out).
                hyperparametersss = {
                    "position_range": 10,
                    "velocity_range": 1,
                    "omega": .1,
                    # tuned_parameters[z]["omega"],
                    "c1": .9,
                    # tuned_parameters[z]["c1"],
                    "c2": .1,
                    # tuned_parameters[z]["c2"],
                    "vmax": 1,
                    "pop_size": 1000,
                    "max_t": 50
                }
                de = DE.DE(hyperparameters, total_weights, nn)
                ga = GA.GA(hyperparameterss, total_weights, nn1)
                pso = PSO.PSO(layers, hyperparametersss, nn2)
                learning_rate = 3
                momentum = 0
                # Reference backprop network.
                VNN = VideoNN.NeuralNetworks(input_size, hidden_layers,
                                             regression, output_size,
                                             learning_rate, momentum)
                VNN.set_input_data(X, labels)
                # Train each optimizer for its configured budget.
                for gen in range(de.maxgens):
                    de.mutate_and_crossover()
                for gen in range(ga.maxGen):
                    ga.fitness()
                    ga.selection()
                    ga.crossover()
                counter = 0
                for epoch in range(pso.max_t):
                    pso.update_fitness()
                    pso.update_position_and_velocity()
                for epoch in range(100):
                    VNN.forward_pass()
                    VNN.backpropagation_pass()
                # Install DE's best chromosome as the network weights.
                bestSolution = de.bestChromie.getchromie()
                bestWeights = de.nn.weight_transform(bestSolution)
                de.nn.weights = bestWeights
                Estimation_Values = de.nn.classify(test_data, test_labels)
                Estimation_Values1 = ga.nn.classify(test_data, test_labels)
                Estimation_Values2 = pso.NN.classify(test_data, test_labels)
                Estimation_Values3 = VNN.classify(test_data, test_labels)
                if regression == False:
                    #Decode the One Hot encoding Value
                    Estimation_Values = de.nn.PickLargest(Estimation_Values)
                    test_labels_list = de.nn.PickLargest(test_labels)
                    Estimation_Values1 = ga.nn.PickLargest(Estimation_Values1)
                    Tll = ga.nn.PickLargest(test_labels)
                    Estimation_Values2 = pso.NN.PickLargest(
                        Estimation_Values2)
                    tll1 = pso.NN.PickLargest(test_labels)
                    Estimation_Values3 = VNN.PickLargest(Estimation_Values3)
                    tll = VNN.PickLargest(test_labels)
                    # print("ESTiMATION VALUES BY GIVEN INDEX (CLASS GUESS) ")
                    # print(Estimation_Values)
                else:
                    # NOTE(review): in this regression branch Tll/tll1/tll
                    # are never bound, so the ConvertResultsDataStructure
                    # calls below would raise NameError; unreachable today
                    # because only the (classification) Cancer set runs.
                    Estimation_Values = Estimation_Values.tolist()
                    test_labels_list = test_labels.tolist()[0]
                    Estimation_Values = Estimation_Values[0]
                Estimat = Estimation_Values
                groun = test_labels_list
                meta = list()
                Nice = Per.ConvertResultsDataStructure(groun, Estimat)
                Nice1 = Per.ConvertResultsDataStructure(
                    Tll, Estimation_Values1)
                Nice2 = Per.ConvertResultsDataStructure(
                    tll1, Estimation_Values2)
                Nice3 = Per.ConvertResultsDataStructure(
                    tll, Estimation_Values3)
                DEss = Per.StartLossFunction(regression, Nice, meta)
                GAss = Per.StartLossFunction(regression, Nice1, meta)
                PSOSS = Per.StartLossFunction(regression, Nice2, meta)
                VNNS = Per.StartLossFunction(regression, Nice3, meta)
                print("DE")
                print(DEss)
                print("GA")
                print(GAss)
                print("PSO")
                print(PSOSS)
                print("NN Back prop.")
                print(VNNS)
                # print("THE GROUND VERSUS ESTIMATION:")
                # print(Nice)
                # headers = ["Data set", "layers", "pop", "Beta", "CR", "generations", "loss1", "loss2"]
                Meta = [
                    data_set,
                    len(hidden_layers), hyperparameters["population_size"],
                    hyperparameters["beta"], hyperparameters["crossover_rate"],
                    hyperparameters["max_gen"]
                ]
                Per.StartLossFunction(regression, Nice, Meta, filename)
                data_set_counter += 1
                total_counter += 1
    print("Program End ")
def train(self, training, output=None):
    """Train the autoencoder on *training* sequences with k-fold validation.

    Normalizes each feature column, builds the network if needed, then runs
    num_epochs of mini-batch training with per-epoch validation, printing a
    progress table.

    :param training: list of samples (lists of feature values).
    :param output: target samples; defaults to *training* (autoencoding).
    """
    if output is None:
        output = training
    assert len(training) == len(output)
    self.num_input = du.len_deepest(training)
    self.num_output = du.len_deepest(output)
    # Transpose to column-major so each row is one feature to normalize.
    training = du.transpose(training)
    output = du.transpose(output)
    # NOTE(review): this loop runs over len(training) (input features) but
    # also indexes output[i]; safe when output defaults to training, would
    # over-index if a narrower output were passed -- confirm.
    for i in range(0,len(training)):
        training[i] = du.normalize(training[i])
        output[i] = du.normalize(output[i])
    training = du.transpose(training)
    output = du.transpose(output)
    if not self.isBuilt:
        self.build_network()
    print "Input Nodes:", self.num_input
    print "Output Nodes:", self.num_output
    # introduce cross-validation
    # NOTE(review): sklearn.cross_validation is the pre-0.20 legacy module;
    # all labels are the constant 1, so the stratification is degenerate
    # and this acts as plain k-fold.
    from sklearn.cross_validation import StratifiedKFold
    strat_label = []
    for i in range(0, len(training)):
        strat_label.append(1)
    skf = StratifiedKFold(strat_label, n_folds=self.num_folds)
    print "Number of Folds:", len(skf)
    print "Training Samples:", len(training)
    print("\nTraining AutoEncoder...")
    print "{:<9}".format(" Epoch"), \
        "{:<9}".format(" Train"), \
        "{:<9}".format(" Valid"), \
        "{:<9}".format(" Time"), \
        "\n======================================"
    start_time = time.clock()
    train_err = []
    val_err = []
    # for each epoch...
    for e in range(0, self.num_epochs):
        epoch_time = time.clock()
        epoch = 0
        # 'eval' shadows the builtin; kept for token fidelity.
        eval = 0
        n_train = 0
        n_test = 0
        # train and test
        for ktrain, ktest in skf:
            for i in range(0, len(ktrain), self.batch_size):
                batch_sample = []
                batch_label = []
                # create a batch of training samples
                for j in range(i, min(len(ktrain), i + self.batch_size)):
                    #print training[ktrain[j]]
                    batch_sample.append(training[ktrain[j]])
                    batch_label.append(output[ktrain[j]])
                # update and get the cost
                #print batch_sample
                #print self.get_output(batch_sample)
                #print batch_label
                epoch += self.train_network(batch_sample, batch_label)
                n_train += 1
            # validate on the held-out fold
            sample = []
            label = []
            for i in range(0, len(ktest)):
                sample.append(training[ktest[i]])
                label.append(output[ktest[i]])
            n_test += 1
            eval += self.test_network(sample, label)
        train_err.append(epoch / n_train)
        val_err.append(eval / n_test)
        print "{:<11}".format("Epoch " + str(e + 1) + ":"), \
            "{:<9}".format("{0:.4f}".format(epoch / n_train)), \
            "{:<9}".format("{0:.4f}".format(eval / n_test)), \
            "{:<9}".format("{0:.1f}s".format(time.clock() - epoch_time))
    print "Total Training Time:", "{0:.1f}s".format(time.clock() - start_time)
def __init__(self, name, difficulty=0.5, difficulty_std=0.1):
    """Create a named skill with a pre-generated pool of 10000 problems.

    Each problem difficulty is an independent draw from
    N(difficulty, difficulty_std), clamped into [0, 1].
    """
    self.name = name
    self.problems = [
        du.clamp(np.random.normal(difficulty, difficulty_std), 0, 1)
        for _ in range(10000)
    ]
def get_hint_effect(self):
    """Return a non-negative sample of this link's hint effect."""
    drawn = np.random.normal(self.hint_effect, self.hint_effect_std)
    return du.MAX(drawn, 0)
def get_knowledge_effect(self):
    """Return a non-negative sample of this link's knowledge effect."""
    drawn = np.random.normal(self.knowledge_effect, self.knowledge_effect_std)
    return du.MAX(drawn, 0)
"cr": .8, "hidden_layer": [6, 8] } } ############################################## # START MULTIPROCESS JOB POOL ############################################## manager = multiprocessing.Manager() q = manager.Queue() writer = multiprocessing.Process(target=data_writer, args=(q, filename)) writer.start() pool = multiprocessing.Pool() ############################################## du = DataUtility.DataUtility(categorical_attribute_indices, regression_data_set) total_counter = 0 for data_set in data_sets: regression = regression_data_set[data_set] tuned_parameters = [ tuned_0_hl[data_set], tuned_1_hl[data_set], tuned_2_hl[data_set] ] data_set_counter = 0 # ten fold data and labels is a list of [data, labels] pairs, where # data and labels are numpy arrays: tenfold_data_and_labels = du.Dataset_and_Labels(data_set) for j in range(10): data_package = generate_data_package(
def get_speed_effect(self):
    """Return a non-negative sample of this link's speed effect."""
    drawn = np.random.normal(self.speed_effect, self.speed_effect_std)
    return du.MAX(drawn, 0)
if len(item_both_rated) == 0: continue # 获取用户 a 和用户 b 都评价的商品的评价值 user_a_ = [user_a[i] for i in item_both_rated] user_b_ = [user_b[i] for i in item_both_rated] # 根据用户 a 和用户 b 都评价的商品的评价值,计算Pearson 相关系数 sim = np.corrcoef(user_a_, user_b_)[0, 1] # 如果相关系数大于阈值,那么加入到评分预测值计算中 if sim >= self.limit: predict_up += sim * (user_b[testItem] - np.mean(user_b_)) predict_down += sim return 1 if predict_down == 0 else predict_up / predict_down + np.sum( user_a) / np.count_nonzero(user_a) df_train = DataUtility.getTrainData() # df_train = df_train.iloc[0:int(df_train.shape[0] / 10), 0:int(df_train.shape[1])] # df_test = DataUtility.getTestData() CF = collaborativeFiltering(df_train, 0.5) test_data = [] user = list(df_train.index) item = list(df_train.columns) for i in range(int(df_train.shape[0] / 10)): for j in range(df_train.shape[1]): if df_train.iloc[i, j] == 0: continue else: test_data.append([user[i], item[j], df_train.iloc[i, j]]) break test_data = [[t[0], t[1], t[2]] for t in test_data if t[2] != 0] p = CF.predict(test_data)
self.hint_effect_std = hint_std SkillLink.list.append(self) def get_knowledge_effect(self): return du.MAX(np.random.normal(self.knowledge_effect, self.knowledge_effect_std), 0) def get_speed_effect(self): return du.MAX(np.random.normal(self.speed_effect, self.speed_effect_std), 0) def get_hint_effect(self): return du.MAX(np.random.normal(self.hint_effect, self.hint_effect_std), 0) if __name__ == "__main__": data, headers = du.loadCSVwithHeaders('filtered_data.csv') students = [] num_students = 1000 print "Generating data for", num_students, "students..." for i in range(0,num_students): index = du.rand(0,len(data)) #self, hint, speed, knowledge, h_sd, s_sd, k_sd): students.append(Student(data[index][7], data[index][1], data[index][4], data[index][8], data[index][2], data[index][5]))