def initialize_instances(filename):
    """Read a CSV data file into a list of labelled instances.

    The first row is treated as a header and skipped. For every other row the
    first column (an index) is ignored, the last column is the raw target and
    the columns in between become the instance's attributes.

    :param str filename: path of the CSV file to read.
    :return: one ``Instance`` per data row; its label is ``0`` when the raw
        target is below 15 and ``1`` otherwise.
    :rtype: list[Instance]
    :raises ValueError: if an attribute or target cell is not numeric.
    """
    print('Creating instances')
    instances = []

    # The context manager releases the handle even when a row fails to parse;
    # the previous explicit close() was skipped on any exception.
    # Binary mode is what the Python 2 csv module expects.
    with open(filename, 'rb') as in_file:
        reader = csv.reader(in_file, delimiter=',')
        next(reader, None)  # skip the header row; tolerates an empty file
        for row in reader:
            # row[0] is the index column and row[-1] the target: neither is
            # part of the attribute vector.
            instance = Instance([float(value) for value in row[1:-1]])
            instance.setLabel(Instance(0 if float(row[-1]) < 15 else 1))
            instances.append(instance)

    print('Finished instances')
    return instances
def train(oa, network, oaName, instances, measure):
    """Run the optimizer and print the summed error after every iteration.

    The loop runs ``TRAINING_ITERATIONS`` times (module-level constant). Each
    round calls ``oa.train()`` once, then feeds every instance through the
    network and accumulates ``measure.value(label, example)``.

    :param OptimizationAlgorithm oa:
    :param BackPropagationNetwork network:
    :param str oaName: name shown in the printed banner.
    :param list[Instance] instances:
    :param AbstractErrorMeasure measure:
    """
    print("\nError results for %s\n---------------------------" % (oaName,))

    for _ in xrange(TRAINING_ITERATIONS):
        oa.train()

        total = 0.00
        for sample in instances:
            network.setInputValues(sample.getData())
            network.run()

            expected = sample.getLabel()
            outputs = network.getOutputValues()
            # Pair the output vector with an Instance built from its first
            # component; this is the shape handed to the error measure.
            produced = Instance(outputs, Instance(outputs.get(0)))
            total += measure.value(expected, produced)

        print("%0.03f" % total)
def errorOnDataSet(network,ds,measure):
    """Evaluate ``network`` on ``ds``; return ``(mean error, accuracy)``.

    An instance counts as correct when the first network output, clamped to
    [0, 1], is within 0.5 of the label's continuous value.

    :param network: network exposing setInputValues/run/getOutputValues.
    :param ds: sized collection of instances.
    :param measure: error measure exposing ``value(label, example)``.
    :return: ``(error_sum / len(ds), correct / evaluated)``.
    :raises ZeroDivisionError: if ``ds`` is empty.
    """
    size = len(ds)
    total_error = 0.
    hits = 0
    misses = 0

    for sample in ds:
        network.setInputValues(sample.getData())
        network.run()

        truth = sample.getLabel().getContinuous()
        guess = network.getOutputValues().get(0)
        guess = max(min(guess, 1), 0)  # clamp into [0, 1]

        if abs(guess - truth) < 0.5:
            hits += 1
        else:
            misses += 1

        expected = sample.getLabel()
        outputs = network.getOutputValues()
        produced = Instance(outputs, Instance(outputs.get(0)))
        total_error += measure.value(expected, produced)

    MSE = total_error / float(size)
    acc = hits / float(hits + misses)
    return MSE, acc
def train(oa, network, oaName, instances,
          measure=SumOfSquaresError(), surpress_output=False,
          TRAINING_ITERATIONS=1500):
    """Run the optimizer, optionally printing the error every 100 rounds.

    :param OptimizationAlgorithm oa:
    :param BackPropagationNetwork network:
    :param str oaName: name shown in the printed banner.
    :param list[Instance] instances:
    :param AbstractErrorMeasure measure: error measure; the default instance
        is created once, when the function is defined.
    :param bool surpress_output: when true nothing is printed.
    :param int TRAINING_ITERATIONS: number of ``oa.train()`` calls.
    """
    verbose = not surpress_output
    if verbose:
        banner = ("\nError results for %s every 100 iterations"
                  "\n---------------------------")
        print(banner % (oaName,))

    for step in xrange(TRAINING_ITERATIONS):
        oa.train()

        total = 0.00
        for sample in instances:
            network.setInputValues(sample.getData())
            network.run()

            expected = sample.getLabel()
            outputs = network.getOutputValues()
            produced = Instance(outputs, Instance(outputs.get(0)))
            total += measure.value(expected, produced)

        # The error is computed every round but reported every 100th only.
        if verbose and step % 100 == 0:
            print("%0.03f" % total)
def initialize_instances(infile):
    """Read the given CSV data into a list of one-hot labelled instances.

    Every column but the last becomes an attribute. The last column is a class
    index in ``0 .. NUM_OF_CLASSES - 1`` (module-level constant) and is
    expanded into a one-hot vector of length ``NUM_OF_CLASSES``.

    :param str infile: path of the CSV file.
    :return: one ``Instance`` per row.
    :rtype: list[Instance]
    :raises ValueError: if a cell is not numeric.
    :raises IndexError: if a class index falls outside the valid range.
    """
    instances = []

    with open(infile, "r") as dat:
        for row_number, row in enumerate(csv.reader(dat), 1):
            instance = Instance([float(value) for value in row[:-1]])

            label = int(float(row[-1]))
            # Check the range explicitly: a negative index would otherwise
            # wrap around (e.g. -1 -> last class) and silently mark the wrong
            # class instead of failing.
            if not 0 <= label < NUM_OF_CLASSES:
                raise IndexError(
                    "row %d: class label %d is outside 0..%d"
                    % (row_number, label, NUM_OF_CLASSES - 1))

            classes = [0] * NUM_OF_CLASSES
            classes[label] = 1
            instance.setLabel(Instance(classes))
            instances.append(instance)

    return instances
def errorOnDataSet(network, ds, measure):
    """Evaluate ``network`` on ``ds``; return ``(mean error, accuracy)``.

    Only the first component of the label data and of the network output is
    compared: the output is binarised at 0.5 (strictly greater gives 1.0) and
    must equal the label component exactly.

    :param network: network exposing setInputValues/run/getOutputValues.
    :param ds: sized collection of instances.
    :param measure: error measure exposing ``value(label, example)``.
    :return: ``(error_sum / len(ds), correct / evaluated)``.
    :raises ZeroDivisionError: if ``ds`` is empty.
    """
    size = len(ds)
    total_error = 0.
    hits = 0
    misses = 0

    for sample in ds:
        network.setInputValues(sample.getData())
        network.run()

        truth = sample.getLabel().getData().get(0)
        decision = 1.0 if network.getOutputValues().get(0) > 0.5 else 0.0

        if truth == decision:
            hits += 1
        else:
            misses += 1

        expected = sample.getLabel()
        outputs = network.getOutputValues()
        # Here the whole output vector is wrapped as the example's label.
        produced = Instance(outputs, Instance(outputs))
        total_error += measure.value(expected, produced)

    MSE = total_error / float(size)
    acc = hits / float(hits + misses)
    return MSE, acc
def train(oa, network, oaName, instances, measure):
    """Run the optimizer, logging the mean error per iteration.

    After each of the ``TRAINING_ITERATIONS`` rounds the summed error divided
    by ``len(instances)`` is printed and appended as a one-column row to
    ``data/<oaName>.csv``.

    :param OptimizationAlgorithm oa:
    :param BackPropagationNetwork network:
    :param str oaName: banner text and CSV file stem.
    :param list[Instance] instances:
    :param AbstractErrorMeasure measure:
    """
    print("\nError results for %s\n---------------------------" % (oaName,))

    log_path = os.path.join("data", oaName + ".csv")
    with open(log_path, "wb") as results:
        log = csv.writer(results, delimiter=',')

        for _ in xrange(TRAINING_ITERATIONS):
            oa.train()

            total = 0.00
            for sample in instances:
                network.setInputValues(sample.getData())
                network.run()

                expected = sample.getLabel()
                outputs = network.getOutputValues()
                produced = Instance(outputs, Instance(outputs.get(0)))
                total += measure.value(expected, produced)

            mean_error = total / len(instances)
            print(mean_error)
            log.writerow([mean_error])
def train(oa, network, oaName, instances, measure, fileobject):
    """Run the optimizer, writing per-iteration errors to ``fileobject``.

    A ``"<oaName> training "`` header line is written first; each of the
    ``TRAINING_ITERATIONS`` rounds then appends ``oaName,iteration,error`` and
    prints a progress line.

    :param OptimizationAlgorithm oa:
    :param BackPropagationNetwork network:
    :param str oaName:
    :param list[Instance] instances:
    :param AbstractErrorMeasure measure:
    :param fileobject: object with a ``write(str)`` method receiving the log.
    """
    tag = str(oaName)
    fileobject.write(tag + " training \n")

    for step in xrange(TRAINING_ITERATIONS):
        oa.train()

        total = 0.00
        for sample in instances:
            network.setInputValues(sample.getData())
            network.run()

            expected = sample.getLabel()
            outputs = network.getOutputValues()
            produced = Instance(outputs, Instance(outputs.get(0)))
            total += measure.value(expected, produced)

        print("finished iter %s for %s" % (step, oaName))
        fileobject.write(",".join([tag, str(step), str(total)]) + "\n")
def eval_instances(net, instances, measure):
    """Score ``net`` on a data set and return ``(accuracy, mean_error)``.

    Only the first network output is used (binary classification). It is
    clamped to [0, 1] and counted as correct when it lies within 0.5 of the
    label's continuous value.

    :param net: network exposing setInputValues/run/getOutputValues.
    :param instances: sized collection of instances (train/test/validation).
    :param measure: error measure exposing ``value(label, example)``.
    :return: ``(correct / len(instances), error_sum / len(instances))``.
    :raises ZeroDivisionError: if ``instances`` is empty.
    """
    set_len = len(instances)
    right = 0
    error = 0.

    for inst in instances:
        net.setInputValues(inst.getData())
        net.run()

        truth = inst.getLabel().getContinuous()
        outputs = net.getOutputValues()

        # The previous int(truth) == int(output) test truncated the output
        # towards zero, so e.g. 0.97 was scored as class 0. Threshold at 0.5
        # instead.
        predicted = max(min(outputs.get(0), 1), 0)
        if abs(predicted - truth) < 0.5:
            right += 1

        example = Instance(outputs, Instance(outputs.get(0)))
        error += measure.value(inst.getLabel(), example)

    accuracy = float(right) / float(set_len)
    error = error / float(set_len)
    return accuracy, error
def error_on_data_set(network, ds, measure, ugh=False):
    """Evaluate a multi-output network; return ``(mean error, accuracy, f1)``.

    For each instance the outputs are clamped to [0, 1]. The instance counts
    as correct when the rounded label component at the position of the largest
    clamped output equals 1. Rounded predictions and clamped rounded labels
    are collected and handed to ``f1_score`` (defined elsewhere), whose result
    is unpacked as ``(precision, recall, f1)``.

    :param network: network exposing setInputValues/run/getOutputValues.
    :param ds: sized collection of instances.
    :param measure: error measure exposing ``value(label, example)``.
    :param bool ugh: debug switch; prints per-instance details and then
        terminates the process with ``sys.exit(0)`` instead of returning.
    :return: ``(MSE, acc, f1)``.
    """
    N = len(ds)
    total_error = 0.
    hits = 0
    misses = 0
    actuals = []
    predicteds = []

    for instance in ds:
        network.setInputValues(instance.getData())
        network.run()

        label = instance.getLabel()
        raw_outputs = network.getOutputValues()

        clamped = [max(min(raw_outputs.get(j), 1), 0)
                   for j in range(0, raw_outputs.size())]
        rounded = [round(value) for value in clamped]

        # Diagnostics: warn when the rounded output is not one-hot.
        active = sum(rounded)
        if active > 1:
            print("FOUND TWO ONES : {}".format(rounded))
        elif active == 0:
            print("ALL ZEROS : {}".format(rounded))

        label_data = label.getData()
        truth = [round(label_data.get(k))
                 for k in range(0, label_data.size())]

        if ugh:
            print("label: {}".format(instance.getLabel()))
            print("actual: {}, predicted: {}".format(label, raw_outputs))

        predicteds.append(rounded)
        actuals.append([max(min(value, 1), 0) for value in truth])

        # Index of the strongest output (first one on ties).
        best = max(xrange(len(clamped)), key=clamped.__getitem__)
        if truth[best] == 1:
            hits += 1
            if ugh:
                print("CORRECT")
        else:
            misses += 1
            if ugh:
                print("INCORRECT")

        outputs = network.getOutputValues()
        example = Instance(outputs, Instance(outputs))
        sample_error = measure.value(instance.getLabel(), example)
        total_error += sample_error
        if ugh:
            print("error: {}".format(sample_error))

    MSE = total_error / float(N)
    acc = hits / float(hits + misses)
    precision, recall, f1 = f1_score(actuals, predicteds)

    if ugh:
        print("MSE: {}, acc: {}, f1: {} (precision: {}, recall: {})".format(
            MSE, acc, f1, precision, recall))
        import sys
        sys.exit(0)

    return MSE, acc, f1
def errorOnDataSet(network, ds, measure):
    """Evaluate ``network`` on ``ds``; return ``(mean error, accuracy, F1)``.

    The first network output is clamped to [0, 1]. It is counted as correct
    when within 0.5 of the label's continuous value. Confusion counts for the
    F1 score treat an output of at least 0.5 as a positive prediction, and
    only labels exactly equal to 0.0 or 1.0 contribute to them.

    :param network: network exposing setInputValues/run/getOutputValues.
    :param ds: sized collection of instances.
    :param measure: error measure exposing ``value(label, example)``.
    :return: ``(MSE, acc, F1)``; precision, recall and F1 fall back to 0.0
        when their denominator is zero.
    :raises ZeroDivisionError: if ``ds`` is empty.
    """
    def ratio(numerator, denominator):
        # Division that yields 0.0 instead of raising on a zero denominator.
        try:
            return numerator / denominator
        except ZeroDivisionError:
            return 0.0

    size = len(ds)
    total_error = 0.
    hits = 0
    misses = 0
    fp = 0.0
    fn = 0.0
    tp = 0.0
    tn = 0.0

    for sample in ds:
        network.setInputValues(sample.getData())
        network.run()

        truth = sample.getLabel().getContinuous()
        guess = max(min(network.getOutputValues().get(0), 1), 0)

        # Both comparisons stay explicit so a NaN output lands in no bucket.
        if truth == 0.0:
            if guess >= 0.5:
                fp += 1.0
            if guess < 0.5:
                tn += 1.0
        if truth == 1.0:
            if guess >= 0.5:
                tp += 1.0
            if guess < 0.5:
                fn += 1.0

        if abs(guess - truth) < 0.5:
            hits += 1
        else:
            misses += 1

        expected = sample.getLabel()
        outputs = network.getOutputValues()
        produced = Instance(outputs, Instance(outputs.get(0)))
        total_error += measure.value(expected, produced)

    MSE = total_error / float(size)
    acc = hits / float(hits + misses)

    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    F1 = 2.0 * ratio(precision * recall, precision + recall)

    return MSE, acc, F1
def initialize_instances():
    """Load the CSV rows of ``TRAIN_FILE`` as binary-labelled instances.

    All columns but the last are attributes; the label is 0 when the last
    column parses to the integer 0 and 1 otherwise.

    :return: one ``Instance`` per row.
    :rtype: list[Instance]
    """
    loaded = []
    with open(TRAIN_FILE, "r") as handle:
        for fields in csv.reader(handle):
            example = Instance([float(cell) for cell in fields[:-1]])
            if int(fields[-1]) == 0:
                target = 0
            else:
                target = 1
            example.setLabel(Instance(target))
            loaded.append(example)
    return loaded
def get_test_instances():
    """Load the CSV rows of ``TEST_FILE`` as binary-labelled instances.

    All columns but the last are attributes; the label is 0 when the last
    column parses to the integer 0 and 1 otherwise.

    :return: one ``Instance`` per row.
    :rtype: list[Instance]
    """
    def to_instance(fields):
        # Convert one CSV row into an Instance carrying its 0/1 label.
        example = Instance([float(cell) for cell in fields[:-1]])
        example.setLabel(Instance(0 if int(fields[-1]) == 0 else 1))
        return example

    with open(TEST_FILE, "r") as f:
        return [to_instance(fields) for fields in csv.reader(f)]
def train(oa, network, oaName, train_set, test_set, measure,
          max_iterations=TRAINING_ITERATIONS):
    """Run the optimizer, logging mean train/test error per iteration.

    Each round appends ``iteration,train_error,test_error`` (errors divided by
    the respective set size, five decimals) to ``out/error/<oaName>.csv``.

    :param OptimizationAlgorithm oa:
    :param BackPropagationNetwork network:
    :param str oaName: stem of the output CSV file.
    :param train_set: object whose ``getInstances()`` yields training data.
    :param test_set: object whose ``getInstances()`` yields test data.
    :param AbstractErrorMeasure measure:
    :param int max_iterations: number of ``oa.train()`` calls.
    """
    def summed_error(samples):
        # Feed every sample through the network and add up the measure.
        total = 0.00
        for sample in samples:
            network.setInputValues(sample.getData())
            network.run()
            expected = sample.getLabel()
            outputs = network.getOutputValues()
            produced = Instance(outputs, Instance(outputs.get(0)))
            total += measure.value(expected, produced)
        return total

    train_instances = train_set.getInstances()
    test_instances = test_set.getInstances()

    fname = 'out/error/%s.csv' % (oaName)
    with open(fname, 'w') as f:
        for iteration in xrange(max_iterations):
            oa.train()

            # Both passes run before either division, as in the two
            # back-to-back loops this helper replaces.
            train_total = summed_error(train_instances)
            test_total = summed_error(test_instances)

            train_error_norm = train_total / len(train_instances)
            test_error_norm = test_total / len(test_instances)
            f.write("%d,%0.05f,%0.05f\n"
                    % (iteration, train_error_norm, test_error_norm))

    print('Error written to %s' % (fname))
def train(oa, network, oaName, instances, measure):
    """Train for the first fifth of the schedule, sampling accuracy and error.

    ``TRAINING_ITERATIONS`` is iterated directly and also passed to ``len()``;
    the loop stops at the first value that reaches
    ``int(len(TRAINING_ITERATIONS) / 5)``. Only the time spent inside
    ``oa.train()`` is accumulated. Whenever the iteration value is a multiple
    of the module-level ``interval``, the first 1000 instances are evaluated.

    :param OptimizationAlgorithm oa:
    :param BackPropagationNetwork network:
    :param str oaName: not used by the body.
    :param list[Instance] instances:
    :param AbstractErrorMeasure measure:
    :return: list of ``[iteration, accuracy, error, training_time]`` records.
    """
    history = []
    elapsed = 0.

    for step in TRAINING_ITERATIONS:
        if step >= int(len(TRAINING_ITERATIONS) / 5):
            break

        started = time.time()
        oa.train()
        finished = time.time()
        elapsed += finished - started

        if step % interval == 0:
            hits, misses = 0, 0
            total_error = 0.
            for sample in instances[:1000]:
                network.setInputValues(sample.getData())
                network.run()

                truth = sample.getLabel().getContinuous()
                guess = network.getOutputValues().get(0)
                if abs(guess - truth) < 0.5:
                    hits += 1
                else:
                    misses += 1

                expected = sample.getLabel()
                outputs = network.getOutputValues()
                produced = Instance(outputs, Instance(outputs.get(0)))
                total_error += measure.value(expected, produced)

            accuracy = 1. * hits / (hits + misses)
            history.append([step, accuracy, total_error, elapsed])

    return history
def initialize_instances(infile):
    """Load a CSV file as binary-labelled instances.

    All columns but the last are attributes; the label is 0 when the last
    column is less than or equal to zero and 1 otherwise.

    :param str infile: path of the CSV file.
    :return: one ``Instance`` per row.
    :rtype: list[Instance]
    """
    collected = []
    with open(infile, "r") as source:
        for record in csv.reader(source):
            attributes = [float(field) for field in record[:-1]]
            entry = Instance(attributes)
            raw_target = float(record[-1])
            entry.setLabel(Instance(0 if raw_target <= 0 else 1))
            collected.append(entry)
    return collected
def get_error(data, network, measure):
    """Return the summed error of ``network`` over ``data``.

    :param data: iterable of instances.
    :param network: network exposing setInputValues/run/getOutputValues.
    :param measure: error measure exposing ``value(label, example)``.
    :return: sum of ``measure.value`` over all instances (0.0 when empty).
    """
    total = 0.00
    for sample in data:
        network.setInputValues(sample.getData())
        network.run()

        outputs = network.getOutputValues()
        produced = Instance(outputs, Instance(outputs.get(0)))
        total += measure.value(sample.getLabel(), produced)
    return total
def maybe_serialize(file, force=False):
    """Serialize a CSV data set into a ``.ser`` file unless one exists.

    The target path is ``file`` with its extension replaced by ``.ser``. When
    it is missing (or ``force`` is true) the CSV is read, passed to
    ``normalize_data``, shuffled and split at ``TRAIN_TEST_SPLIT_RATIO`` into
    train and test instances. These are written through a Java object stream
    as a dict keyed by ``TRAIN`` and ``TEST``.

    :param str file: path of the CSV data set.
    :param bool force: rebuild the serialized file even when it exists.
    :return: path of the serialized file.
    :rtype: str
    """
    serialized_file = os.path.splitext(file)[0] + '.ser'

    if os.path.isfile(serialized_file) and not force:
        stdout.write("Serialized file for data-set found.\n")
        return serialized_file

    def to_instances(rows):
        # Last column is the raw target: -1 maps to label 0, anything else
        # to label 1.
        converted = []
        for row in rows:
            instance = Instance([float(value) for value in row[:-1]])
            instance.setLabel(Instance(0 if float(row[-1]) == -1 else 1))
            converted.append(instance)
        return converted

    stdout.write("Serializing Data-Set...\n\n")
    with open(file, "r") as pima:
        reader_list = list(csv.reader(pima))

    stdout.write("Some sample, un-shuffled data: \n%s\n\n" % reader_list[:3])
    normalize_data(reader_list)
    random.shuffle(reader_list)

    split_at = int(len(reader_list) * TRAIN_TEST_SPLIT_RATIO)
    train_instances = to_instances(reader_list[:split_at])
    test_instances = to_instances(reader_list[split_at:])

    stdout.write("Some sample, shuffled training data (after "
                 "normalization): "
                 "\n%s\n\n" % train_instances[:3])
    stdout.write("Some sample, shuffled test data (after "
                 "normalization): \n%s\n\n" % test_instances[:3])
    stdout.write("Train Data\tTest Data\n")
    stdout.write("%s\t\t%s\n" % (len(train_instances), len(test_instances)))

    save = {
        TRAIN: train_instances,
        TEST: test_instances,
    }

    outFile = io.FileOutputStream(serialized_file)
    try:
        outStream = io.ObjectOutputStream(outFile)
        outStream.writeObject(save)
        # Flush the object stream explicitly rather than relying on the
        # underlying file's close() to persist buffered bytes.
        outStream.flush()
    finally:
        # Always release the file handle, even if serialization fails.
        outFile.close()

    return serialized_file
def initialize_instances(filename, i):
    """Load a CSV file as instances labelled with the integer last column.

    :param str filename: path of the CSV file.
    :param i: not used by the body; kept for the callers' signature.
    :return: one ``Instance`` per row, attributes taken from all columns but
        the last, label built from ``int(last column)``.
    :rtype: list[Instance]
    """
    result = []
    with open(filename, "r") as stream:
        for cells in csv.reader(stream):
            *_, = ()  # placeholder removed below
    return result
def read_dataset(path, pos_set):
    """Load a CSV file, skipping its first row, as binary-labelled instances.

    :param str path: path of the CSV file.
    :param pos_set: container of target values that count as positive; tested
        with ``float(last column) in pos_set``.
    :return: one ``Instance`` per row after the first; label 1 for a positive
        target and 0 otherwise.
    :rtype: list[Instance]
    """
    dataset = []
    with open(path, 'r') as stream:
        rows = csv.reader(stream)
        next(rows, None)  # the first row is never converted
        for fields in rows:
            sample = Instance([float(cell) for cell in fields[:-1]])
            is_positive = float(fields[-1]) in pos_set
            sample.setLabel(Instance(1 if is_positive else 0))
            dataset.append(sample)
    return dataset
def initialize_instances(infile):
    """Load a CSV file as binary-labelled instances.

    All columns but the last are attributes; a negative last column gives
    label 0, anything else label 1.

    :param str infile: path of the CSV file.
    :return: one ``Instance`` per row.
    :rtype: list[Instance]
    """
    def labelled(fields):
        # Build the attribute vector first, then attach the 0/1 label.
        item = Instance([float(cell) for cell in fields[:-1]])
        item.setLabel(Instance(0 if float(fields[-1]) < 0 else 1))
        return item

    with open(infile, "r") as dat:
        return [labelled(fields) for fields in csv.reader(dat)]
def initialize_instances():
    """Load the CSV rows of ``INPUT_FILE`` as binary-labelled instances.

    All columns but the last are numeric attributes; the label is 1 when the
    last column is exactly the string ``"crisis"`` and 0 otherwise.

    :return: one ``Instance`` per row.
    :rtype: list[Instance]
    """
    samples = []
    with open(INPUT_FILE, "r") as source:
        for fields in csv.reader(source):
            sample = Instance([float(cell) for cell in fields[:-1]])
            if fields[-1] == "crisis":
                sample.setLabel(Instance(1))
            else:
                sample.setLabel(Instance(0))
            samples.append(sample)
    return samples
def initialize_instances():
    """Load the CSV rows of ``INPUT_FILE`` as binary-labelled instances.

    All columns but the last are attributes; the label is 0 when the last
    column is below 15 and 1 otherwise.

    :return: one ``Instance`` per row.
    :rtype: list[Instance]
    """
    threshold = 15
    rows_read = []
    with open(INPUT_FILE, "r") as data_file:
        for line in csv.reader(data_file):
            raw_features, raw_target = line[:-1], line[-1]
            record = Instance([float(item) for item in raw_features])
            record.setLabel(Instance(0 if float(raw_target) < threshold else 1))
            rows_read.append(record)
    return rows_read
def initialize_instances():
    """Load the CSV rows of ``INPUT_FILE`` as binary-labelled instances.

    All columns but the last are attributes; the label is 0 when the last
    column equals zero and 1 otherwise.

    :return: one ``Instance`` per row.
    :rtype: list[Instance]
    """
    gathered = []
    with open(INPUT_FILE, "r") as gamma:
        for entry in csv.reader(gamma):
            point = Instance([float(number) for number in entry[:-1]])
            target = float(entry[-1])
            if target == 0:
                point.setLabel(Instance(0))
            else:
                point.setLabel(Instance(1))
            gathered.append(point)
    return gathered
def initialize_instances(examples):
    """Load a CSV file as binary-labelled instances.

    All columns but the last are numeric attributes; the label is 0 when the
    last column is exactly the string ``'g'`` and 1 otherwise.

    :param str examples: path of the CSV file.
    :return: one ``Instance`` per row.
    :rtype: list[Instance]
    """
    parsed = []
    with open(examples, "r") as source:
        for fields in csv.reader(source):
            observation = Instance([float(cell) for cell in fields[:-1]])
            class_id = 0 if fields[-1] == 'g' else 1
            observation.setLabel(Instance(class_id))
            parsed.append(observation)
    return parsed
def initialize_instances():
    """Load the CSV rows of ``INPUT_FILE`` as binary-labelled instances.

    The first column is skipped and the last column is the raw class code;
    the columns in between are the attributes.

    :return: one ``Instance`` per row; label 0 when the class code equals 2
        and 1 otherwise.
    :rtype: list[Instance]
    """
    cases = []
    with open(INPUT_FILE, "r") as source:
        for fields in csv.reader(source):
            measurements = [float(cell) for cell in fields[1:-1]]
            case = Instance(measurements)
            # Per the original note: code 2 is benign, 4 malignant.
            code = float(fields[-1])
            case.setLabel(Instance(0 if code == 2 else 1))
            cases.append(case)
    return cases
def initialize_instances(test=False):
    """Load ``TEST_FILE`` or ``TRAIN_FILE`` as labelled instances.

    Labelling depends on two module-level flags. With ``do_chess`` the label
    is the last column as a float. With ``do_fmnist`` it is a ten-element
    one-hot vector indexed by the integer last column. When both are set the
    one-hot label is applied last; when neither is set no label is attached.

    :param bool test: read ``TEST_FILE`` when true, ``TRAIN_FILE`` otherwise.
    :return: one ``Instance`` per row.
    :rtype: list[Instance]
    """
    source_path = TEST_FILE if test else TRAIN_FILE

    instances = []
    with open(source_path, "r") as chess:
        for fields in csv.reader(chess):
            sample = Instance([float(cell) for cell in fields[:-1]])

            if do_chess:
                sample.setLabel(Instance(float(fields[-1])))

            if do_fmnist:
                one_hot = [0] * 10
                one_hot[int(fields[-1])] = 1.0
                sample.setLabel(Instance(one_hot))

            instances.append(sample)
    return instances
def error_on_data_set(network, ds, measure, ugh=False):
    """Evaluate a single-output network; return ``(mean error, acc, f1)``.

    The first network output is clamped to [0, 1] and counted as correct when
    within 0.5 of the label's continuous value. Rounded predictions and
    clamped labels are handed to ``f1_score`` (defined elsewhere), whose
    result is unpacked as ``(precision, recall, f1)``.

    :param network: network exposing setInputValues/run/getOutputValues.
    :param ds: sized collection of instances.
    :param measure: error measure exposing ``value(label, example)``.
    :param bool ugh: debug switch; prints per-instance details and then
        terminates the process with ``sys.exit(0)`` instead of returning.
    :return: ``(MSE, acc, f1)``.
    """
    N = len(ds)
    total_error = 0.
    hits = 0
    misses = 0
    actuals = []
    predicteds = []

    for instance in ds:
        network.setInputValues(instance.getData())
        network.run()

        actual = instance.getLabel().getContinuous()
        predicted = max(min(network.getOutputValues().get(0), 1), 0)

        if ugh:
            print("label: {}".format(instance.getLabel()))
            print("actual: {}, predicted: {}".format(actual, predicted))

        predicteds.append(round(predicted))
        actuals.append(max(min(actual, 1), 0))

        if abs(predicted - actual) < 0.5:
            hits += 1
            if ugh:
                print("CORRECT")
        else:
            misses += 1
            if ugh:
                print("INCORRECT")

        outputs = network.getOutputValues()
        example = Instance(outputs, Instance(outputs.get(0)))
        sample_error = measure.value(instance.getLabel(), example)
        total_error += sample_error
        if ugh:
            print("error: {}".format(sample_error))

    MSE = total_error / float(N)
    acc = hits / float(hits + misses)
    precision, recall, f1 = f1_score(actuals, predicteds)

    if ugh:
        print("MSE: {}, acc: {}, f1: {} (precision: {}, recall: {})".format(
            MSE, acc, f1, precision, recall))
        import sys
        sys.exit(0)

    return MSE, acc, f1
def initialize_instances():
    """Load the CSV rows of ``INPUT_FILE`` as binary-labelled instances.

    All columns but the last are attributes; the label is 0 when the last
    column is below 0.5 and 1 otherwise.

    :return: one ``Instance`` per row.
    :rtype: list[Instance]
    """
    instances = []

    with open(INPUT_FILE, "r") as abalone:
        for row in csv.reader(abalone):
            # NOTE(review): a per-row ``INPUT_LAYER = len(row) - 1`` was
            # removed here. Without a ``global`` statement it only bound a
            # function-local name that nothing read, so it had no effect. If
            # the intent was to size a module-level INPUT_LAYER from the
            # data, that needs an explicit ``global`` or a return value.
            instance = Instance([float(value) for value in row[:-1]])
            instance.setLabel(Instance(0 if float(row[-1]) < 0.5 else 1))
            instances.append(instance)

    return instances
def initialize_instances(infile):
    """Load a CSV file as binary-labelled instances.

    All columns but the last are attributes; the label is 0 when the last
    column is below 0.001 and 1 otherwise.

    :param str infile: path of the CSV file.
    :return: one ``Instance`` per row.
    :rtype: list[Instance]
    """
    cutoff = 0.001
    output = []
    with open(infile, "r") as handle:
        for columns in csv.reader(handle):
            vector = [float(text) for text in columns[:-1]]
            datum = Instance(vector)
            if float(columns[-1]) < cutoff:
                datum.setLabel(Instance(0))
            else:
                datum.setLabel(Instance(1))
            output.append(datum)
    return output
def initialize_instances(file_path):
    """Load a CSV file as binary-labelled instances.

    All columns but the last are attributes; the label is 0 when the last
    column is below 1 and 1 otherwise. For targets that are already 0 or 1
    this leaves them unchanged.

    :param str file_path: path of the CSV file.
    :return: one ``Instance`` per row.
    :rtype: list[Instance]
    """
    people = []
    with open(file_path, "r") as adult:
        for fields in csv.reader(adult):
            numeric = [float(cell) for cell in fields[:-1]]
            target = float(fields[-1])
            person = Instance(numeric)
            person.setLabel(Instance(0 if target < 1 else 1))
            people.append(person)
    return people
def read_data_files(self):
    """Read a scikit-style data set from ``<dataset_name>.data`` / ``.target``.

    Both files hold whitespace-separated numbers, one sample per line. Line
    ``i`` of the target file is the label of line ``i`` of the data file.

    Sets ``self.n_samples``, ``self.n_features``, ``self.n_targets``,
    ``self.samples`` (a list of labelled ``Instance`` objects) and
    ``self.out_type`` (``"binary"`` for a single target column, otherwise
    ``"multiclass"``).

    :return: None
    :raises Exception: if either file is missing, the data file is empty, the
        files differ in length, or a line has a different number of values
        than the first line.
    :raises ValueError: if a value is not numeric.
    """
    data_file = self.dataset_name + ".data"
    target_file = self.dataset_name + ".target"

    # Check that BOTH files exist. The second check used to test data_file
    # again, so a missing target file surfaced later as a raw I/O error.
    if not os.path.isfile(data_file):
        raise Exception("Data file '" + data_file + "' not found")
    if not os.path.isfile(target_file):
        raise Exception("Target file '" + target_file + "' not found")

    if self.verbose:
        print("Loading data")
    with open(data_file, 'r') as d_in:
        data_lines = d_in.readlines()
    with open(target_file, 'r') as t_in:
        target_lines = t_in.readlines()

    # There must be exactly one target line per data line.
    self.n_samples = len(data_lines)
    if not self.n_samples == len(target_lines):
        raise Exception("Data and Target lengths are not the same.")
    if self.n_samples == 0:
        # Fail with a clear message instead of an IndexError further down.
        raise Exception("Data file '" + data_file + "' is empty")

    if self.verbose:
        print("Interpreting data")
    self.samples = []
    # The first line fixes the expected width of every later line.
    self.n_features = len(data_lines[0].split())
    self.n_targets = len(target_lines[0].split())

    for ss, (data_line, target_line) in enumerate(zip(data_lines, target_lines)):
        data = [float(val) for val in data_line.split()]
        target = [float(val) for val in target_line.split()]

        # Validate widths before building the instance.
        if not self.n_features == len(data):
            raise Exception("Line " + str(ss) + ": Number of data points does not match previous lines")
        if not self.n_targets == len(target):
            raise Exception("Line " + str(ss) + ": Number of targets does not match previous lines")

        inst = Instance(data)
        inst.setLabel(Instance(target))
        self.samples.append(inst)

    # A single target column means binary classification.
    if self.n_targets == 1:
        self.out_type = "binary"
    else:
        self.out_type = "multiclass"