def getTextFileData(x, caseFraction, noClases):
    """Read the leading fraction of a delimited data file as [features, one-hot] cases.

    The file ``'./Data sets/' + x`` is read twice: once to count its lines and
    once to parse at most ``int(line_count * caseFraction)`` rows.  In every
    row the last column is the integer class and the remaining columns are
    float features.  ``winequality_red.txt`` is ';'-separated and has 3
    subtracted from its class value; any other file is ','-separated and its
    class value is used unchanged.

    NOTE(review): ``features``, ``classes`` and ``dataStructured`` are not
    assigned in this function, so they resolve to module-level names and
    accumulate across calls -- confirm this is intended.

    Args:
        x: File name inside ``./Data sets/``.
        caseFraction: Fraction of the file's lines to load.
        noClases: Width of the one-hot class vectors.

    Returns:
        ``dataStructured`` after a ``[scaled_features, one_hot_class]`` pair
        has been appended for every row of ``preprocessing.scale(features)``.
    """
    path = './Data sets/' + x

    # First pass: count lines so the fraction can be turned into a row budget.
    with open(path, newline='') as inputfile:
        no_of_lines = sum(1 for _ in inputfile)
    row_budget = int(no_of_lines * caseFraction)

    # Only the red-wine file deviates from the plain comma-separated layout.
    is_red_wine = x == 'winequality_red.txt'
    delimiter = ';' if is_red_wine else ','
    class_offset = 3 if is_red_wine else 0

    # Second pass: parse rows until the budget is used up.
    with open(path, newline='') as inputfile:
        reader = csv.reader(inputfile, delimiter=delimiter)
        for rows_seen, row in enumerate(reader):
            if rows_seen == row_budget:
                break
            feature_data = [float(value) for value in row[:-1]]
            feature_class_value = int(row[-1]) - class_offset
            features.append(feature_data)
            classes.append(TFT.int_to_one_hot(feature_class_value, noClases))

    scaled = preprocessing.scale(features)
    for index, scaled_row in enumerate(scaled):
        dataStructured.append([scaled_row, classes[index]])
    return dataStructured
def read_file(filename):
    """Parse the wine, yeast or glass text file into [features, one-hot] cases.

    The data set is identified by the second '/'-separated component of
    ``filename`` (e.g. ``"data/wine.txt"``).  Every line holds the feature
    columns followed by an integer class label in the last column; the label
    is shifted down by one before being one-hot encoded.

    Args:
        filename: Path whose second '/'-separated component is ``wine.txt``,
            ``yeast.txt`` or ``glass.txt``.

    Returns:
        A list of ``[feature_floats, one_hot_label]`` pairs.  The list is
        empty when the file name is not one of the three known data sets.

    Raises:
        IndexError: If ``filename`` contains no '/'.
    """
    # file name -> (column delimiter, number of feature columns, one-hot width)
    formats = {
        "wine.txt": (';', 11, 8),
        "yeast.txt": (',', 8, 10),
        "glass.txt": (',', 9, 6),
    }
    cases = []
    # The context manager closes the file even when a line fails to parse.
    with open(filename, 'r') as file_obj:
        dataset = filename.split('/')[1]
        if dataset not in formats:
            return cases
        delimiter, feature_count, class_count = formats[dataset]
        for line in file_obj:
            if not line.strip():
                # Blank (typically trailing) lines carry no case.
                continue
            line_vec = line.split(delimiter)
            input_vec = line_vec[:feature_count]
            label = int(line_vec[-1]) - 1
            # Glass labels skip one class: after the shift above, raw classes
            # 5, 6, 7 are 4, 5, 6 and must move down once more to fill indices
            # 3, 4, 5.  (The previous ``label > 4`` test mapped raw 5 and raw
            # 6 to the same index.)  Assumes raw class 4 never occurs, as in
            # the UCI glass data -- TODO confirm for this copy of the file.
            if dataset == "glass.txt" and label > 3:
                label -= 1
            cases.append(
                [list(map(float, input_vec)),
                 tft.int_to_one_hot(label, class_count)])
    return cases
def mnist(parameters, loss_function):
    """Build [input, one-hot target] cases from MNIST images.

    Args:
        parameters: Iterable of strings.  ``"testing"`` selects the testing
            split (training is the default); every all-digit string adds that
            digit to the set of digits requested from ``mb.load_mnist``.
        loss_function: Not used by this function.

    Returns:
        A list of ``[flattened_image / la.norm(image), one_hot_label]`` pairs
        with one-hot vectors of width 10.
    """
    output_size = 10
    split_name = "training"
    wanted_digits = []
    for token in parameters:
        if token == "testing":
            split_name = "testing"
        if token.isdigit():
            wanted_digits.append(int(token))

    if wanted_digits:
        # Only the explicitly requested digits, e.g. [1, 4, 6].
        images, labels = mb.load_mnist(dataset=split_name, digits=wanted_digits)
    else:
        images, labels = mb.load_mnist(dataset=split_name)

    # Each case pairs the flattened image, divided by the image's norm, with
    # the one-hot encoding of the first entry of its label.
    cases = []
    for image, label in zip(images, labels):
        scaled_pixels = mb.flatten_image(image) / la.norm(image)
        target = TFT.int_to_one_hot(int(label[0]), output_size)
        cases.append([scaled_pixels, target])

    print("Total cases collected: ", len(cases))
    return cases
def mnist(self):
    """Return MNIST as a list of [flat_case, one_hot_label] pairs.

    The data comes from ``MNIST.load_mnist()`` flattened by
    ``MNIST.gen_flat_cases``; element 0 of the flattened result is used as the
    inputs and element 1 as the labels, encoded as width-10 one-hot vectors.
    """
    loaded = MNIST.load_mnist()
    flat_set = MNIST.gen_flat_cases(cases=loaded)
    return [
        [flat_case, TFT.int_to_one_hot(flat_set[1][position], 10)]
        for position, flat_case in enumerate(flat_set[0])
    ]
def dataset_loader(filename, loss_function):
    """Load a ';'- or ','-separated data file into [features, one-hot] cases.

    Every non-blank line holds float feature columns followed by an integer
    label in the last column.  Lines are split on ';' until a line without a
    ';' is seen; from that line on ',' is used.

    Args:
        filename: Path of the data file.
        loss_function: Not used by this function.

    Returns:
        A list of ``[normalized_features, one_hot_label]`` pairs, where the
        features went through ``normalize_features`` and the labels through
        ``normalize_labels`` before one-hot encoding.

    Raises:
        IndexError: If the file contains no data lines.
        ValueError: If a field cannot be parsed as a number.
    """
    print(filename)
    feature_vectors = []
    labels = []
    splitter = ";"
    with open(filename, "r") as file:
        for line in file:
            # Lines read from a file keep their newline, so a length check
            # never filters blank lines; strip first and skip what is empty.
            line = line.strip()
            if not line:
                continue
            if splitter not in line:
                splitter = ","
            *raw_features, raw_label = line.split(splitter)
            labels.append(int(raw_label))
            feature_vectors.append([float(value) for value in raw_features])

    print("Nof features: ", len(feature_vectors[0]))
    print("Nof examples: ", len(feature_vectors))
    print(max(labels), min(labels))

    # One-hot labels; the width is computed once instead of once per label.
    normalized_labels = normalize_labels(labels)
    class_count = max(normalized_labels) + 1
    one_hot_labels = [
        TFT.int_to_one_hot(label, class_count) for label in normalized_labels
    ]

    normalized_feature_vectors = normalize_features(feature_vectors)

    return [
        [features, target]
        for features, target in zip(normalized_feature_vectors, one_hot_labels)
    ]
def read_file(filename):
    """Parse a comma-separated file of 4 features plus a class name per line.

    The first four columns are converted to floats; the last column is
    stripped of trailing whitespace, mapped to an integer by ``flower_to_int``
    and one-hot encoded with width 3.

    Args:
        filename: Path of the data file.

    Returns:
        A list of ``[feature_floats, one_hot_label]`` pairs.
    """
    cases = []
    # The context manager closes the file even when a line fails to parse.
    with open(filename, 'r') as file_obj:
        for line in file_obj:
            if not line.strip():
                # Blank (typically trailing) lines would otherwise crash the
                # float conversion.
                continue
            line_vec = line.split(',')
            input_vec = line_vec[:4]
            label = flower_to_int(str(line_vec[-1]).rstrip())
            cases.append(
                [list(map(float, input_vec)), tft.int_to_one_hot(label, 3)])
    return cases
def read_glass(self, text_file="dataset/glass.txt"):
    """Append the glass data set to ``self.cases``.

    The file is parsed as comma-separated numbers.  Columns 0-8 are the
    features, passed through ``self.normalize``; column 9 is the class label,
    which is shifted down by one and one-hot encoded with width 7.

    Args:
        text_file: Path of the comma-separated glass data file.
    """
    # Close the file as soon as it has been parsed, also on failure.
    with open(text_file, "r") as file_object:
        table = np.genfromtxt(file_object, delimiter=",")

    features = self.normalize(table[:, :9]).tolist()
    label_column = table[:, 9:].tolist()
    for row_index, feature_row in enumerate(features):
        label = int(label_column[row_index][0]) - 1
        self.cases.append([feature_row, TFT.int_to_one_hot(label, 7)])
def read_wine(self, text_file="dataset/winequality_red.txt"):
    """Append the red-wine quality data set to ``self.cases``.

    The file is parsed as ';'-separated numbers.  Columns 0-10 are the
    features, passed through ``self.normalize``; column 11 is the class label,
    which is shifted down by three and one-hot encoded with width 6.

    Args:
        text_file: Path of the ';'-separated wine data file.
    """
    # Close the file as soon as it has been parsed, also on failure.
    with open(text_file, "r") as file_object:
        table = np.genfromtxt(file_object, delimiter=";")

    features = self.normalize(table[:, :11]).tolist()
    label_column = table[:, 11:].tolist()
    for row_index, feature_row in enumerate(features):
        label = int(label_column[row_index][0]) - 3
        self.cases.append([feature_row, TFT.int_to_one_hot(label, 6)])
def load_glass_dataset():
    """Load ``data/glass.txt`` as a list of [features, one-hot target] cases.

    Targets lie between 1 and 7 with no examples of class 4, so every target
    of 5 or more is lowered by one to make the classes contiguous; the result
    is then shifted to start at zero and one-hot encoded with width 6.

    Returns:
        A list of ``[row[:9], one_hot_target]`` pairs, one per file row.
    """
    data = np.loadtxt('data/glass.txt', delimiter=',')

    # Rows are views into ``data``, so the adjustment is made in place.
    for row in data:
        if row[-1] >= 5:
            row[-1] -= 1

    cases = []
    for row in data:
        target = TFT.int_to_one_hot(int(row[9]) - 1, 6)
        cases.append([row[:9], target])
    return cases
def generate_cases(self):
    """Run the configured case generator and store its output in ``self.cases``.

    The generator is looked up in ``_generator`` by
    ``self.casefunc["data source"]`` and called with
    ``self.casefunc["parameters"]`` unpacked as positional arguments.  For the
    ``"symmetry"`` source every generated vector is then split into
    ``[vector[:-1], one_hot(last element, 2)]``.
    """
    source_name = self.casefunc["data source"]
    generator = _generator[source_name]
    # A case is [input-vector, target-vector].
    self.cases = generator(*(self.casefunc["parameters"]))

    if source_name != "symmetry":
        return

    for position, vector in enumerate(self.cases):
        label = int(vector[-1:][0])
        self.cases[position] = [vector[:-1], TFT.int_to_one_hot(label, 2)]
def load_flat_text_cases(filename, cfraction, ):
    """Load the leading fraction of a flat text case file.

    The file is space-separated integers: the first line holds one label per
    case and each following line holds the values of one case.  Only the first
    ``ceil(cfraction * line_count)`` lines (label line included) are used.
    Case values are divided by 255 and labels are one-hot encoded with
    width 10.

    Args:
        filename: Path of the flat text file.
        cfraction: Fraction of the file's lines to load.

    Returns:
        A list of ``[scaled_values, one_hot_label]`` pairs; empty when the
        selected fraction contains no lines.
    """
    # Blank lines (e.g. from a trailing newline) are dropped so they cannot
    # break the integer conversion below.
    with open(filename, "r") as f:
        lines = [line.split(" ") for line in f.read().split("\n")
                 if line.strip()]

    len_lines = float(len(lines))
    fraction = int(np.ceil(cfraction * len_lines))
    new_lines = lines[:fraction]
    if not new_lines:
        return []

    # The label line counts towards ``fraction``, so at most ``fraction - 1``
    # cases (and labels) are kept.
    x_l = list(map(int, new_lines[0]))[:(fraction - 1)]  # targets
    x_t = [list(map(int, line)) for line in new_lines[1:]]  # inputs
    x_t = np.divide(x_t, 255)
    x_l = [TFT.int_to_one_hot(i, 10) for i in x_l]
    return [list(pair) for pair in zip(x_t, x_l)]
def load_flat_text_cases(
        filename,
        dir=__mnist_path__,
):
    """Load a flat text case file as [scaled_values, one_hot_label] pairs.

    The file is space-separated integers: the first line holds one label per
    case and each following line holds the values of one case.  Case values
    are divided by 255 and labels are one-hot encoded with width 10.  The
    number of loaded cases is printed.

    Args:
        filename: File name, appended to ``dir``.
        dir: Directory prefix, concatenated directly in front of ``filename``.

    Returns:
        A list of ``[scaled_values, one_hot_label]`` pairs.

    Raises:
        ValueError: If the file has no non-blank lines or a field is not an
            integer.
    """
    # Blank lines (e.g. from a trailing newline) are dropped so they cannot
    # produce a ragged array below.
    with open(dir + filename, "r") as f:
        lines = [line.split(" ") for line in f.read().split("\n")
                 if line.strip()]
    if not lines:
        raise ValueError("no cases found in " + dir + filename)

    targets = [TFT.int_to_one_hot(int(fv), 10) for fv in lines[0]]
    inputs = numpy.array(lines[1:]).astype(int)
    inputs = inputs / 255
    print(len(inputs))
    return [[case_input, target] for case_input, target in zip(inputs, targets)]
def load_mnist(fraction=0.1):
    """Return a random sample of the first MNIST split as [pixels, one-hot] cases.

    The data is loaded with ``tf.keras.datasets.mnist.load_data`` and only its
    first element (images at index 0, labels at index 1) is used.
    ``int(fraction * n)`` distinct examples are drawn with
    ``np.random.choice``; every image is flattened and every label one-hot
    encoded with width 10.

    Args:
        fraction: Fraction of the examples to sample.

    Returns:
        A list of ``[flattened_image, one_hot_label]`` pairs.
    """
    mnist = tf.keras.datasets.mnist.load_data(path='mnist.npz')
    data_length = len(mnist[0][1])
    sample_size = int(fraction * data_length)

    # Sampling without replacement keeps the chosen indices distinct.
    reduced_indices = np.random.choice(
        list(range(data_length)), sample_size, replace=False)
    images = mnist[0][0][reduced_indices]
    targets = mnist[0][1][reduced_indices]

    flat_images = [image.flatten() for image in images]
    output = []
    for position, target in enumerate(targets):
        output.append([flat_images[position], TFT.int_to_one_hot(target, 10)])
    return output
def readMineFile(self, filename):
    """Read a comma-separated file whose last column is the class letter R or M.

    Every non-blank line becomes ``[inputs, target]``: the last column is
    mapped to its index in ``["R", "M"]`` and one-hot encoded (width 2,
    ``floats=True``); all remaining columns are converted to floats.

    Args:
        filename: File name, appended to ``self.path``.

    Returns:
        The list of ``[inputs, target]`` cases.  (Previously the list was
        built and then discarded, so callers received ``None``.)

    Raises:
        ValueError: If the last column is neither ``R`` nor ``M`` or a feature
            is not a number.
    """
    onehots = ["R", "M"]
    # Read everything up front so the file is closed before parsing starts.
    with open(self.path + filename) as source_file:
        lines = [line.rstrip('\n') for line in source_file]

    cases = []
    for line in lines:
        if not line.strip():
            # Blank (typically trailing) lines carry no case.
            continue
        vals = line.split(",")
        target = onehots.index(vals.pop())
        target = TFT.int_to_one_hot(target, 2, floats=True)
        inp = [float(val) for val in vals]
        cases.append([inp, target])
    return cases
def readfile(self, filename, numClasses, custom_buckets, normalize=False):
    """Read a ','- or ';'-separated file whose last column is an integer class.

    Every non-blank line becomes ``[inputs, target]``: the last column is
    parsed as an integer and, when ``custom_buckets`` is given, replaced by
    its index in ``custom_buckets``; the result is one-hot encoded with width
    ``numClasses`` (``floats=True``).  All remaining columns are converted to
    floats.

    Args:
        filename: File name, appended to ``self.path``.
        numClasses: Width of the one-hot target vectors.
        custom_buckets: Optional sequence of raw class values; a class is
            encoded as its position in this sequence.  ``None`` uses the raw
            value directly.
        normalize: Not used by this function.
            NOTE(review): no normalisation step is present -- confirm whether
            one was intended.

    Returns:
        The list of ``[inputs, target]`` cases.  (Previously the list was
        built and then discarded, so callers received ``None``.)

    Raises:
        ValueError: If a field is not a number or a class is missing from
            ``custom_buckets``.
    """
    # Read everything up front so the file is closed before parsing starts.
    with open(self.path + filename) as source_file:
        lines = [line.rstrip('\n') for line in source_file]

    cases = []
    for line in lines:
        if not line.strip():
            # Blank (typically trailing) lines carry no case.
            continue
        # Treat ';' and ',' as the same separator.
        vals = line.replace(";", ",").split(",")
        raw_class = int(vals.pop())
        if custom_buckets is not None:
            target = custom_buckets.index(raw_class)
        else:
            target = raw_class
        target = TFT.int_to_one_hot(target, numClasses, floats=True)
        inp = [float(val) for val in vals]
        cases.append([inp, target])
    return cases
def load_generic_file(filename, cfraction, hot_length):
    """Load a random fraction of a ','- or ';'-separated data file.

    Every line holds float features followed by an integer class in the last
    column.  All lines are parsed, the resulting cases are shuffled, and the
    first ``ceil(cfraction * line_count)`` of them are returned.

    Args:
        filename: Path of the data file.
        cfraction: Fraction of the file's lines to return.
        hot_length: Width of the one-hot target vectors.

    Returns:
        A list of ``[input_vector, one_hot_target]`` pairs.
    """
    with open(filename, 'r') as infile:
        lines = infile.readlines()

    fraction = int(np.ceil(cfraction * len(lines)))

    output_list = []
    for raw_line in lines:
        # Treat ';' and ',' as the same separator.
        fields = raw_line.replace(';', ',').strip().split(',')
        *feature_fields, class_field = fields
        input_vector = list(map(float, feature_fields))
        hot_target = TFT.int_to_one_hot(int(class_field), hot_length)
        output_list.append([input_vector, hot_target])

    # Shuffle before slicing so the returned part covers the whole range.
    shuffle(output_list)
    return output_list[:fraction]
def readDOTAfile(self, filename, onehot=False):
    """Read a comma-separated file whose first column is the target.

    Every non-blank line becomes ``[inputs, target]``: the first column is the
    target and all remaining columns are float inputs.

    Args:
        filename: File name, appended to ``self.path``.
        onehot: When true, the target must be ``-1.0`` or ``1.0`` and is
            replaced by the one-hot encoding (width 2, ``floats=True``) of its
            index in ``[-1.0, 1.0]``.  Otherwise the target stays a float.

    Returns:
        The list of ``[inputs, target]`` cases.

    Raises:
        ValueError: If a field is not a number, or ``onehot`` is set and the
            target is neither ``-1.0`` nor ``1.0``.
    """
    b = [-1.0, 1.0]
    # Read everything up front so the file is closed before parsing starts.
    with open(self.path + filename) as source_file:
        lines = [line.rstrip('\n') for line in source_file]

    cases = []
    for line in lines:
        if not line.strip():
            # Blank (typically trailing) lines carry no case.
            continue
        vals = line.split(",")
        target = float(vals.pop(0))
        if onehot:
            target = TFT.int_to_one_hot(b.index(target), len(b), floats=True)
        inp = [float(val) for val in vals]
        cases.append([inp, target])
    return cases
def read_mnist(self):
    """Append all flat MNIST cases to ``self.cases``.

    Cases and labels come from ``mnists.load_all_flat_cases()``.  The cases
    are converted to an array and divided by 255; each label is one-hot
    encoded with width 10.
    """
    raw_cases, label = mnists.load_all_flat_cases()
    scaled_cases = np.array(raw_cases) / 255
    for position, pixels in enumerate(scaled_cases):
        target = TFT.int_to_one_hot(int(label[position]), 10)
        self.cases.append([pixels, target])
def make_one_hot(labels, noClases):
    """Return the one-hot encoding (width ``noClases``) of every label, in order."""
    return [TFT.int_to_one_hot(label, noClases) for label in labels]
def source(self):
    """Build the data set selected by ``self.args.source``.

    Supported sources:

    * ``<name>.txt`` -- a file in ``data_set_files/`` with ';'- or
      ','-separated numeric columns.  The last column is the class; it is
      shifted down by one and one-hot encoded with a width equal to the
      largest class value in the file.  A ``?`` field is read as 0.  The
      inputs are min-max normalised per column.
    * ``parity``, ``symmetry``, ``auto_onehot``, ``auto_dense``,
      ``bitcounter``, ``segmentcounter`` -- generated by the matching ``TFT``
      generator, called with the leading entries of ``self.args.sourceinit``
      or with built-in defaults when ``sourceinit`` is ``None``.
    * ``mnist`` -- the flat training cases from ``mnist_basics``, pixel values
      divided by 255 and labels one-hot encoded with width 10.

    Side effects: sets ``self.source_is_called`` and prints the source name.
    When the resulting data set is empty, an explanation is printed and the
    program exits via ``quit()``.

    Returns:
        The data set as a sequence of ``(input, target)`` cases.
    """

    def normalize(cases):
        """Min-max scale every input column to [0, 1]; targets pass through."""
        inputs = numpy.array([case[0] for case in cases], dtype=float)
        targets = [case[1] for case in cases]
        min_arr = numpy.min(inputs, axis=0)
        span = numpy.max(inputs, axis=0) - min_arr
        # A constant column has zero range; dividing by it would fill the
        # column with NaN.  Dividing by 1 instead maps the column to 0.
        span[span == 0] = 1
        inputs = (inputs - min_arr) / span
        return list(zip(inputs, targets))

    def to_float(field):
        """Parse one field; a '?' (missing value) is read as 0."""
        return 0 if field == '?' else float(field)

    def generator_args(defaults):
        """Return the leading ``sourceinit`` entries, or ``defaults`` if unset."""
        init = self.args.sourceinit
        if init is None:
            return defaults
        return tuple(init[i] for i in range(len(defaults)))

    # source name -> (TFT generator name, default arguments)
    generated_sources = {
        "parity": ("gen_all_parity_cases", (10,)),
        "auto_onehot": ("gen_all_one_hot_cases", (64,)),
        "auto_dense": ("gen_dense_autoencoder_cases", (2000, 100)),
        "bitcounter": ("gen_vector_count_cases", (500, 15)),
        "segmentcounter": ("gen_segmented_vector_cases", (25, 1000, 0, 8)),
    }

    self.source_is_called = True
    source_name = self.args.source
    print("source:", source_name)
    is_text_file = source_name[-4:] == ".txt"

    data_set = []
    if is_text_file:
        with open("data_set_files/" + source_name) as file:
            # Blank lines are skipped; they would otherwise fail to parse.
            rows = [
                [to_float(field) for field in re.split("[;,]", line.strip())]
                for line in file
                if line.strip()
            ]
        max_d = max(int(row[-1]) for row in rows)
        for row in rows:
            target = TFT.int_to_one_hot(int(row[-1]) - 1, max_d)
            data_set.append([row[:-1], target])
    elif source_name in generated_sources:
        generator_name, defaults = generated_sources[source_name]
        data_set = getattr(TFT, generator_name)(*generator_args(defaults))
    elif source_name == "symmetry":
        vecs = TFT.gen_symvect_dataset(*generator_args((101, 2000)))
        # The last element of every vector is its class.
        data_set = [
            (vec[:-1], TFT.int_to_one_hot(vec[-1], 2)) for vec in vecs
        ]
    elif source_name == "mnist":
        cases = mnist_basics.load_all_flat_cases(type='training')
        images = [[pixel / 255 for pixel in image] for image in cases[0]]
        targets = [TFT.int_to_one_hot(label, 10) for label in cases[1]]
        data_set = list(zip(images, targets))

    if data_set == []:
        print(source_name, " is illegal for argument --source")
        print("Legal values are: <filename>.txt, parity, symmetry, "
              "auto_onehot, auto_dense, bitcounter, segmentcounter, mnist")
        quit()

    if is_text_file:
        data_set = normalize(data_set)
    return data_set
def labels_to_one_hot(labels):
    """Flatten ``labels`` (an iterable of iterables of numbers) into one-hot vectors.

    Returns:
        One width-10 one-hot vector per number, in iteration order.
    """
    return [
        tools.int_to_one_hot(number, 10)
        for label in labels
        for number in label
    ]
def load_wine_dataset():
    """Load ``data/winequality_red.txt`` as [features, one-hot target] cases.

    Targets lie between 3 and 8, so they are shifted down by three before
    being one-hot encoded with width 6.

    Returns:
        A list of ``[row[:11], one_hot_target]`` pairs, one per file row.
    """
    data = np.loadtxt('data/winequality_red.txt', delimiter=';')
    cases = []
    for row in data:
        target = TFT.int_to_one_hot(int(row[11]) - 3, 6)
        cases.append([row[:11], target])
    return cases
def load_yeast_dataset():
    """Load ``data/yeast.txt`` as [features, one-hot target] cases.

    Targets lie between 1 and 10, so they are shifted down by one before
    being one-hot encoded with width 10.

    Returns:
        A list of ``[row[:8], one_hot_target]`` pairs, one per file row.
    """
    data = np.loadtxt('data/yeast.txt', delimiter=',')
    cases = []
    for row in data:
        target = TFT.int_to_one_hot(int(row[8]) - 1, 10)
        cases.append([row[:8], target])
    return cases