def read_bills_data(train_dir, queried_idxs=None):
    data_sets = DataSets()
    train_x, train_y, train_ids, test_x, test_y, test_ids = load_pickled_data(train_dir)
    try:
        np.shape(train_y)[1]
    except IndexError:
        # labels are 1-D class ids; convert them to one-hot encodings
        train_y = dense_to_one_hot(train_y)
        test_y = dense_to_one_hot(test_y)
    if queried_idxs is not None:
        # split the training pool into queried (labeled) and unqueried documents
        all_train_idxs = np.arange(len(train_y))
        queried_docs = train_x[queried_idxs]
        queried_labels = train_y[queried_idxs]
        unqueried_idxs = np.setdiff1d(all_train_idxs, queried_idxs)
        remaining_docs = train_x[unqueried_idxs]
        remaining_labels = train_y[unqueried_idxs]
        data_sets.train = DataSet(queried_docs, queried_labels, queried_idxs)
        data_sets.unqueried = DataSet(remaining_docs, remaining_labels, unqueried_idxs)
    else:
        data_sets.train = DataSet(train_x, train_y, train_ids)
    data_sets.test = DataSet(test_x, test_y, test_ids)
    return data_sets
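# `dense_to_one_hot` is used above but not defined in this file; the following is a
# minimal numpy sketch of what such a helper conventionally does (an assumption for
# illustration, not this repository's implementation):
import numpy as np

def dense_to_one_hot_sketch(labels, num_classes=None):
    """Convert a 1-D array of integer class ids into a one-hot matrix."""
    labels = np.asarray(labels, dtype=int)
    if num_classes is None:
        num_classes = labels.max() + 1
    one_hot = np.zeros((labels.size, num_classes), dtype=np.float32)
    one_hot[np.arange(labels.size), labels] = 1.0
    return one_hot

# Example: dense_to_one_hot_sketch(np.array([0, 2, 1])) ->
# [[1, 0, 0], [0, 0, 1], [0, 1, 0]]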
def loadDataSet(self, save_path):
    '''Load a DataSet from save_path; the file stores input_data and output_data
    as alternating list literals, one per line.'''
    import os
    if os.path.exists(save_path):
        output = []
        num = 0
        with open(save_path, 'r') as file:
            for line in file:
                inputString = line[line.find("["):line.find("\n")]
                output.append(inputString)
                num += 1
                # print line
        # print output
        input_data = []
        output_data = []
        for i in range(num // 2):  # lines alternate: input vector, then output vector
            input_data.append(convertStringListToList(output[2 * i]))
            output_data.append(convertStringListToList(output[2 * i + 1]))
        input_num = len(input_data[0])
        output_num = len(output_data[0])
        data = DataSet(input_num, output_num)
        for i in range(len(input_data)):
            data.addItem(input_data[i], output_data[i])
        return data
    else:
        print(save_path + " does not exist")
def fit(self, text: Union[str, Iterable[str], Iterable[Data], pd.DataFrame],
        category: Union[str, Iterable[str]] = None) -> TextClassifier:
    '''learn probabilities for tokens extracted from the given text'''
    data = DataSet.FromAny(text, category)
    categories = []
    tokens = {}  # dict used as an ordered set of the vocabulary
    values = []
    for d in data:
        categories.append((d.category, d.score))
        for token in d.tokens:
            tokens[token] = 1
        values.append((d.table, d.score))
        self.total_documents += 1
    tokens = list(tokens)
    self.__add_category(categories)
    self.__add_token(tokens)
    # one binary indicator column per token, plus the document's category/score
    data_values = [[1 if t in v[0] else 0 for t in tokens] + [v[1]] for v in values]
    tokens.append(Data.CATEGORY_NAME)
    data_values = pd.DataFrame(data_values, columns=tokens)
    self.model.fit(data_values, Data.CATEGORY_NAME)
    return self
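# The fit() above assembles a binary bag-of-words table (one indicator column per
# token plus the category column) before handing it to self.model. Below is a
# standalone pandas sketch of that construction, with hypothetical tokens and a
# placeholder "__category__" column standing in for Data.CATEGORY_NAME:
import pandas as pd

documents = [({"cheap", "pills", "now"}, 1),
             ({"meeting", "agenda", "now"}, 0)]
vocabulary = sorted({token for tokens, _ in documents for token in tokens})

rows = [[1 if token in tokens else 0 for token in vocabulary] + [label]
        for tokens, label in documents]
features = pd.DataFrame(rows, columns=vocabulary + ["__category__"])
print(features)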
def load_unlabeled_corpus_111(train_dir):
    data_sets = DataSets()
    x, y, ids = load_pickled_corpus111_data(train_dir)
    try:
        np.shape(y)[1]
    except IndexError:
        # labels are 1-D class ids; convert them to one-hot encodings
        y = dense_to_one_hot(y.astype(int))
    data_sets.unlabeled = DataSet(x, y, ids)
    return data_sets
def read_data(base_dir, normalize=True, validation_size=5000, one_hot=False, flatten=True):
    check_data(base_dir)
    train_images = read_image(path.join(base_dir, TRAIN_IMAGES), flatten)
    test_images = read_image(path.join(base_dir, TEST_IMAGES), flatten)
    if normalize:
        train_images = train_images / 255.0
        test_images = test_images / 255.0
    train_labels = read_label(path.join(base_dir, TRAIN_LABELS), one_hot)
    validation = DataSet(images=train_images[:validation_size],
                         labels=train_labels[:validation_size])
    train = DataSet(images=train_images[validation_size:],
                    labels=train_labels[validation_size:])
    test_labels = read_label(path.join(base_dir, TEST_LABELS), one_hot)
    test = DataSet(images=test_images, labels=test_labels)
    return Datasets(train=train, validation=validation, test=test)
def load_unlabeled_bills(train_dir):
    data_sets = DataSets()
    train_x, train_y, train_ids, test_x, test_y, test_ids = load_pickled_data(train_dir)
    try:
        np.shape(train_y)[1]
    except IndexError:
        train_y = dense_to_one_hot(train_y.astype(int))
        test_y = dense_to_one_hot(test_y.astype(int))
    data_sets.unlabeled = DataSet(train_x, train_y, train_ids)
    # data_sets.unlabeled = DataSet(test_x, test_y, test_ids)
    """
    data_sets.train = DataSet(train_x, train_y, train_ids)
    data_sets.test = DataSet(test_x, test_y, test_ids)
    """
    return data_sets
def fit(self, text: Union[str, Iterable[str], Iterable[Data], pd.DataFrame],
        category: Union[str, Iterable[str]] = None) -> NaiveBayes:
    '''learn probabilities for tokens extracted from the given text'''
    data = DataSet.FromAny(text, category)
    for d in data:
        # ensure we have defined the category
        self.__add_category(d.category)
        # update our count of how many documents mapped to this category
        self.documents[d.category] += 1
        # update the total number of documents we have learned from
        self.total_documents += 1
        # update our vocabulary and our word frequency count for this category
        for token, frequency in d.table.items():
            # add this word to our vocabulary if not already existing
            self.__add_token(token)
            # update the frequency information for this word in this category
            if token not in self.word_frequency[d.category]:
                self.word_frequency[d.category][token] = frequency
            else:
                self.word_frequency[d.category][token] += frequency
            # update the count of all words we have seen mapped to this category
            self.word_count[d.category] += frequency
    return self
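# fit() above only accumulates counts (documents per category, token frequencies,
# total words per category). Below is a minimal standalone sketch of how such counts
# are typically turned into a prediction with Laplace smoothing; this is an assumed
# scoring step for illustration, not the repository's predict implementation:
import math

def predict_category(tokens, documents, total_documents, word_frequency, word_count, vocabulary_size):
    best_category, best_score = None, float("-inf")
    for category, doc_count in documents.items():
        # log prior: fraction of training documents seen for this category
        score = math.log(doc_count / total_documents)
        for token in tokens:
            frequency = word_frequency[category].get(token, 0)
            # add-one smoothed log likelihood of the token given the category
            score += math.log((frequency + 1) / (word_count[category] + vocabulary_size))
        if score > best_score:
            best_category, best_score = category, score
    return best_category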
def loadNeuronBuilder(self, save_path):
    '''
    input_layer=self.net.input_layer
    hidden_layers=self.net.hidden_layers
    output_layer=self.net.output_layer
    has_bias=self.net.has_bias
    layer_to_layer_weight=self.net.layer_to_layer_weight
    bias_to_layer=self.net.bias_to_neuron
    neuron_function=self.net.neuron_function
    neuron_dict=self.net.neuron_dict
    name_dict=self.net.name_dict
    input_data=self.net.input_data
    output_data=self.net.output_data
    '''
    import os
    if save_path.find('.') == -1:
        save_path += '.nnet'
    if os.path.exists(save_path):
        file = open(save_path, 'r')
        output = []
        for lines in file:
            output.append(lines)
        # print output
        dic = {}
        for line in output:
            # print line
            dic[line[0:line.find(':')]] = line[line.find(':') + 1:line.find('\\')]
        # print dic
        file.close()
        input_layer = convertStringListToList(dic['input_layer'])
        for i in range(len(input_layer)):
            input_layer[i] = int(input_layer[i])
        hidden_layers = convertStringListToList(dic['hidden_layers'])
        for i in range(len(hidden_layers)):
            hidden_layers[i] = int(hidden_layers[i])
        output_layer = convertStringListToList(dic['output_layer'])
        for i in range(len(output_layer)):
            output_layer[i] = int(output_layer[i])
        has_bias = dic['has_bias']
        if has_bias == 'True':
            has_bias = True
        elif has_bias == 'False':
            has_bias = False
        else:
            print("Mistake occurred when loading has_bias: " + has_bias)
        layer_to_layer_weight = convertStringDictToDict(dic['layer_to_layer_weight'])
        print(dic['layer_to_layer_weight'])
        print(layer_to_layer_weight)
        for key in layer_to_layer_weight.keys():
            layer_to_layer_weight[key] = convertStringListToList(layer_to_layer_weight[key])
        bias_to_layer = convertStringDictToDict(dic['bias_to_layer'])
        print(dic['bias_to_layer'])
        print(bias_to_layer)
        for key in bias_to_layer:
            bias_to_layer[key] = convertStringListToList(bias_to_layer[key])
        neuron_function = convertStringDictToDict(dic['neuron_function'])
        neuron_dict = convertStringDictToDict(dic['neuron_dict'])
        name_dict = convertStringDictToDict(dic['name_dict'])
        input_data = convertStringListToList(dic['input_data'])
        output_data = convertStringListToList(dic['output_data'])
        net = NeuronBuilder(input_layer, hidden_layers, output_layer, has_bias)
        net.layer_to_layer_weight = layer_to_layer_weight
        net.updateNeuronToNeuronWeightByLayerToLayerWeight()
        net.bias_to_layer = bias_to_layer
        net.updateBiasToNeuronByBiasToLayer()
        net.neuron_function = neuron_function
        net.neuron_dict = neuron_dict
        net.name_dict = name_dict
        data = DataSet(len(input_data[0]), len(output_data[0]))
        for i in range(len(input_data)):
            data.addItem(input_data[i], output_data[i])
        net.setDataSet(data)
        # print neural_network_type
        # print input_num
        # print output_num
        # print weight
        # print bias
        # print function_name
        return net
    else:
        print(save_path + " does not exist")
        return None
# -*- coding: utf-8 -*-
# allow the notebook to access the parent directory so we can import the other modules
# https://stackoverflow.com/a/35273613
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

dataset_folder_path = os.path.join("files", "dataset")

#%%
from data.DataSet import DataSet

dataset = DataSet()
dataset.load(dataset_folder_path, test_set_percentage=0.2, validation_set_percentage=0.3333)
print("Training Data Len:", len(dataset.train_data))
print("Validation Data Len:", len(dataset.valid_data))
print("Test Data Len:", len(dataset.test_data))

#%% Load Model
from keras.models import load_model

TRAINED_MODEL = os.path.join("files", "checkpoints", "1525696834.4091375",
                             "regularized_3x512_gru-30-0.97.hdf5")
model = load_model(TRAINED_MODEL)

#%%
net.setLayerFunction('out', 'purelin')
net.setNeuronFunction('hid0_2', 'hardlim')
print(net.neuron_function)
net.setInputToInputLayer([1, 0.6])
print(net.getOutputFromLayer('hid1'))
print(net.getOutputFromLayer('out'))
print(net.getOutputFromOneNeuron('out0'))
net.showNetworkSimulation()
'''
from neuron_build.neuron_build import NeuronBuilder

net = NeuronBuilder([2], [3], [2], has_bias=True)
net.connectTwoLayers('in', 'hid0')
net.connectTwoLayers('hid0', 'out')

from data.DataSet import DataSet

data = DataSet(2, 2)
data.addItem([1, 1], [-1, -1])
data.addItem([1, -1], [-1, 1])
data.addItem([-1, 1], [1, -1])
data.addItem([-1, -1], [1, 1])
net.setDataSet(data)
net.setLayerFunction('hid0', 'logsig')
net.setLayerFunction('out', 'purelin')
net.setInputToInputLayer([1, 1])
print(net.getOutputFromLayer('out'))
print(net.neuron_function)

from trainer.DeltaTrainer import DeltaTrainer

trainer = DeltaTrainer(net)
trainer.setMaxError(0.001)
trainer.DeltaTrain()
# trainer.plotFigure()
# -*- coding: utf-8 -*-
# allow the notebook to access the parent directory so we can import the other modules
# https://stackoverflow.com/a/35273613
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

dataset_folder_path = os.path.join("files", "dataset")

#%%
from data.DataSet import DataSet

dataset = DataSet()
dataset.load(dataset_folder_path, test_set_percentage=0.2, validation_set_percentage=0.3333)
print("Training Data Len:", len(dataset.train_data))
print("Validation Data Len:", len(dataset.valid_data))
print("Test Data Len:", len(dataset.test_data))

#%%
import numpy as np
from utils.preprocessing import *
from functools import partial
from utils.research_preprocessing import add_occlusions
from models.regularized_3x512_gru import Regularized3x512GRU
import os.path
import pickle
# Constants
PARAM_NUM_EPOCHS = 40
PARAM_BATCH_SIZE = 300
NUM_SAMPLES = 50

# Paths
dataset_folder_path = os.path.join("files", "dataset")

#%% Prepare Data
# Imports
from utils.preprocessing import *
from data.DataSet import DataSet
from functools import partial

# Preprocessing
dataset = DataSet()
dataset.load(dataset_folder_path, test_set_percentage=0.1, validation_set_percentage=0.1)
dataset.apply(apply_first_frame_centering)
dataset.apply(apply_unit_distance_normalization)
dataset.apply(partial(spline_interpolate_and_resample, num_samples=NUM_SAMPLES))

#%%
# Create generative dataset
from data.DataSetManipulator import DataSetManipulator

manip = DataSetManipulator(dataset, sequence_length=NUM_SAMPLES)
X_train, Y_train, X_valid, Y_valid, X_test, Y_test = manip.create_dataset_for_generative_models()

#%% Use only one digit
import numpy as np

# train only 0s
# https://stackoverflow.com/a/35273613
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

dataset_folder_path = os.path.join("files", "dataset")

#%%
from data.DataSet import DataSet

dataset = DataSet()
dataset.load(dataset_folder_path, test_set_percentage=0.2, validation_set_percentage=0.3333)
print("Training Data Len:", len(dataset.train_data))
print("Validation Data Len:", len(dataset.valid_data))
print("Test Data Len:", len(dataset.test_data))

#%%
NUM_SAMPLES = 50
ANGLES_TO_ROTATE = [5, 10, 15, 45, -5, -10, -15, -45]

from utils.preprocessing import *
from functools import partial
# encoding=utf-8
import numpy as np
# import algo.knn.knn
from data.DataSet import DataSet
from data.DigitDataSet import DigitDataSet

dataset = DataSet(file_path="gaofeng_file")
dataset2 = DataSet(folder_path="gaofeng_folder")
digit_dataset = DigitDataSet(file_path="gaofeng_file")
digit_dataset2 = DigitDataSet(folder_path="gaofeng_folder")
from tqdm import tqdm
from models.NaiveBayes import NaiveBayes
from models.PGMNaiveBayes import PGMNaiveBayes
from data.DataSet import DataSet
from functools import partial
from time import time

print('Collecting data (can take minutes)...')
with tqdm() as bar:
    def update_bar(data, index, total):
        if total is not None:
            bar.total = total
        bar.update()

    data = DataSet.FromJSON('./dataset/data.json', './dataset/keywords.json',
                            on_generate=update_bar)

direct_data = data[lambda x: not not x.score]


def train(Model, name, path, ds):
    print('Training Bayes Network (can take several minutes): {} - {}'.format(
        Model.__name__, name))
    model = Model()
    start = time()
    model.fit(ds)
    stop = time()
    print('Trained in {:.3f}s'.format(stop - start))
    print('Saving Bayes Network: {} - {} @ {}'.format(Model.__name__, name, path))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import utils.plot as plot
from utils.preprocessing import *
from data.DigitSet import DigitSet
from data.DataSet import DataSet
import os

#%%
folder = os.path.join("files", "dataset")
dataset = DataSet(folder)
dataset.apply(apply_mean_centering)
dataset.apply(apply_unit_distance_normalization)
dataset.apply(lambda x: normalize_pressure_value(x, 512))

#%%
filename = os.path.join(folder, "10.43_23.03.2018_digitset.json")
digitset = DigitSet(filename)
scaled = digitset.copy()
# Apply transformations
scaled.apply(apply_mean_centering)
scaled.apply(apply_unit_distance_normalization)
scaled.apply(lambda x: normalize_pressure_value(x, 512))
if scaled.time_is_dt():
    scaled.convert_dt_to_t()

#%%
digit, label = digitset[6]
plot.show_digit(digit, label=label, show_lines=True, show_points=True,
#%%
# Constants
PARAM_NUM_EPOCHS = 15
PARAM_BATCH_SIZE = 300
NUM_SAMPLES = 50

# Paths
dataset_folder_path = os.path.join("files", "dataset")

#%% Prepare Data
# Imports
from utils.preprocessing import *
from data.DataSet import DataSet
from functools import partial
import numpy as np

dataset = DataSet()
dataset.load(dataset_folder_path, test_set_percentage=0.333, validation_set_percentage=0)
dataset.apply(apply_mean_centering)
dataset.apply(apply_unit_distance_normalization)
# dataset.apply(partial(normalize_pressure_value, max_pressure_val=512))
dataset.apply(partial(spline_interpolate_and_resample, num_samples=NUM_SAMPLES))
dataset.expand_many(partial(rotate_digit, degrees=[5, 10, 15, 45, -5, -10, -15, -45]))
dataset.expand(reverse_digit_sequence)
# dataset.apply(lambda digit: convert_xy_to_derivative(digit, normalize=False))
# dataset.apply(partial(convert_xy_to_derivative, normalize=True))

#%% Split Train, Valid, Test
# Imports
import numpy as np
from sklearn.model_selection import train_test_split
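# rotate_digit is used above as an augmentation step but is defined elsewhere in the
# repository; below is a standalone numpy sketch of the underlying idea, rotating an
# (N, 2) sequence of x/y samples about the origin (illustrative only, not the actual
# rotate_digit implementation):
import numpy as np

def rotate_points(points, degrees):
    """Rotate an (N, 2) array of x/y coordinates counter-clockwise by `degrees`."""
    theta = np.radians(degrees)
    rotation = np.array([[np.cos(theta), -np.sin(theta)],
                         [np.sin(theta),  np.cos(theta)]])
    return points @ rotation.T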
# -*- coding: utf-8 -*-
# allow the notebook to access the parent directory so we can import the other modules
# https://stackoverflow.com/a/35273613
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

dataset_folder_path = os.path.join("files", "dataset")

#%%
from data.DataSet import DataSet

dataset = DataSet()
dataset.load(dataset_folder_path, test_set_percentage=0.2, validation_set_percentage=0.3333)
print("Training Data Len:", len(dataset.train_data))
print("Validation Data Len:", len(dataset.valid_data))
print("Test Data Len:", len(dataset.test_data))

#%%
import matplotlib.pyplot as plt
import numpy as np

DIGIT_IDX = 1627
# DIGIT_IDX = 326
digit = dataset.train_data[DIGIT_IDX]

#%% create original digit images
padding = 10