def read_bills_data(train_dir, queried_idxs=None):
    data_sets = DataSets()
    train_x, train_y, train_ids, test_x, test_y, test_ids = load_pickled_data(
        train_dir)
    try:
        # labels are already one-hot if they are 2-D
        np.shape(train_y)[1]
    except IndexError:
        train_y = dense_to_one_hot(train_y)
        test_y = dense_to_one_hot(test_y)
    if queried_idxs is not None:
        all_train_idxs = np.arange(len(train_y))
        queried_docs = train_x[queried_idxs]
        queried_labels = train_y[queried_idxs]
        unqueried_idxs = np.setdiff1d(all_train_idxs, queried_idxs)

        remaining_docs = train_x[unqueried_idxs]
        remaining_labels = train_y[unqueried_idxs]

        data_sets.train = DataSet(queried_docs, queried_labels, queried_idxs)
        data_sets.unqueried = DataSet(remaining_docs, remaining_labels,
                                      unqueried_idxs)
    else:
        data_sets.train = DataSet(train_x, train_y, train_ids)
    data_sets.test = DataSet(test_x, test_y, test_ids)

    return data_sets
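Several of the loaders in this section call a dense_to_one_hot helper that is not shown. A minimal NumPy sketch of such a conversion, assuming integer class labels (the project's own helper may differ):

import numpy as np

def dense_to_one_hot(labels_dense, num_classes=None):
    # Sketch only: map a 1-D array of integer class ids to a 2-D one-hot matrix.
    labels_dense = np.asarray(labels_dense, dtype=int)
    if num_classes is None:
        num_classes = labels_dense.max() + 1
    one_hot = np.zeros((labels_dense.shape[0], num_classes))
    one_hot[np.arange(labels_dense.shape[0]), labels_dense] = 1.0
    return one_hot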
Code Example #2
File: save_load_tools.py  Project: shd101wyy/PyNeuron
def loadDataSet(self, save_path):
    '''
    Load a DataSet previously written to save_path: each example is stored
    as two consecutive lines, a serialized input list followed by a
    serialized output list.
    '''
    import os
    if os.path.exists(save_path):
        file = open(save_path, 'r')
        output = []
        num = 0
        for line in file:
            # keep only the serialized list between '[' and the end of line
            inputString = line[line.find("["):line.find("\n")]
            output.append(inputString)
            num += 1
        file.close()
        input_data = []
        output_data = []
        for i in range(num // 2):  # lines alternate: input, then output
            input_data.append(convertStringListToList(output[2 * i]))
            output_data.append(convertStringListToList(output[2 * i + 1]))
        input_num = len(input_data[0])
        output_num = len(output_data[0])
        data = DataSet(input_num, output_num)
        for i in range(len(input_data)):
            data.addItem(input_data[i], output_data[i])
        return data
    else:
        print(save_path + " does not exist")
Code Example #3
    def fit(self,
            text: Union[str, Iterable[str], Iterable[Data], pd.DataFrame],
            category: Union[str, Iterable[str]] = None) -> TextClassifier:
        '''learn probabilities for tokens extracted by the given text'''
        data = DataSet.FromAny(text, category)

        categories = []
        tokens = {}  # dict used as an insertion-ordered set of tokens
        values = []

        for d in data:
            categories.append((d.category, d.score))
            for token in d.tokens:
                tokens[token] = 1
            values.append((d.table, d.score))
            self.total_documents += 1

        tokens = list(tokens)
        self.__add_category(categories)
        self.__add_token(tokens)

        data_values = [[1 if t in v[0] else 0 for t in tokens] + [v[1]]
                       for v in values]

        tokens.append(Data.CATEGORY_NAME)

        data_values = pd.DataFrame(data_values, columns=tokens)

        self.model.fit(data_values, Data.CATEGORY_NAME)

        return self
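A hypothetical call to this fit method, assuming a TextClassifier class that exposes it and plain string inputs (the example texts and labels are illustrative only):

# Hypothetical usage; training texts and categories are assumptions.
clf = TextClassifier()
clf.fit(["great product, works as described", "arrived broken, very disappointed"],
        ["positive", "negative"])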
def load_unlabeled_corpus_111(train_dir):
    data_sets = DataSets()
    x, y, ids = load_pickled_corpus111_data(train_dir)
    try:
        # labels are already one-hot if they are 2-D
        np.shape(y)[1]
    except IndexError:
        y = dense_to_one_hot(y.astype(int))
    data_sets.unlabeled = DataSet(x, y, ids)
    return data_sets
Code Example #5
File: mnist.py  Project: nex3z/deep-learning-scratch
def read_data(base_dir,
              normalize=True,
              validation_size=5000,
              one_hot=False,
              flatten=True):
    check_data(base_dir)

    train_images = read_image(path.join(base_dir, TRAIN_IMAGES), flatten)
    test_images = read_image(path.join(base_dir, TEST_IMAGES), flatten)

    if normalize:
        train_images = train_images / 255.0
        test_images = test_images / 255.0

    train_labels = read_label(path.join(base_dir, TRAIN_LABELS), one_hot)
    validation = DataSet(images=train_images[:validation_size],
                         labels=train_labels[:validation_size])
    train = DataSet(images=train_images[validation_size:],
                    labels=train_labels[validation_size:])

    test_labels = read_label(path.join(base_dir, TEST_LABELS), one_hot)
    test = DataSet(images=test_images, labels=test_labels)

    return Datasets(train=train, validation=validation, test=test)
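A hypothetical invocation of read_data; the base directory is an assumption, and the shapes in the comments assume standard MNIST with the default 5000-image validation split and DataSet exposing its images array:

# Hypothetical usage of read_data above.
datasets = read_data("data/mnist", normalize=True, one_hot=True, flatten=True)
print(datasets.train.images.shape)       # e.g. (55000, 784)
print(datasets.validation.images.shape)  # e.g. (5000, 784)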
def load_unlabeled_bills(train_dir):
    data_sets = DataSets()
    train_x, train_y, train_ids, test_x, test_y, test_ids = load_pickled_data(
        train_dir)
    try:
        # labels are already one-hot if they are 2-D
        np.shape(train_y)[1]
    except IndexError:
        train_y = dense_to_one_hot(train_y.astype(int))
        test_y = dense_to_one_hot(test_y.astype(int))
    data_sets.unlabeled = DataSet(train_x, train_y, train_ids)
    #data_sets.unlabeled = DataSet(test_x, test_y, test_ids)
    """
  data_sets.train = DataSet(train_x, train_y, train_ids)
  data_sets.test  = DataSet(test_x,  test_y,  test_ids)
  """
    return data_sets
Code Example #7
    def fit(self,
            text: Union[str, Iterable[str], Iterable[Data], pd.DataFrame],
            category: Union[str, Iterable[str]] = None) -> NaiveBayes:
        '''learn probabilities for tokens extracted by the given text'''
        data = DataSet.FromAny(text, category)
        for d in data:
            # ensure we have defined the c category
            self.__add_category(d.category)
            # update our count of how many documents mapped to this category
            self.documents[d.category] += 1
            # update the total number of documents we have learned from
            self.total_documents += 1

            # Update our vocabulary and our word frequency count for this category
            for token, frequency in d.table.items():
                # add this word to our vocabulary if not already existing
                self.__add_token(token)

                # update the frequency information for this word in this category
                if token not in self.word_frequency[d.category]:
                    self.word_frequency[d.category][token] = frequency
                else:
                    self.word_frequency[d.category][token] += frequency

                # update the count of all words we have seen mapped to this category
                self.word_count[d.category] += frequency

        return self
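This fit only accumulates counts; classification is not shown in the extract. A rough sketch of how those counters could be turned into a per-category log score with Laplace smoothing; the method name and the vocabulary-size estimate are assumptions, not the project's API:

import math

def log_score(self, tokens, category, alpha=1.0):
    # Hypothetical scoring sketch built on the counters filled in by fit():
    # log P(category) + sum over tokens of log P(token | category).
    log_prob = math.log(self.documents[category] / self.total_documents)
    # crude stand-in for the true vocabulary size
    vocab_size = len(self.word_frequency[category])
    for token in tokens:
        freq = self.word_frequency[category].get(token, 0)
        log_prob += math.log((freq + alpha) /
                             (self.word_count[category] + alpha * vocab_size))
    return log_prob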
Code Example #8
File: save_load_tools.py  Project: shd101wyy/PyNeuron
def loadNeuronBuilder(self, save_path):
    '''
    Fields written by the matching save routine:
    input_layer, hidden_layers, output_layer, has_bias,
    layer_to_layer_weight, bias_to_layer (saved from bias_to_neuron),
    neuron_function, neuron_dict, name_dict, input_data, output_data
    '''
    import os
    if save_path.find('.') == -1:
        save_path += '.nnet'
    if os.path.exists(save_path):
        file = open(save_path, 'r')
        output = []
        for lines in file:
            output.append(lines)
        # parse each "key:value" line into a dictionary
        dic = {}
        for line in output:
            dic[line[0:line.find(':')]] = line[line.find(':') + 1:line.find('\\')]
        file.close()

        input_layer = convertStringListToList(dic['input_layer'])
        for i in range(len(input_layer)):
            input_layer[i] = int(input_layer[i])

        hidden_layers = convertStringListToList(dic['hidden_layers'])
        for i in range(len(hidden_layers)):
            hidden_layers[i] = int(hidden_layers[i])

        output_layer = convertStringListToList(dic['output_layer'])
        for i in range(len(output_layer)):
            output_layer[i] = int(output_layer[i])

        has_bias = dic['has_bias']
        if has_bias == 'True':
            has_bias = True
        elif has_bias == 'False':
            has_bias = False
        else:
            print("Mistake occurred when loading has_bias: " + has_bias)

        layer_to_layer_weight = convertStringDictToDict(dic['layer_to_layer_weight'])
        for key in layer_to_layer_weight.keys():
            layer_to_layer_weight[key] = convertStringListToList(layer_to_layer_weight[key])

        bias_to_layer = convertStringDictToDict(dic['bias_to_layer'])
        for key in bias_to_layer:
            bias_to_layer[key] = convertStringListToList(bias_to_layer[key])

        neuron_function = convertStringDictToDict(dic['neuron_function'])
        neuron_dict = convertStringDictToDict(dic['neuron_dict'])
        name_dict = convertStringDictToDict(dic['name_dict'])
        input_data = convertStringListToList(dic['input_data'])
        output_data = convertStringListToList(dic['output_data'])

        # rebuild the network, then restore weights, activation functions and data set
        net = NeuronBuilder(input_layer, hidden_layers, output_layer, has_bias)
        net.layer_to_layer_weight = layer_to_layer_weight
        net.updateNeuronToNeuronWeightByLayerToLayerWeight()
        net.bias_to_layer = bias_to_layer
        net.updateBiasToNeuronByBiasToLayer()
        net.neuron_function = neuron_function
        net.neuron_dict = neuron_dict
        net.name_dict = name_dict

        data = DataSet(len(input_data[0]), len(output_data[0]))
        for i in range(len(input_data)):
            data.addItem(input_data[i], output_data[i])
        net.setDataSet(data)

        return net
    else:
        print(save_path + " does not exist")
        return None
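Both PyNeuron loaders above rely on convertStringListToList (and a dict counterpart) defined elsewhere in the project. A rough stand-in for the list version, assuming values were serialized as a bracketed, comma-separated string:

# Hypothetical stand-in only; the real helper lives elsewhere in PyNeuron.
def convertStringListToList(s):
    s = s.strip()
    if s.startswith('[') and s.endswith(']'):
        s = s[1:-1]
    # return the raw items as strings; callers cast to int/float as needed
    return [item.strip() for item in s.split(',') if item.strip()]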
Code Example #9
# -*- coding: utf-8 -*-

# allow the notebook to access the parent directory so we can import the other modules
# https://stackoverflow.com/a/35273613
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

dataset_folder_path = os.path.join("files", "dataset")

#%%
from data.DataSet import DataSet
dataset = DataSet()
dataset.load(dataset_folder_path,
             test_set_percentage=0.2,
             validation_set_percentage=0.3333)

print("Training Data Len:", len(dataset.train_data))
print("Validation Data Len:", len(dataset.valid_data))
print("Test Data Len:", len(dataset.test_data))

#%% Load Model
from keras.models import load_model

TRAINED_MODEL = os.path.join("files", "checkpoints", "1525696834.4091375",
                             "regularized_3x512_gru-30-0.97.hdf5")
model = load_model(TRAINED_MODEL)

#%%
Code Example #10
File: Practice.py  Project: shd101wyy/PyNeuron
'''
net.setLayerFunction('out', 'purelin')
net.setNeuronFunction('hid0_2', 'hardlim')
print(net.neuron_function)
net.setInputToInputLayer([1,0.6])
print(net.getOutputFromLayer('hid1'))
print(net.getOutputFromLayer('out'))
print(net.getOutputFromOneNeuron('out0'))
net.showNetworkSimulation()
'''

from neuron_build.neuron_build import NeuronBuilder
net=NeuronBuilder([2],[3],[2],has_bias=True)
net.connectTwoLayers('in', 'hid0')
net.connectTwoLayers('hid0','out')
from data.DataSet import DataSet
data=DataSet(2,2)
data.addItem([1,1], [-1,-1])
data.addItem([1,-1],[-1,1])
data.addItem([-1,1],[1,-1])
data.addItem([-1,-1],[1,1])
net.setDataSet(data)
net.setLayerFunction('hid0', 'logsig')
net.setLayerFunction('out','purelin')
net.setInputToInputLayer([1,1])
print(net.getOutputFromLayer('out'))
print(net.neuron_function)
from trainer.DeltaTrainer import DeltaTrainer
trainer=DeltaTrainer(net)
trainer.setMaxError(0.001)
trainer.DeltaTrain()
#trainer.plotFigure()
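After DeltaTrain() converges, the network can be checked against the four training patterns. A hypothetical sanity check reusing only the calls shown above:

# Hypothetical post-training check, using the same API as above.
for inputs, targets in [([1, 1], [-1, -1]), ([1, -1], [-1, 1]),
                        ([-1, 1], [1, -1]), ([-1, -1], [1, 1])]:
    net.setInputToInputLayer(inputs)
    print(inputs, '->', net.getOutputFromLayer('out'), 'expected', targets)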
Code Example #11
# -*- coding: utf-8 -*-

# allow the notebook to access the parent directory so we can import the other modules
# https://stackoverflow.com/a/35273613
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

dataset_folder_path = os.path.join("files", "dataset")

#%%
from data.DataSet import DataSet
dataset = DataSet()
dataset.load(dataset_folder_path,
             test_set_percentage=0.2,
             validation_set_percentage=0.3333)

print("Training Data Len:", len(dataset.train_data))
print("Validation Data Len:", len(dataset.valid_data))
print("Test Data Len:", len(dataset.test_data))

#%%
import numpy as np
from utils.preprocessing import *
from functools import partial
from utils.research_preprocessing import add_occlusions
from models.regularized_3x512_gru import Regularized3x512GRU
import os.path
import pickle
Code Example #12
import os

# Constants
PARAM_NUM_EPOCHS = 40
PARAM_BATCH_SIZE = 300
NUM_SAMPLES = 50

# Paths
dataset_folder_path = os.path.join("files", "dataset")

#%% Prepare Data
# Imports
from utils.preprocessing import *
from data.DataSet import DataSet
from functools import partial

# Preprocessing
dataset = DataSet()
dataset.load(dataset_folder_path, test_set_percentage=0.1, validation_set_percentage=0.1)
dataset.apply(apply_first_frame_centering)
dataset.apply(apply_unit_distance_normalization)
dataset.apply(partial(spline_interpolate_and_resample, num_samples=NUM_SAMPLES))

#%%
# Create generative dataset
from data.DataSetManipulator import DataSetManipulator

manip = DataSetManipulator(dataset, sequence_length=NUM_SAMPLES)
X_train, Y_train, X_valid, Y_valid, X_test, Y_test = manip.create_dataset_for_generative_models()

#%% Use only one digit
import numpy as np
# train only 0s
Code Example #13
# https://stackoverflow.com/a/35273613
import os
import sys

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import os

dataset_folder_path = os.path.join("files", "dataset")

#%%
from data.DataSet import DataSet

dataset = DataSet()
dataset.load(dataset_folder_path,
             test_set_percentage=0.2,
             validation_set_percentage=0.3333)

print("Training Data Len:", len(dataset.train_data))
print("Validation Data Len:", len(dataset.valid_data))
print("Test Data Len:", len(dataset.test_data))

#%%
NUM_SAMPLES = 50
ANGLES_TO_ROTATE = [5, 10, 15, 45, -5, -10, -15, -45]

from utils.preprocessing import *
from functools import partial
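The constants and imports above line up with the augmentation pipeline shown in Code Example #17; a sketch of how this truncated snippet presumably continues (not part of the original extract):

# Presumed continuation, mirroring Code Example #17.
dataset.apply(partial(spline_interpolate_and_resample, num_samples=NUM_SAMPLES))
dataset.expand_many(partial(rotate_digit, degrees=ANGLES_TO_ROTATE))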
Code Example #14
File: main.py  Project: zuiwufenghua/PythonicML
#encoding=utf-8
import numpy as np
#import algo.knn.knn
from data.DataSet import DataSet
from data.DigitDataSet import DigitDataSet

dataset = DataSet(file_path="gaofeng_file")
dataset2 = DataSet(folder_path="gaofeng_folder")

digit_dataset = DigitDataSet(file_path="gaofeng_file")
digit_dataset2 = DigitDataSet(folder_path="gaofeng_folder")
Code Example #15
from tqdm import tqdm
from models.NaiveBayes import NaiveBayes
from models.PGMNaiveBayes import PGMNaiveBayes
from data.DataSet import DataSet
from functools import partial
from time import time

print('Collecting data (can take minutes)...')
with tqdm() as bar:

    def update_bar(data, index, total):
        if total is not None: bar.total = total
        bar.update()

    data = DataSet.FromJSON('./dataset/data.json',
                            './dataset/keywords.json',
                            on_generate=update_bar)

direct_data = data[lambda x: bool(x.score)]  # keep only entries that have a (truthy) score


def train(Model, name, path, ds):
    print('Training Bayes Network (can take several minutes): {} - {}'.format(
        Model.__name__, name))
    model = Model()
    start = time()
    model.fit(ds)
    stop = time()
    print('Trained in {:.3f}s'.format(stop - start))
    print('Saving Bayes Network: {} - {} @ {}'.format(Model.__name__, name,
                                                      path))
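The snippet is cut off here; hypothetical invocations of train, where the model classes come from the imports above and the output paths are assumptions:

# Hypothetical usage of train(); the output paths are illustrative only.
train(NaiveBayes, 'direct', './output/naive_bayes.model', direct_data)
train(PGMNaiveBayes, 'direct', './output/pgm_naive_bayes.model', direct_data)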
Code Example #16
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import utils.plot as plot
from utils.preprocessing import *
from data.DigitSet import DigitSet
from data.DataSet import DataSet
import os

#%%
folder = os.path.join("files", "dataset")
dataset = DataSet(folder)
dataset.apply(apply_mean_centering)
dataset.apply(apply_unit_distance_normalization)
dataset.apply(lambda x: normalize_pressure_value(x, 512))

#%%
filename = os.path.join(folder, "10.43_23.03.2018_digitset.json")
digitset = DigitSet(filename)
scaled = digitset.copy()
# Apply transformations
scaled.apply(apply_mean_centering)
scaled.apply(apply_unit_distance_normalization)
scaled.apply(lambda x: normalize_pressure_value(x, 512))
if scaled.time_is_dt():
    scaled.convert_dt_to_t()

#%%
digit, label = digitset[6]
plot.show_digit(digit, label=label, 
                show_lines=True, show_points=True, 
Code Example #17
#%%
import os

# Constants
PARAM_NUM_EPOCHS = 15
PARAM_BATCH_SIZE = 300
NUM_SAMPLES = 50

# Paths
dataset_folder_path = os.path.join("files", "dataset")
#%% Prepare Data
# Imports
from utils.preprocessing import *
from data.DataSet import DataSet
from functools import partial
import numpy as np

dataset = DataSet()
dataset.load(dataset_folder_path, test_set_percentage=0.333, validation_set_percentage=0)
dataset.apply(apply_mean_centering)
dataset.apply(apply_unit_distance_normalization)
#dataset.apply(partial(normalize_pressure_value, max_pressure_val=512))
dataset.apply(partial(spline_interpolate_and_resample, num_samples=NUM_SAMPLES))
dataset.expand_many(partial(rotate_digit, degrees=[5, 10, 15, 45, -5, -10, -15, -45]))
dataset.expand(reverse_digit_sequence)
# dataset.apply(lambda digit: convert_xy_to_derivative(digit, normalize=False))
#dataset.apply(partial(convert_xy_to_derivative, normalize=True))

#%% Split Train, Valid, Test
# Imports
import numpy as np
from sklearn.model_selection import train_test_split
Code Example #18
# -*- coding: utf-8 -*-

# allow the notebook to access the parent directory so we can import the other modules
# https://stackoverflow.com/a/35273613
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
import os
dataset_folder_path = os.path.join("files", "dataset")

#%%
from data.DataSet import DataSet
dataset = DataSet()
dataset.load(dataset_folder_path, test_set_percentage=0.2, validation_set_percentage=0.3333)

print("Training Data Len:", len(dataset.train_data))
print("Validation Data Len:", len(dataset.valid_data))
print("Test Data Len:", len(dataset.test_data))

#%%
import matplotlib.pyplot as plt
import numpy as np
DIGIT_IDX = 1627
#DIGIT_IDX = 326
digit = dataset.train_data[DIGIT_IDX]

#%% create original digit images
padding = 10