def load_data(dataset):
    etl = ETL(DATA_PATH, [128, 256, 512, 1024], sma_window=3, minimal_movement=0.75)
    etl.load(dataset)
    etl.preprocess_pooled()
    etl.generate_fourier_dataset(window_overlap=1)
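# A minimal usage sketch (not from the source): it assumes DATA_PATH is a
# module-level constant and that "CIMA" is the dataset name used by ETL.load
# elsewhere in these snippets.
if __name__ == '__main__':
    load_data("CIMA")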
def main():
    """
    Main function to run Logistic Regression/Adaline Regression
    """
    # Parse arguments
    arguments = args()

    # Set up kwargs for ETL
    kwargs = {
        'data_name': arguments.data_name,
        'random_state': arguments.random_state
    }
    etl = ETL(**kwargs)

    # Set up kwargs and create object
    kwargs = {'etl': etl, 'step_size': arguments.step_size}
    if arguments.adaline:
        model = AdalineRegressor(**kwargs)
    else:
        model = LogisticRegressor(**kwargs)

    # Tune
    if arguments.tune:
        model.tune()

    # Fit
    model.fit()

    # Predict
    model.predict()

    # Summarize
    model.summarize()
def generate_fourier(data_path, window_sizes, size, params):
    etl = ETL(
        data_path=data_path,
        window_sizes=window_sizes,
        sma_window=params["sma"],
        minimal_movement=params["minimal_movement"],
        size=size
    )
    etl.load("CIMA")
    print("\nPreprocessing data.")
    etl.preprocess_pooled()
    print("\nGenerating fourier data.")
    etl.generate_fourier_dataset(window_overlap=params["window_overlap"])
def cv(model_name):
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    angles = [
        "right_shoulder", "left_shoulder", "right_elbow", "left_elbow",
        "right_hip", "left_hip", "right_knee", "left_knee"
    ]
    window_sizes = [128, 256, 512, 1024]

    etl = ETL(DATA_PATH, window_sizes, sma_window=3, minimal_movement=0.75)
    etl.load("CIMA")
    infants = np.array(list(etl.cima.keys()))
    labels = np.array([etl.cima[infant]["label"] for infant in infants])
    etl.preprocess_pooled()
    etl.generate_fourier_dataset(window_overlap=1)

    for train_index, test_index in kf.split(infants, labels):
        ids = infants[train_index]
        id_hash = f"{model_name}_{sha1(ids).hexdigest()[:5]}"
        model_path = f"saved_models/{id_hash}.joblib"

        if os.path.exists(model_path):
            models = joblib.load(model_path)
        else:
            models = {}
            for window_size in window_sizes:
                # Collect the fourier data for every angle at this window size,
                # restricted to the training infants of the current fold.
                X = pd.DataFrame()
                for angle in angles:
                    fourier_path = os.path.join(DATA_PATH, str(window_size), angle + ".json")
                    df = pd.read_json(fourier_path)
                    X = pd.concat([X, df])
                X = X[X.id.isin(ids)]
                y = X["label"]
                X = pd.DataFrame(X.data.tolist())
                # model_name = f"{window_size}_{model_name}"
                models[window_size] = train_model(model_name, X, y, save=False)
            joblib.dump(models, model_path)

        x_test = infants[test_index]
        y_test = labels[test_index]
        score = evaluate_model(id_hash, models, x_test, y_test)
def predict(self, data_path, infant_id):
    if self.verbose:
        print(f"Predicting infant {infant_id} - {strftime('%H:%M:%S', gmtime())}")

    window_sizes = [128, 256, 512, 1024]
    etl = ETL(data_path, window_sizes, pooling="mean", sma_window=3,
              bandwidth=0, minimal_movement=0.75)
    etl.load_infant(infant_id)

    if self.verbose:
        print(f"Preprocessing the data - {strftime('%H:%M:%S', gmtime())}")
    etl.preprocess_pooled()

    angles = [
        "right_shoulder", "left_shoulder", "right_elbow", "left_elbow",
        "right_hip", "left_hip", "right_knee", "left_knee"
    ]
    predictions = {}
    video_length = len(etl.cima[infant_id]["data"])
    prediction = Prediction(video_length)
    for angle in angles:
        predictions[angle] = pd.Series([[] for _ in range(video_length)])

    if self.verbose:
        print(f"Generating fourier data - {strftime('%H:%M:%S', gmtime())}")
    for window_size in window_sizes:
        for angle in angles:
            dataframe = etl.generate_fourier_data(angle, window_size, window_size // 4)
            data_features = pd.DataFrame(dataframe.data.tolist())
            if not data_features.empty:
                data_transformed = self.model[window_size]["pls"].transform(data_features)
                dataframe["label"] = self.model[window_size]["model"].predict_proba(data_transformed)
            else:
                dataframe["label"] = pd.Series([])
            prediction.set_window_data(window_size, angle, dataframe)

    infant = etl.cima[infant_id]
    infant["predictions"] = prediction
    return infant, prediction
def main():
    """
    Main function to run Neural Network
    """
    # Parse arguments
    arguments = args()

    # Set up kwargs for ETL
    kwargs = {
        'data_name': arguments.data_name,
        'random_state': arguments.random_state
    }
    etl = ETL(**kwargs)

    # Set up kwargs and create object
    kwargs = {
        'etl': etl,
        'hidden_layers_count': arguments.hidden_layers_count,
        'step_size': arguments.step_size,
        'node_count': arguments.node_count,
        'convergence_threshold': arguments.convergence_threshold,
        'random_state': arguments.random_state
    }
    model = NeuralNetwork(**kwargs)

    # Tune
    if arguments.tune:
        if arguments.tune not in ('s', 'n', 'c'):
            raise ValueError('Please pass s, n, or c to tune the corresponding parameter')
        model.tune(arguments.tune)
    else:
        # Fit
        model.fit()

        # Predict
        model.predict()

        # Summarize
        model.summarize()
def main():
    """
    Main function to run Decision Tree Classifier/Regressor
    """
    # Parse arguments
    arguments = args()

    # Set up kwargs for ETL
    kwargs = {
        'data_name': arguments.data_name,
        'random_state': arguments.random_state
    }
    etl = ETL(**kwargs)

    # Decision Tree
    # Classification
    if arguments.data_name in ['breast-cancer', 'car', 'segmentation']:
        # Set up kwargs
        kwargs = {'etl': etl, 'prune': arguments.prune}
        dt_model = ID3Classifier(**kwargs)
    # Regression
    else:
        # Set up kwargs
        kwargs = {'etl': etl, 'percent_threshold': arguments.percent_threshold}
        dt_model = CARTRegressor(**kwargs)

    # Tune
    if arguments.tune:
        dt_model.tune()

    # Fit
    dt_model.fit()

    # Predict
    dt_model.predict()

    # Summarize
    dt_model.summarize()
class TestETL(unittest.TestCase):

    def setUp(self) -> None:
        self.processor = ETL()
        self.test_file_name = 'lake/2019-02-15.tsv'

    def test_get_list_of_files_from_s3(self):
        list_of_files = self.processor.get_list_of_files_from_s3()
        print(list_of_files)
        assert isinstance(list_of_files, list)
        assert len(list_of_files) > 0
        assert len([i for i in list_of_files if not str(i).endswith('tsv')]) == 0

    def test_extract(self):
        local_path = self.processor.extract('lake/2019-02-15.tsv')
        assert local_path == self.test_file_name.split('/')[1]

    def test_transform(self):
        if len(self.processor.df) == 0:
            self.processor.extract(self.test_file_name)
        self.processor.transform()
        assert hasattr(self.processor, 'article_performance_df')
        assert hasattr(self.processor, 'user_performance_df')
        assert len(self.processor.article_performance_df) > 0
        assert len(self.processor.user_performance_df) > 0

    def test_download_file(self):
        local_path = self.processor.download_file(self.test_file_name)
        assert local_path in os.listdir()

    def tearDown(self) -> None:
        for file in [i for i in os.listdir() if i.endswith('.tsv')]:
            os.remove(file)
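# A minimal sketch for running this test case directly; it assumes the test
# module imports unittest, os, and ETL at the top (those imports are not shown
# in the snippet above).
if __name__ == '__main__':
    unittest.main()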
from etl.etl import ETL
from matplotlib import pyplot as plt

etl = ETL("/home/erlend/datasets", [128, 256, 512, 1024], size=16, random_seed=42)
etl.cache = False
etl.load("CIMA")

infant = etl.cima["077"]
infant = etl.resample(infant)
before_sma = infant["data"]["right_wrist_x"][:250]

etl.preprocess_pooled()
after_sma = etl.cima["077"]["data"]["right_wrist_x"][:250]

fig = plt.figure()
plt.plot(before_sma, color="red", alpha=0.5)
plt.plot(after_sma, color="green", alpha=0.5)
plt.xlabel("Frame")
plt.ylabel("right_wrist_x")
plt.legend(["Raw data", "SMA=3"])
plt.savefig("sma.png")
from etl.etl import ETL
import sys
import logging
import yaml
from logging.config import dictConfig

with open('etl/logging.yaml') as config_file:
    dictConfig(yaml.safe_load(config_file))
log = logging.getLogger()

if __name__ == '__main__':
    config_path = sys.argv[1]
    log.info("The config path is %s" % config_path)

    etl_process = ETL()
    etl_process.initialize(config_path)
    etl_process.run()
from time import sleep

from etl.etl import ETL

if __name__ == '__main__':
    sleep(10)
    processor = ETL()
    processor.run()
from etl.etl import ETL
from tqdm import tqdm
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

window_sizes = [128, 256, 512, 1024]
etl = ETL("/home/erlend/datasets", window_sizes)
etl.load("CIMA")
etl.preprocess_pooled()
angles = etl.angles.keys()

differences = {}
for window_size in tqdm(window_sizes):
    etl.differences = []
    for angle in angles:
        etl.generate_fourier_data(angle, window_size, window_size)
    differences[window_size] = etl.differences

bins = [0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5]
for window_size in window_sizes:
    fig = plt.figure()
    plt.hist(differences[window_size],
             color="mediumslateblue",
             bins=bins,
             weights=np.ones(len(differences[window_size])) / len(differences[window_size]),
             edgecolor="black",
             linewidth=0.5)
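    # A hedged completion sketch: the snippet above ends without labelling or
    # persisting the histograms. The axis label and output filename below are
    # assumptions, not taken from the source; PercentFormatter(1) matches the
    # weights above, which normalise counts to fractions.
    plt.xlabel("Difference")
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
    plt.savefig(f"differences_{window_size}.png")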
from etl.etl import ETL
import sys
import logging
import yaml
from logging.config import dictConfig

with open('etl/logging.yaml') as config_file:
    dictConfig(yaml.safe_load(config_file))
log = logging.getLogger()

if __name__ == '__main__':
    config_path = sys.argv[1]
    log.info("The config path is %s" % config_path)

    etl_process = ETL(simulate="simulate" in sys.argv)
    etl_process.initialize(config_path)
    etl_process.run()
def load_validation_set(data_path):
    etl = ETL(data_path, [128, 256, 512, 1024])
    etl.load("CIMA", validation=True)
    etl.preprocess_pooled()
    return etl.cima
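# A small usage sketch, assuming the same layout used elsewhere in these
# snippets: etl.cima maps infant ids to dicts carrying a "label" and the pose
# "data". The dataset path is borrowed from the plotting scripts above.
validation_set = load_validation_set("/home/erlend/datasets")
for infant_id, infant in validation_set.items():
    print(infant_id, infant["label"], len(infant["data"]))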
from etl.etl import ETL
from etl.mongo import MongoETL
from etl.neo import NeoETL

if __name__ == '__main__':
    ETL.extract()
    for task in [MongoETL(), NeoETL()]:
        task.run()
def modeling():
    options = [{'title': 'iris', 'id': 1}]
    print(options)
    elements = [
        {'title': 'Network layers number', 'id': 'layers_n', 'type': '', 'default': 1},
        {'title': 'Neuron number', 'id': 'nn', 'type': '', 'default': 10},
        {'title': 'Activation functions list', 'id': 'func', 'type': '', 'default': 'sigmoid'},
        {'title': 'Metrics', 'id': 'metrics', 'type': '', 'default': 'accuracy'},
        {'title': 'Loss', 'id': 'loss', 'type': '', 'default': 'categorical_crossentropy'},
        {'title': 'Epoch number', 'id': 'ep', 'type': '', 'default': '100'},
        {'title': 'Datasets', 'id': 'dataset', 'type': '', 'options': options, 'default': options[0]['id']},
    ]
    return_url = "/"

    if request.args.get('result'):
        dataset_to_comps = [
            'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
        ]  # more tables???
        model_info, datasets = get_model_info(base, request.args.get('models'))
        neurons = request.args.get('nn').split(',')
        input_dim = [len(dataset_to_comps)] + [0] * (len(neurons) - 1)
        activation = request.args.get('func').split(',')

        etl = ETL(manager=base)
        load_data_instr = {"category_name": 'Iris Fisher'}
        path = 'local_files/iris.csv'
        etl.load_supervised_data(path=path, ctg_name=load_data_instr["category_name"])

        # x1 = base.get_raw_data(RateName=dataset_to_comps[0])
        # x1 = pd.DataFrame(x1[2].float_value)
        # x2 = base.get_raw_data(RateName=dataset_to_comps[1])
        # x2 = pd.DataFrame(x2[2].float_value)
        # x3 = base.get_raw_data(RateName=dataset_to_comps[2])
        # x3 = pd.DataFrame(x3[2].float_value)
        # x4 = base.get_raw_data(RateName=dataset_to_comps[3])
        # x4 = pd.DataFrame(x4[2].float_value)

        X = pd.read_csv(path)
        y = X['species']
        X = X.drop('species', axis=1)
        X = X.to_numpy()
        train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.7, random_state=42)
        train_y_ohe = np.array(get_dummies(train_y), dtype=np.float64)
        test_y_ohe = np.array(get_dummies(test_y), dtype=np.float64)

        # Parametrized version of the arguments below, kept for reference:
        # build_args = {
        #     'build_args': [
        #         {'neurons': neurons[i], 'input_dim': input_dim[i],
        #          'activation': activation[i], 'init': 'normal'}
        #         for i in range(len(neurons))
        #     ],
        #     'compile_args': {
        #         'loss': request.args.get('loss'),
        #         'optimizer': 'adam',
        #         'metrics': request.args.get('metrics')
        #     }
        # }
        # compile_args = {
        #     'loss': request.args.get('loss'),
        #     'optimizer': 'adam',
        #     'metrics': request.args.get('metrics')
        # }
        # fit_args = {'nb_epoch': request.args.get('ep'), 'batch_size': 1, 'verbose': 0}
        # evaluate_args = {'verbose': 0}
        # predict_args = {}

        build_args = {
            'build_args': [
                {'neurons': 16, 'input_dim': 4, 'init': 'normal', 'activation': 'relu'},
                {'neurons': 3, 'input_dim': 0, 'init': 'normal', 'activation': 'sigmoid'}
            ],
            'compile_args': {
                'loss': 'categorical_crossentropy',
                'optimizer': 'adam',
                'metrics': 'accuracy'
            }
        }
        compile_args = {
            'loss': 'categorical_crossentropy',
            'optimizer': 'adam',
            'metrics': 'accuracy'
        }
        fit_args = {'epochs': 100, 'batch_size': 1, 'verbose': 1}
        evaluate_args = {'verbose': 0}
        predict_args = {}
        print(build_args)

        m = KerasClassifier(name='iris', args=build_args)
        history = m.fit(train_X, train_y_ohe, fit_args=fit_args)
        loss, accuracy = m.evaluate(test_X, test_y_ohe, evaluate_args)
        prediction = m.predict(train_X)

        loss_data = history.history['loss'][1:]
        return render_template("modeling.html",
                               elements=elements,
                               return_url=return_url,
                               loss=request.args.get('loss'),
                               loss_data=list(zip(range(len(loss_data)), loss_data)))
    else:
        return render_template(
            "input.html",
            elements=elements,
            return_url=return_url,
        )
from etl.etl import ETL
from etl.quandl import Quandl as q
from sklearn.model_selection import train_test_split
from pandas import get_dummies
import numpy as np
import pandas as pd
import datetime

from models.models import *

if __name__ == '__main__':
    # Connect to the database and assemble the schema
    DB = DBManager()
    etl = ETL(manager=DB)

    # #### LOADING DATA FROM VARIOUS SOURCES ####

    # Load local files for supervised learning
    load_data_instr = {"category_name": 'Iris Fisher'}
    etl.load_supervised_data(path='local_files/iris.csv',
                             ctg_name=load_data_instr["category_name"])

    # Define categories for JapanExchange_Derivatives_ex2
    cats = [
        Category(name='futures', description='azaza'),
        Category(name='call', description='azaza'),
        Category(name='put', description='azaza'),
        Category(name='cbr', description='azaza')
    ]
    DB.session.add_all(cats)

    # Import futures data
def setUp(self) -> None:
    self.processor = ETL()
    self.test_file_name = 'lake/2019-02-15.tsv'
from manager.dbmanager import DBManager
from etl.etl import ETL

manager = DBManager()
etl = ETL(manager=manager)

etl.get_Kospi_data_ex1("local_files/kospi.xlsx")
def load_infant(data_path):
    window_sizes = [128, 256, 512, 1024]
    etl = ETL(data_path, window_sizes)
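    # A hedged completion sketch: the snippet above stops after constructing the
    # ETL object. Based on the neighbouring helpers (load_validation_set and the
    # preprocess_pooled calls elsewhere), a plausible continuation is assumed
    # here; the original body may differ.
    etl.load("CIMA")
    etl.preprocess_pooled()
    return etl.cima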