def load_data(dataset):
    etl = ETL(DATA_PATH, [128, 256, 512, 1024], sma_window=3, minimal_movement=0.75)
    etl.load(dataset)
    etl.preprocess_pooled()
    etl.generate_fourier_dataset(window_overlap=1)
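# A minimal usage sketch (not from the source): it assumes DATA_PATH is a
# module-level constant and that "CIMA" is the dataset name used by ETL.load
# elsewhere in these snippets.
if __name__ == '__main__':
    load_data("CIMA")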
def main():
    """
    Main function to run Logistic Regression/Adaline Regression
    """
    # Parse arguments
    arguments = args()

    # Set up kwargs for ETL
    kwargs = {
        'data_name': arguments.data_name,
        'random_state': arguments.random_state
    }
    etl = ETL(**kwargs)

    # Set up kwargs and create object
    kwargs = {'etl': etl, 'step_size': arguments.step_size}
    if arguments.adaline:
        model = AdalineRegressor(**kwargs)
    else:
        model = LogisticRegressor(**kwargs)

    # Tune
    if arguments.tune:
        model.tune()

    # Fit
    model.fit()

    # Predict
    model.predict()

    # Summarize
    model.summarize()
def generate_fourier(data_path, window_sizes, size, params):
    etl = ETL(
        data_path=data_path,
        window_sizes=window_sizes,
        sma_window=params["sma"],
        minimal_movement=params["minimal_movement"],
        size=size
    )
    etl.load("CIMA")
    print("\nPreprocessing data.")
    etl.preprocess_pooled()
    print("\nGenerating fourier data.")
    etl.generate_fourier_dataset(window_overlap=params["window_overlap"])
def cv(model_name):
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    angles = [
        "right_shoulder", "left_shoulder", "right_elbow", "left_elbow",
        "right_hip", "left_hip", "right_knee", "left_knee"
    ]
    window_sizes = [128, 256, 512, 1024]

    etl = ETL(DATA_PATH, window_sizes, sma_window=3, minimal_movement=0.75)
    etl.load("CIMA")
    infants = np.array(list(etl.cima.keys()))
    labels = np.array([etl.cima[infant]["label"] for infant in infants])
    etl.preprocess_pooled()
    etl.generate_fourier_dataset(window_overlap=1)

    for train_index, test_index in kf.split(infants, labels):
        ids = infants[train_index]
        id_hash = f"{model_name}_{sha1(ids).hexdigest()[:5]}"
        model_path = f"saved_models/{id_hash}.joblib"

        if os.path.exists(model_path):
            models = joblib.load(model_path)
        else:
            models = {}
            for window_size in window_sizes:
                # Collect the fourier data for every angle at this window size,
                # restricted to the training infants of the current fold.
                X = pd.DataFrame()
                for angle in angles:
                    fourier_path = os.path.join(DATA_PATH, str(window_size), angle + ".json")
                    df = pd.read_json(fourier_path)
                    X = pd.concat([X, df])
                X = X[X.id.isin(ids)]
                y = X["label"]
                X = pd.DataFrame(X.data.tolist())
                # model_name = f"{window_size}_{model_name}"
                models[window_size] = train_model(model_name, X, y, save=False)
            joblib.dump(models, model_path)

        x_test = infants[test_index]
        y_test = labels[test_index]
        score = evaluate_model(id_hash, models, x_test, y_test)
def predict(self, data_path, infant_id):
    if self.verbose:
        print(f"Predicting infant {infant_id} - {strftime('%H:%M:%S', gmtime())}")

    window_sizes = [128, 256, 512, 1024]
    etl = ETL(data_path, window_sizes, pooling="mean", sma_window=3,
              bandwidth=0, minimal_movement=0.75)
    etl.load_infant(infant_id)

    if self.verbose:
        print(f"Preprocessing the data - {strftime('%H:%M:%S', gmtime())}")
    etl.preprocess_pooled()

    angles = [
        "right_shoulder", "left_shoulder", "right_elbow", "left_elbow",
        "right_hip", "left_hip", "right_knee", "left_knee"
    ]
    predictions = {}
    video_length = len(etl.cima[infant_id]["data"])
    prediction = Prediction(video_length)
    for angle in angles:
        predictions[angle] = pd.Series([[] for _ in range(video_length)])

    if self.verbose:
        print(f"Generating fourier data - {strftime('%H:%M:%S', gmtime())}")
    for window_size in window_sizes:
        for angle in angles:
            dataframe = etl.generate_fourier_data(angle, window_size, window_size // 4)
            data_features = pd.DataFrame(dataframe.data.tolist())
            if not data_features.empty:
                data_transformed = self.model[window_size]["pls"].transform(data_features)
                dataframe["label"] = self.model[window_size]["model"].predict_proba(data_transformed)
            else:
                dataframe["label"] = pd.Series([])
            prediction.set_window_data(window_size, angle, dataframe)

    infant = etl.cima[infant_id]
    infant["predictions"] = prediction
    return infant, prediction
def main():
    """
    Main function to run Neural Network
    """
    # Parse arguments
    arguments = args()

    # Set up kwargs for ETL
    kwargs = {
        'data_name': arguments.data_name,
        'random_state': arguments.random_state
    }
    etl = ETL(**kwargs)

    # Set up kwargs and create object
    kwargs = {
        'etl': etl,
        'hidden_layers_count': arguments.hidden_layers_count,
        'step_size': arguments.step_size,
        'node_count': arguments.node_count,
        'convergence_threshold': arguments.convergence_threshold,
        'random_state': arguments.random_state
    }
    model = NeuralNetwork(**kwargs)

    # Tune
    if arguments.tune:
        if arguments.tune not in ('s', 'n', 'c'):
            raise ValueError('Please pass s, n, or c to tune the corresponding parameter')
        model.tune(arguments.tune)
    else:
        # Fit
        model.fit()

        # Predict
        model.predict()

        # Summarize
        model.summarize()
def main():
    """
    Main function to run Decision Tree Classifier/Regressor
    """
    # Parse arguments
    arguments = args()

    # Set up kwargs for ETL
    kwargs = {
        'data_name': arguments.data_name,
        'random_state': arguments.random_state
    }
    etl = ETL(**kwargs)

    # Decision Tree
    # Classification
    if arguments.data_name in ['breast-cancer', 'car', 'segmentation']:
        # Set up kwargs
        kwargs = {'etl': etl, 'prune': arguments.prune}
        dt_model = ID3Classifier(**kwargs)
    # Regression
    else:
        # Set up kwargs
        kwargs = {'etl': etl, 'percent_threshold': arguments.percent_threshold}
        dt_model = CARTRegressor(**kwargs)

    # Tune
    if arguments.tune:
        dt_model.tune()

    # Fit
    dt_model.fit()

    # Predict
    dt_model.predict()

    # Summarize
    dt_model.summarize()
class TestETL(unittest.TestCase):

    def setUp(self) -> None:
        self.processor = ETL()
        self.test_file_name = 'lake/2019-02-15.tsv'

    def test_get_list_of_files_from_s3(self):
        list_of_files = self.processor.get_list_of_files_from_s3()
        print(list_of_files)
        assert isinstance(list_of_files, list)
        assert len(list_of_files) > 0
        assert len([i for i in list_of_files if not str(i).endswith('tsv')]) == 0

    def test_extract(self):
        local_path = self.processor.extract('lake/2019-02-15.tsv')
        assert local_path == self.test_file_name.split('/')[1]

    def test_transform(self):
        if len(self.processor.df) == 0:
            self.processor.extract(self.test_file_name)
        self.processor.transform()
        assert hasattr(self.processor, 'article_performance_df')
        assert hasattr(self.processor, 'user_performance_df')
        assert len(self.processor.article_performance_df) > 0
        assert len(self.processor.user_performance_df) > 0

    def test_download_file(self):
        local_path = self.processor.download_file(self.test_file_name)
        assert local_path in os.listdir()

    def tearDown(self) -> None:
        for file in [i for i in os.listdir() if i.endswith('.tsv')]:
            os.remove(file)
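# A minimal sketch for running this test case directly; it assumes the test
# module imports unittest, os, and ETL at the top (those imports are not shown
# in the snippet above).
if __name__ == '__main__':
    unittest.main()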
from etl.etl import ETL
from matplotlib import pyplot as plt

etl = ETL("/home/erlend/datasets", [128, 256, 512, 1024], size=16, random_seed=42)
etl.cache = False
etl.load("CIMA")

infant = etl.cima["077"]
infant = etl.resample(infant)
before_sma = infant["data"]["right_wrist_x"][:250]

etl.preprocess_pooled()
after_sma = etl.cima["077"]["data"]["right_wrist_x"][:250]

fig = plt.figure()
plt.plot(before_sma, color="red", alpha=0.5)
plt.plot(after_sma, color="green", alpha=0.5)
plt.xlabel("Frame")
plt.ylabel("right_wrist_x")
plt.legend(["Raw data", "SMA=3"])
plt.savefig("sma.png")
from etl.etl import ETL
import sys
import logging
import yaml
from logging.config import dictConfig

with open('etl/logging.yaml') as config_file:
    dictConfig(yaml.safe_load(config_file))
log = logging.getLogger()

if __name__ == '__main__':
    config_path = sys.argv[1]
    log.info("The config path is %s" % config_path)

    etl_process = ETL()
    etl_process.initialize(config_path)
    etl_process.run()
from time import sleep

from etl.etl import ETL

if __name__ == '__main__':
    sleep(10)
    processor = ETL()
    processor.run()
from etl.etl import ETL
from tqdm import tqdm
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

window_sizes = [128, 256, 512, 1024]
etl = ETL("/home/erlend/datasets", window_sizes)
etl.load("CIMA")
etl.preprocess_pooled()
angles = etl.angles.keys()

differences = {}
for window_size in tqdm(window_sizes):
    etl.differences = []
    for angle in angles:
        etl.generate_fourier_data(angle, window_size, window_size)
    differences[window_size] = etl.differences

bins = [0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5]
for window_size in window_sizes:
    fig = plt.figure()
    plt.hist(differences[window_size],
             color="mediumslateblue",
             bins=bins,
             weights=np.ones(len(differences[window_size])) / len(differences[window_size]),
             edgecolor="black",
             linewidth=0.5)
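    # A hedged completion sketch: the snippet above ends without labelling or
    # persisting the histograms. The axis label and output filename below are
    # assumptions, not taken from the source; PercentFormatter(1) matches the
    # weights above, which normalise counts to fractions.
    plt.xlabel("Difference")
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
    plt.savefig(f"differences_{window_size}.png")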
from etl.etl import ETL
import sys
import logging
import yaml
from logging.config import dictConfig

with open('etl/logging.yaml') as config_file:
    dictConfig(yaml.safe_load(config_file))
log = logging.getLogger()

if __name__ == '__main__':
    config_path = sys.argv[1]
    log.info("The config path is %s" % config_path)

    etl_process = ETL(simulate="simulate" in sys.argv)
    etl_process.initialize(config_path)
    etl_process.run()
def load_validation_set(data_path):
    etl = ETL(data_path, [128, 256, 512, 1024])
    etl.load("CIMA", validation=True)
    etl.preprocess_pooled()
    return etl.cima
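# A small usage sketch, assuming the same layout used elsewhere in these
# snippets: etl.cima maps infant ids to dicts carrying a "label" and the pose
# "data". The dataset path is borrowed from the plotting scripts above.
validation_set = load_validation_set("/home/erlend/datasets")
for infant_id, infant in validation_set.items():
    print(infant_id, infant["label"], len(infant["data"]))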
from etl.etl import ETL
from etl.mongo import MongoETL
from etl.neo import NeoETL

if __name__ == '__main__':
    ETL.extract()
    for task in [MongoETL(), NeoETL()]:
        task.run()
def modeling():
    options = [{'title': 'iris', 'id': 1}]
    print(options)
    elements = [
        {'title': 'Network layers number', 'id': 'layers_n', 'type': '', 'default': 1},
        {'title': 'Neuron number', 'id': 'nn', 'type': '', 'default': 10},
        {'title': 'Activation functions list', 'id': 'func', 'type': '', 'default': 'sigmoid'},
        {'title': 'Metrics', 'id': 'metrics', 'type': '', 'default': 'accuracy'},
        {'title': 'Loss', 'id': 'loss', 'type': '', 'default': 'categorical_crossentropy'},
        {'title': 'Epoch number', 'id': 'ep', 'type': '', 'default': '100'},
        {'title': 'Datasets', 'id': 'dataset', 'type': '', 'options': options, 'default': options[0]['id']},
    ]
    return_url = "/"

    if request.args.get('result'):
        dataset_to_comps = [
            'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
        ]  # more tables???
        model_info, datasets = get_model_info(base, request.args.get('models'))
        neurons = request.args.get('nn').split(',')
        input_dim = [len(dataset_to_comps)] + [0] * (len(neurons) - 1)
        activation = request.args.get('func').split(',')

        etl = ETL(manager=base)
        load_data_instr = {"category_name": 'Iris Fisher'}
        path = 'local_files/iris.csv'
        etl.load_supervised_data(path=path, ctg_name=load_data_instr["category_name"])

        # x1 = base.get_raw_data(RateName=dataset_to_comps[0])
        # x1 = pd.DataFrame(x1[2].float_value)
        # x2 = base.get_raw_data(RateName=dataset_to_comps[1])
        # x2 = pd.DataFrame(x2[2].float_value)
        # x3 = base.get_raw_data(RateName=dataset_to_comps[2])
        # x3 = pd.DataFrame(x3[2].float_value)
        # x4 = base.get_raw_data(RateName=dataset_to_comps[3])
        # x4 = pd.DataFrame(x4[2].float_value)

        X = pd.read_csv(path)
        y = X['species']
        X = X.drop('species', axis=1)
        X = X.to_numpy()
        train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.7, random_state=42)
        train_y_ohe = np.array(get_dummies(train_y), dtype=np.float64)
        test_y_ohe = np.array(get_dummies(test_y), dtype=np.float64)

        # Parametrized version of the arguments below, kept for reference:
        # build_args = {
        #     'build_args': [
        #         {'neurons': neurons[i], 'input_dim': input_dim[i],
        #          'activation': activation[i], 'init': 'normal'}
        #         for i in range(len(neurons))
        #     ],
        #     'compile_args': {
        #         'loss': request.args.get('loss'),
        #         'optimizer': 'adam',
        #         'metrics': request.args.get('metrics')
        #     }
        # }
        # compile_args = {
        #     'loss': request.args.get('loss'),
        #     'optimizer': 'adam',
        #     'metrics': request.args.get('metrics')
        # }
        # fit_args = {'nb_epoch': request.args.get('ep'), 'batch_size': 1, 'verbose': 0}
        # evaluate_args = {'verbose': 0}
        # predict_args = {}

        build_args = {
            'build_args': [
                {'neurons': 16, 'input_dim': 4, 'init': 'normal', 'activation': 'relu'},
                {'neurons': 3, 'input_dim': 0, 'init': 'normal', 'activation': 'sigmoid'}
            ],
            'compile_args': {
                'loss': 'categorical_crossentropy',
                'optimizer': 'adam',
                'metrics': 'accuracy'
            }
        }
        compile_args = {
            'loss': 'categorical_crossentropy',
            'optimizer': 'adam',
            'metrics': 'accuracy'
        }
        fit_args = {'epochs': 100, 'batch_size': 1, 'verbose': 1}
        evaluate_args = {'verbose': 0}
        predict_args = {}
        print(build_args)

        m = KerasClassifier(name='iris', args=build_args)
        history = m.fit(train_X, train_y_ohe, fit_args=fit_args)
        loss, accuracy = m.evaluate(test_X, test_y_ohe, evaluate_args)
        prediction = m.predict(train_X)

        loss_data = history.history['loss'][1:]
        return render_template("modeling.html",
                               elements=elements,
                               return_url=return_url,
                               loss=request.args.get('loss'),
                               loss_data=list(zip(range(len(loss_data)), loss_data)))
    else:
        return render_template(
            "input.html",
            elements=elements,
            return_url=return_url,
        )
from etl.etl import ETL
from etl.quandl import Quandl as q
from sklearn.model_selection import train_test_split
from pandas import get_dummies
import numpy as np
import pandas as pd
import datetime

from models.models import *

if __name__ == '__main__':
    # Connect to the database and assemble the schema
    DB = DBManager()
    etl = ETL(manager=DB)

    # #### LOADING DATA FROM VARIOUS SOURCES ####

    # Load local files for supervised learning
    load_data_instr = {"category_name": 'Iris Fisher'}
    etl.load_supervised_data(path='local_files/iris.csv',
                             ctg_name=load_data_instr["category_name"])

    # Define categories for JapanExchange_Derivatives_ex2
    cats = [
        Category(name='futures', description='azaza'),
        Category(name='call', description='azaza'),
        Category(name='put', description='azaza'),
        Category(name='cbr', description='azaza')
    ]
    DB.session.add_all(cats)

    # Import futures data
def setUp(self) -> None:
    self.processor = ETL()
    self.test_file_name = 'lake/2019-02-15.tsv'
from manager.dbmanager import DBManager
from etl.etl import ETL

manager = DBManager()
etl = ETL(manager=manager)

etl.get_Kospi_data_ex1("local_files/kospi.xlsx")
def load_infant(data_path):
    window_sizes = [128, 256, 512, 1024]
    etl = ETL(data_path, window_sizes)
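    # A hedged completion sketch: the snippet above stops after constructing the
    # ETL object. Based on the neighbouring helpers (load_validation_set and the
    # preprocess_pooled calls elsewhere), a plausible continuation is assumed
    # here; the original body may differ.
    etl.load("CIMA")
    etl.preprocess_pooled()
    return etl.cima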