Example #1
    def __init__(self, exchange, pair, interval, past_history, model_file):
        self.exchange = exchange
        self.pair = pair
        self.interval = interval
        self.past_history = past_history
        self.dataset = Dataset()
        self.model_file = model_file
Example #2
    def prepare_data_for_training(data_input_folder, validation_split, augment_training_data):

        dataset = Dataset(data_input_folder)
        training_path, validation_path = dataset.split_datasets(validation_split)
        dataset.preprocess_data(training_path, validation_path, augment_training_data)

        return training_path, validation_path
Example #3
    def to_dateframe(data, timestamp=None):
        columns = [
            'datetime', 'open', 'high', 'low', 'close', 'vwap', 'volume',
            'count'
        ]
        df = pd.DataFrame(data, columns=columns)
        df = df.astype(float)

        dataset = Dataset()
        df = dataset.add_indicators(df)

        df.dropna(inplace=True)
        # Keep only the datetime, close price and indicator columns
        df = df.drop(columns=['vwap', 'count', 'open', 'high', 'low', 'volume'])

        df['datetime'] = df.datetime.values.astype(np.int64) // 10**9

        if timestamp:
            df = df[df['datetime'] < timestamp]

        return df
Example #4
def init_dataset(database_name):
    """
    PURPOSE
    Read in the dataset and set up the test and learning sets.

    INPUT
    database_name: name (and path) of database

    OUTPUT
    newdata: dataset object
    """
    newdata = Dataset()
    newdata.get_data(database_name)

    return newdata
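A minimal usage sketch for init_dataset above (the database filename is hypothetical, and it assumes Dataset.get_data loads the file as the docstring describes):

# Hypothetical usage; "my_records.db" is an illustrative path only.
data = init_dataset("my_records.db")
print(type(data))  # expected: the Dataset object described above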
Example #5
class TradingBot:
    def __init__(self, exchange, pair, interval, past_history, model_file):
        self.exchange = exchange
        self.pair = pair
        self.interval = interval
        self.past_history = past_history
        self.dataset = Dataset()
        self.model_file = model_file

    def predict_price(self, timestamp):
        # Get the timestamp of the last known point, 24h before the given one
        last_timestamp = int(timestamp - (60 * 60 * 24))

        df = self.exchange.fetch(timestamp=last_timestamp,
                                 pair=self.pair,
                                 hours_back=self.past_history * 2,
                                 interval=self.interval)
        # Only keep the last entries needed
        df = df.tail(self.past_history)

        predictions, y = self.dataset.predict(df,
                                              self.model_file,
                                              with_y=False)
        print(predictions)
        print(datetime.datetime.fromtimestamp(timestamp))
        exit()

    def tick(self):
        return
Example #6
def getPrices(input, output):
    prices15 = []
    i = 0
    short_ns = "eg"

    ds_name = "%s:dataset-%s " % (short_ns, "prices")
    ds_prices = Dataset(ds_name)

    with open(input, newline='') as pricesCSV:
        reader = csv.DictReader(pricesCSV)
        for row in reader:
            price = row['Price']
            month = row['Month']
            year = row['Year']
            short_ns = "eg"

            i += 1
            subject = "%s:obs%d " % (short_ns, i)

            if year == "2015" or year == "2016" or year == "2017":
                obs = Observation(subject=subject)
                prices15.append((price, month))
                obs.addDimension(p="dbpedia:month", o=month)
                obs.addDimension(p="dbpedia:year", o=year)
                obs.addMeasure(p="cbo:price", o=price)

                ds_prices.addObservation(obs)
        ds_prices.saveToDisk(output)
Example #7
def preprocess_dataset(dataset: Dataset):
    dataset.X_train = dataset.X_train.reshape(dataset.X_train.shape[0], 28, 28,
                                              1)
    dataset.X_test = dataset.X_test.reshape(dataset.X_test.shape[0], 28, 28, 1)
    input_shape = (28, 28, 1)

    # Convert the 0-9 digit labels to one-hot class vectors
    num_classes = 10
    dataset.y_train = tensorflow.keras.utils.to_categorical(
        dataset.y_train, num_classes=num_classes)
    dataset.y_test = tensorflow.keras.utils.to_categorical(
        dataset.y_test, num_classes=num_classes)

    dataset.X_train = dataset.X_train.astype("float32")
    dataset.X_test = dataset.X_test.astype("float32")

    dataset.X_train /= 255
    dataset.X_test /= 255
Example #8
def getRainfall(path, outputPath):
    short_ns = "eg"

    ds_name = "%s:dataset-%s " % (short_ns, "prices")
    ds_rainfall = Dataset(ds_name)

    # Obtain rainfall aggregate values from CHIRPS
    ds, img_array, zim, gdf = readImage(path)
    total = getSum(gdf, zim, img_array, ds)

    subject1 = "%s:obs1" % short_ns
    obs1 = Observation(subject=subject1)
    obs1.addDimension(p="dbpedia:month", o="1")
    obs1.addDimension(p="dbpedia:year", o="2015")
    obs1.addMeasure(p="cf-feature:rainfall_amount", o=total)

    print("Total rainfall 2015: %s" % total)

    ds_rainfall.addObservation(obs1)

    ds_rainfall.saveToDisk(outputPath)
Example #9
def getRainfallValues(input, output):
    prefixes = "time: <http://www.w3.org/2006/time#>\n" \
               "cbo: http://comicmeta.org/cbo/price"
    i = 0
    short_ns = "eg"

    ds_name = "%s:dataset-%s " % (short_ns, "prices")
    ds_chirps = Dataset(ds_name)

    with open(input, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if row['Dekad'] == "1":
                i += 1
                subject = "%s:obs%d " % (short_ns, i)
                obs = Observation(subject=subject)
                obs.addDimension(p="dbpedia:month", o=row['Month'])
                obs.addDimension(p="dbpedia:year", o=row['Year'])
                obs.addMeasure(p="cf-feature:rainfall_amount",
                               o=row['Rainfall (mm)'])

                ds_chirps.addObservation(obs)

    ds_chirps.saveToDisk(output)
Example #10
def generate_model(model_dirpath: str,
                   x_leftmost: int = 1,
                   x_rightmost: int = 102) -> keras.Model:
    '''Generate (create, train, validate, test and save) a new model based on a generated temporary dataset.

    :param model_dirpath: the path where the model's files will be saved
    :param x_leftmost: the leftmost bound of the generated temporary dataset's X
    :param x_rightmost: the rightmost bound of the generated temporary dataset's X

    :returns: trained multi-layer perceptron NN model

    '''
    ##############################
    # Generate temporary dataset
    ##############################
    num_of_features = x_rightmost - x_leftmost - 1
    num_of_datapoints = 10000
    ds = Dataset(x_leftmost, x_rightmost, num_of_datapoints)
    ds.generate()

    #######
    # Prepare vectorized subsets for the model
    #######
    print("Vectorizing subsets...", end="", flush=True)

    # Get subsets (as input (X) and output (Y) variables) in matrix form from generated dataset
    X_train, y_train = ds.vectorized_X_Y_train
    X_validation, y_validation = ds.vectorized_X_Y_valid
    X_test, y_test = ds.vectorized_X_Y_test

    print("...OK\n")

    ##############################
    # Create the model (multi-layer perceptron)
    ##############################
    print("Model creating...", end="", flush=True)

    #######
    # Model specification
    #######
    # Input - datapoint's feature vector
    mlp = keras.Sequential([
        layers.Input(shape=(num_of_features, ),
                     dtype=np.float64,
                     name="input_layer"),
        layers.Dense(num_of_features * 2,
                     activation='relu',
                     name="hidden_layer1"),
        layers.Dropout(0.2),  # Prevent overfitting
        layers.Dense(num_of_features * 2,
                     activation='relu',
                     name="hidden_layer2"),
        layers.Dropout(0.2),
        layers.Dense(1, activation='sigmoid', name="output_layer")
    ])
    # Output - probability of the given feature vector belonging to a 2-spiked Gaussian Mixture class

    #######
    # Model compilation
    #######
    mlp.compile(optimizer='Adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
    print("...OK\n")

    #######
    # Train and validate the model
    #######
    print("Model training...")

    training_results = mlp.fit(X_train,
                               y_train,
                               validation_data=(X_validation, y_validation),
                               epochs=100,
                               batch_size=8,
                               verbose=2,
                               shuffle=True)

    print('TRAIN {accuracy: %.2f}, {loss: %.2f}' %
          (training_results.history['accuracy'][-1] * 100,
           training_results.history['loss'][-1]))
    print('VALID {accuracy: %.2f}, {loss: %.2f}\n' %
          (training_results.history['val_accuracy'][-1] * 100,
           training_results.history['val_loss'][-1]))

    #######
    # Test the model
    #######
    print("Model evaluation...")

    test_results = mlp.evaluate(X_test, y_test, verbose=0)
    print("TEST {%s: %.2f}, {loss: %.2f}\n" %
          (mlp.metrics_names[1], test_results[1] * 100, test_results[0]))

    #######
    # Save model
    #######
    print("Saving model...", end="", flush=True)

    try:
        # Save model's computational graph with weights
        mlp.save(model_dirpath)

    except IOError as e:
        print("...IOError: <" + str(e) + ">\n")
    else:
        print("...OK\n")

    return mlp
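A minimal usage sketch for generate_model above (the save path is hypothetical; with the default bounds, num_of_features works out to 102 - 1 - 1 = 100):

# Hypothetical call; "models/demo_mlp" is an illustrative directory path.
import numpy as np
model = generate_model("models/demo_mlp")
probs = model.predict(np.random.rand(1, 100))  # one random 100-feature vector
print(probs)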
Example #11
    # Tasks to run
    do_cv_set_creation  = config.get_entry("global", "cv_set_creation")
    do_pheno_imputation = config.get_entry("global", "phenotype_imputation")
    do_univ_feature_sel = config.get_entry("global", "univ_feature_sel")
    do_random_forest    = config.get_entry("global", "random_forest")
    
    # -------------------------------------------------------------------------
    # Create the log file
    logging.basicConfig(filename="%s/exec.log" % output_dir, filemode='w',
                        level=logging.INFO,
                        format="[%(asctime)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")

    # -------------------------------------------------------------------------
    # Load dataset in Dataset class object
    logging.info("Loading dataset")
    data = Dataset()
    data.load_dataset(config)
    logging.info("End")

    # -------------------------------------------------------------------------
    # Create the cotraining folds
    # This step creates an index indicating what records are randomly assigned 
    # to sets I, II and III
    if do_cv_set_creation:
        logging.info("Starting task: cv_set_creation")
        cv_set_creation(data.num_samples, config)
        logging.info("End")
    else:
        logging.info("Skipping task: cv_set_creation")

    # -------------------------------------------------------------------------
Example #12
                            print("Yield batch")
                        # print("sample_in_batch_counter", sample_in_batch_counter)
                        # print("batch_partial_sequences", batch_partial_sequences)
                        # print("batch_next_words", batch_next_words)
                        yield ([batch_input_images, batch_partial_sequences], batch_next_words)

                        batch_input_images = []
                        batch_partial_sequences = []
                        batch_next_words = []
                        sample_in_batch_counter = 0


if __name__ == "__main__":
    input_path = "../../../datasets/navbar-dataset/"
    output_path = "../../../datasets/test_gen/"
    dataset = Dataset()
    # generate_binary_sequences=False means the partial_sequences are not one-hot encoded
    dataset.load(input_path, generate_binary_sequences=False)
    # Generate the meta_dataset.npy file
    dataset.save_metadata(output_path)
    # Generate the words.vocab file
    dataset.voc.save(output_path)

    gui_paths, img_paths = Dataset.load_paths_only(input_path)

    time_start = time.time()
    test_gen = Generator()
    gen = test_gen.data_generator(dataset.voc, gui_paths, img_paths, 32)
    next(gen)
    time_end = time.time()
    print('time cost', (time_end - time_start), 's')
Example #13
close_index = 0

# Number of features for prediction
features_count = df_array.shape[1]
num_units = 64
learning_rate = 0.0001
activation_function = 'sigmoid'
adam = Adam(lr=learning_rate)
loss_function = 'mse'
batch_size = 256
num_epochs = 100

# Train on 80% of the dataset
train_split = int(df_array.shape[0] * 0.8)

dataset = Dataset()
x_train, y_train = dataset.prepare(scaled, 0, train_split, with_y=True)
x_test, y_test = dataset.prepare(scaled, train_split, None, with_y=True)

y_test_inverse = y_min_max_scaler.inverse_transform(y_test)

def train(x_train, y_train, x_test, y_test):
    # Initialize the RNN
    model = Sequential()
    model.add(LSTM(units=num_units, return_sequences=True, input_shape=(None, features_count)))
    model.add(Dropout(0.5))
    model.add(LSTM(units=num_units, return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(units=num_units, return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(units=num_units))
Example #14
import sys
import pandas as pd
import numpy as np
from matplotlib.pyplot import figure
import seaborn as sns
from matplotlib import pyplot as plt

from classes.dataset import Dataset

file_name = sys.argv[1]

dataset = Dataset()
df = pd.read_csv(file_name, index_col=None)
df = df.sort_values('datetime')

df = df.dropna()
#df = df[22000:22300]

# Number of features for prediction
features_count = df.shape[1]

# Make predictions using the reference model
predictions_reference, y = dataset.predict(df, './models/reference_model', True)

# Make predictions using the latest model if available
try:
    has_latest_model = True
    predictions_latest, _ = dataset.predict(df, './models/latest', True)
    print('Mean error latest : ' + str(dataset.mean_error(np.array(predictions_latest), y)))
    success_rate_latest = dataset.get_trend_success_rate(predictions_latest, y, df)
    print('Trend prediction success for latest : ' + str(success_rate_latest) + '%')
Example #15
def load_mnist_dataset() -> Dataset:
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    X_val = None
    y_val = None
    return Dataset(X_train, X_test, y_train, y_test, X_val, y_val)
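A short combined sketch, assuming the Dataset returned here is the same container that Example #7's preprocess_dataset expects (X_train/X_test/y_train/y_test attributes):

# Hedged sketch: load MNIST, then apply the preprocessing from Example #7.
dataset = load_mnist_dataset()
preprocess_dataset(dataset)
print(dataset.X_train.shape)  # expected (60000, 28, 28, 1) for the MNIST training set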