def __init__(self, exchange, pair, interval, past_history, model_file):
    self.exchange = exchange
    self.pair = pair
    self.interval = interval
    self.past_history = past_history
    self.dataset = Dataset()
    self.model_file = model_file
def prepare_data_for_training(data_input_folder, validation_split, augment_training_data):
    dataset = Dataset(data_input_folder)
    training_path, validation_path = dataset.split_datasets(validation_split)
    dataset.preprocess_data(training_path, validation_path, augment_training_data)
    return training_path, validation_path
def to_dateframe(data, timestamp=None):
    columns = ['datetime', 'open', 'high', 'low', 'close', 'vwap', 'volume', 'count']
    df = pd.DataFrame(data, columns=columns)
    df = df.astype(float)

    dataset = Dataset()
    df = dataset.add_indicators(df)
    df.dropna(inplace=True)

    # Keep only datetime, close and the computed indicators
    df.pop('vwap')
    df.pop('count')
    df.pop('open')
    df.pop('high')
    df.pop('low')
    df.pop('volume')

    # Convert the datetime column to Unix seconds
    df['datetime'] = df.datetime.values.astype(np.int64) // 10**9

    if timestamp:
        df = df[df['datetime'] < timestamp]
    return df
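# Side note (illustration only, not from the original module): the `// 10**9`
# step above converts pandas datetime64[ns] values to Unix seconds. A
# self-contained example of just that conversion:
import numpy as np
import pandas as pd

demo = pd.DataFrame({'datetime': pd.to_datetime(['2021-01-01 00:00:00',
                                                 '2021-01-01 01:00:00'])})
demo['datetime'] = demo.datetime.values.astype(np.int64) // 10**9
print(demo['datetime'].tolist())  # [1609459200, 1609462800]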
def init_dataset(database_name):
    """
    PURPOSE
        Read in dataset. Set up the test and learn sets.

    INPUT
        database_name: name (and path) of database

    OUTPUT
        newdata: dataset object
    """
    newdata = Dataset()
    newdata.get_data(database_name)
    return newdata
class TradingBot:
    def __init__(self, exchange, pair, interval, past_history, model_file):
        self.exchange = exchange
        self.pair = pair
        self.interval = interval
        self.past_history = past_history
        self.dataset = Dataset()
        self.model_file = model_file

    def predict_price(self, timestamp):
        # Get the timestamp of the last known point, 24h before the given one
        last_timestamp = int(timestamp - (60 * 60 * 24))
        df = self.exchange.fetch(timestamp=last_timestamp,
                                 pair=self.pair,
                                 hours_back=self.past_history * 2,
                                 interval=self.interval)
        # Only keep the last entries needed
        df = df.tail(self.past_history)
        predictions, y = self.dataset.predict(df, self.model_file, with_y=False)
        print(predictions)
        print(datetime.datetime.fromtimestamp(timestamp))
        exit()

    def tick(self):
        return
def getPrices(input, output):
    prices15 = []
    i = 0
    short_ns = "eg"
    ds_name = "%s:dataset-%s " % (short_ns, "prices")
    ds_prices = Dataset(ds_name)

    with open(input, newline='') as pricesCSV:
        reader = csv.DictReader(pricesCSV)
        for row in reader:
            price = row['Price']
            month = row['Month']
            year = row['Year']
            i += 1
            subject = "%s:obs%d " % (short_ns, i)
            if year == "2015" or year == "2016" or year == "2017":
                obs = Observation(subject=subject)
                prices15.append((price, month))
                obs.addDimension(p="dbpedia:month", o=month)
                obs.addDimension(p="dbpedia:year", o=year)
                obs.addMeasure(p="cbo:price", o=price)
                ds_prices.addObservation(obs)

    ds_prices.saveToDisk(output)
def preprocess_dataset(dataset: Dataset):
    # Reshape the images to (28, 28, 1) so they match the expected input shape
    dataset.X_train = dataset.X_train.reshape(dataset.X_train.shape[0], 28, 28, 1)
    dataset.X_test = dataset.X_test.reshape(dataset.X_test.shape[0], 28, 28, 1)
    input_shape = (28, 28, 1)

    # Convert the 0-9 digit labels to one-hot class vectors
    num_classes = 10
    dataset.y_train = tensorflow.keras.utils.to_categorical(
        dataset.y_train, num_classes=num_classes)
    dataset.y_test = tensorflow.keras.utils.to_categorical(
        dataset.y_test, num_classes=num_classes)

    # Scale pixel values to [0, 1]
    dataset.X_train = dataset.X_train.astype("float32")
    dataset.X_test = dataset.X_test.astype("float32")
    dataset.X_train /= 255
    dataset.X_test /= 255
def getRainfall(path, outputPath):
    short_ns = "eg"
    ds_name = "%s:dataset-%s " % (short_ns, "prices")
    ds_rainfall = Dataset(ds_name)

    # Obtain rainfall aggregate values from CHIRPS
    ds, img_array, zim, gdf = readImage(path)
    total = getSum(gdf, zim, img_array, ds)

    subject1 = "%s:obs1" % short_ns
    obs1 = Observation(subject=subject1)
    obs1.addDimension(p="dbpedia:month", o="1")
    obs1.addDimension(p="dbpedia:year", o="2015")
    obs1.addMeasure(p="cf-feature:rainfall_amount", o=total)
    print("Total rainfall 2015: %s" % total)

    ds_rainfall.addObservation(obs1)
    ds_rainfall.saveToDisk(outputPath)
def getRainfallValues(input, output):
    prefixes = "time : <http://www.w3.org/2006/time#>" \
               "cbo: http://comicmeta.org/cbo/price"
    i = 0
    short_ns = "eg"
    ds_name = "%s:dataset-%s " % (short_ns, "prices")
    ds_chirps = Dataset(ds_name)

    with open(input, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Keep only the first dekad (10-day period) of each month
            if row['Dekad'] == "1":
                i += 1
                subject = "%s:obs%d " % (short_ns, i)
                obs = Observation(subject=subject)
                obs.addDimension(p="dbpedia:month", o=row['Month'])
                obs.addDimension(p="dbpedia:year", o=row['Year'])
                obs.addMeasure(p="cf-feature:rainfall_amount", o=row['Rainfall (mm)'])
                ds_chirps.addObservation(obs)

    ds_chirps.saveToDisk(output)
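# A minimal sketch of the same CSV-to-RDF idea using rdflib instead of the
# Dataset / Observation helpers above (rdflib is an assumption here, not the
# library this project uses); namespaces and output format are illustrative.
import csv
from rdflib import Graph, Literal, Namespace

EG = Namespace("http://example.org/")            # hypothetical namespace
DBPEDIA = Namespace("http://dbpedia.org/resource/")

def rainfall_csv_to_rdf(input_path, output_path):
    g = Graph()
    g.bind("eg", EG)
    i = 0
    with open(input_path, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            if row['Dekad'] != "1":
                continue
            i += 1
            obs = EG[f"obs{i}"]
            g.add((obs, DBPEDIA.month, Literal(row['Month'])))
            g.add((obs, DBPEDIA.year, Literal(row['Year'])))
            g.add((obs, EG.rainfall_amount, Literal(row['Rainfall (mm)'])))
    g.serialize(destination=output_path, format="turtle")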
def generate_model(model_dirpath: str, x_leftmost: int = 1, x_rightmost: int = 102) -> keras.Model:
    '''Generate (create, train, validate, test and save) a new model based on a
    generated temporary dataset.

    :param model_dirpath: the path where the model's files will be saved
    :param x_leftmost: the leftmost bound of the generated temporary dataset's X
    :param x_rightmost: the rightmost bound of the generated temporary dataset's X
    :returns: trained multi-layer perceptron NN model
    '''
    ##############################
    # Generate temporary dataset
    ##############################
    num_of_features = x_rightmost - x_leftmost - 1
    num_of_datapoints = 10000
    ds = Dataset(x_leftmost, x_rightmost, num_of_datapoints)
    ds.generate()

    #######
    # Prepare vectorized subsets for the model
    #######
    print("Vectorizing subsets...", end="", flush=True)
    # Get subsets (as input (X) and output (Y) variables) in matrix form from the generated dataset
    X_train, y_train = ds.vectorized_X_Y_train
    X_validation, y_validation = ds.vectorized_X_Y_valid
    X_test, y_test = ds.vectorized_X_Y_test
    print("...OK\n")

    ##############################
    # Create the model (multi-layer perceptron)
    ##############################
    print("Model creating...", end="", flush=True)

    #######
    # Model specification
    #######
    # Input - datapoint's feature vector
    mlp = keras.Sequential([
        layers.Input(shape=(num_of_features, ), dtype=np.float64, name="input_layer"),
        layers.Dense(num_of_features * 2, activation='relu', name="hidden_layer1"),
        layers.Dropout(0.2),  # Prevent overfitting
        layers.Dense(num_of_features * 2, activation='relu', name="hidden_layer2"),
        layers.Dropout(0.2),
        layers.Dense(1, activation='sigmoid', name="output_layer")
    ])
    # Output - probability of the given feature vector belonging to a 2-spiked Gaussian Mixture class

    #######
    # Model compilation
    #######
    mlp.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])
    print("...OK\n")

    #######
    # Train and validate the model
    #######
    print("Model training...")
    training_results = mlp.fit(X_train, y_train,
                               validation_data=(X_validation, y_validation),
                               epochs=100, batch_size=8, verbose=2, shuffle=True)
    print('TRAIN {accuracy: %.2f}, {loss: %.2f}' % (
        training_results.history['accuracy'][-1] * 100,
        training_results.history['loss'][-1]))
    print('VALID {accuracy: %.2f}, {loss: %.2f}\n' % (
        training_results.history['val_accuracy'][-1] * 100,
        training_results.history['val_loss'][-1]))

    #######
    # Test the model
    #######
    print("Model evaluation...")
    test_results = mlp.evaluate(X_test, y_test, verbose=0)
    print("TEST {%s: %.2f}, {loss: %.2f}\n" % (
        mlp.metrics_names[1], test_results[1] * 100, test_results[0]))

    #######
    # Save model
    #######
    print("Saving model...", end="", flush=True)
    try:
        # Save model's computational graph with weights
        mlp.save(model_dirpath)
    except IOError as e:
        print("...IOError: <" + str(e) + ">\n")
    else:
        print("...OK\n")

    return mlp
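# Hedged usage sketch (not part of the original module): train a model with
# generate_model(), then reload it from disk and score a random feature
# vector. The "trained_mlp" directory name is hypothetical, and this still
# requires the Dataset class used above to be importable.
import numpy as np
from tensorflow import keras

model = generate_model("trained_mlp")
# or, in a later run: model = keras.models.load_model("trained_mlp")
sample = np.random.rand(1, 100).astype(np.float64)  # 100 features with the default bounds (102 - 1 - 1)
print("P(2-spiked Gaussian Mixture):", float(model.predict(sample)[0, 0]))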
# Tasks to run
do_cv_set_creation = config.get_entry("global", "cv_set_creation")
do_pheno_imputation = config.get_entry("global", "phenotype_imputation")
do_univ_feature_sel = config.get_entry("global", "univ_feature_sel")
do_random_forest = config.get_entry("global", "random_forest")

# -------------------------------------------------------------------------
# Create the log file
logging.basicConfig(filename="%s/exec.log" % output_dir,
                    filemode='w',
                    level=logging.INFO,
                    format="[%(asctime)s] %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S")

# -------------------------------------------------------------------------
# Load dataset into a Dataset class object
logging.info("Loading dataset")
data = Dataset()
data.load_dataset(config)
logging.info("End")

# -------------------------------------------------------------------------
# Create the co-training folds
# This step creates an index indicating which records are randomly assigned
# to sets I, II and III
if do_cv_set_creation:
    logging.info("Starting task: cv_set_creation")
    cv_set_creation(data.num_samples, config)
    logging.info("End")
else:
    logging.info("Skipping task: cv_set_creation")

# -------------------------------------------------------------------------
print("Yield batch") # print("sample_in_batch_counter", sample_in_batch_counter) # print("batch_partial_sequences", batch_partial_sequences) # print("batch_next_words", batch_next_words) yield ([batch_input_images, batch_partial_sequences], batch_next_words) batch_input_images = [] batch_partial_sequences = [] batch_next_words = [] sample_in_batch_counter = 0 if(__name__=="__main__"): input_path = "../../../datasets/navbar-dataset/" output_path = "../../../datasets/test_gen/" dataset = Dataset() # generate_binary_sequences=False 意味着不对partial_sequences进行one-hot编码 dataset.load(input_path, generate_binary_sequences=False) # 生成meta_dataset.npy文件 dataset.save_metadata(output_path) # 生成words.vocab文件 dataset.voc.save(output_path) gui_paths, img_paths = Dataset.load_paths_only(input_path) time_start = time.time() test_gen = Generator() gen = test_gen.data_generator(dataset.voc, gui_paths, img_paths,32) next(gen) time_end = time.time() print('time cost', (time_end - time_start), 's')
close_index = 0

# Number of features for prediction
features_count = df_array.shape[1]

num_units = 64
learning_rate = 0.0001
activation_function = 'sigmoid'
adam = Adam(learning_rate=learning_rate)
loss_function = 'mse'
batch_size = 256
num_epochs = 100

# Train on 80% of the dataset
train_split = int(df_array.shape[0] * 0.8)

dataset = Dataset()
x_train, y_train = dataset.prepare(scaled, 0, train_split, with_y=True)
x_test, y_test = dataset.prepare(scaled, train_split, None, with_y=True)
y_test_inverse = y_min_max_scaler.inverse_transform(y_test)


def train(x_train, y_train, x_test, y_test):
    # Initialize the RNN
    model = Sequential()
    model.add(LSTM(units=num_units, return_sequences=True, input_shape=(None, features_count)))
    model.add(Dropout(0.5))
    model.add(LSTM(units=num_units, return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(units=num_units, return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(units=num_units))
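# Hedged aside (not from the original script): y_min_max_scaler above is
# assumed to be a fitted sklearn MinMaxScaler, so inverse_transform maps the
# scaled targets back to price units. A self-contained example of that step:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

prices = np.array([[100.0], [150.0], [200.0]])
scaler = MinMaxScaler()
scaled_prices = scaler.fit_transform(prices)          # values scaled to [0, 1]
restored = scaler.inverse_transform(scaled_prices)    # back to original price units
assert np.allclose(restored, prices)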
import sys

import pandas as pd
import numpy as np
from matplotlib.pyplot import figure
import seaborn as sns
from matplotlib import pyplot as plt

from classes.dataset import Dataset

file_name = sys.argv[1]

dataset = Dataset()

df = pd.read_csv(file_name, index_col=None)
df = df.sort_values('datetime')
df = df.dropna()
#df = df[22000:22300]

# Number of features for prediction
features_count = df.shape[1]

# Make predictions using the reference model
predictions_reference, y = dataset.predict(df, './models/reference_model', True)

# Make predictions using the latest model if available
try:
    has_latest_model = True
    predictions_latest, _ = dataset.predict(df, './models/latest', True)
    print('Mean error latest : ' + str(dataset.mean_error(np.array(predictions_latest), y)))
    success_rate_latest = dataset.get_trend_success_rate(predictions_latest, y, df)
    print('Trend prediction success for latest : ' + str(success_rate_latest) + '%')
def load_mnist_dataset() -> Dataset:
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    X_val = None
    y_val = None
    return Dataset(X_train, X_test, y_train, y_test, X_val, y_val)
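# Hedged usage sketch: assuming the Dataset returned by load_mnist_dataset()
# exposes X_train / X_test / y_train / y_test attributes matching the
# constructor arguments above, it can be fed straight into preprocess_dataset()
# before building a (28, 28, 1)-input model. The shapes below hold for the
# standard Keras MNIST split (60,000 training images).
if __name__ == "__main__":
    data = load_mnist_dataset()
    preprocess_dataset(data)
    print(data.X_train.shape)  # expected: (60000, 28, 28, 1)
    print(data.y_train.shape)  # expected: (60000, 10)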