class PipelineTest(unittest.TestCase):
    """End-to-end tests for dataset id generation, dataset retrieval and the
    remote pipeline function API."""

    def setUp(self):
        # Share a single CredentialsStore between both clients.
        # (The original built two independent stores for no benefit;
        # APITest.setUp already uses the shared-store pattern.)
        credStore = CredentialsStore()
        self.mi = MarketInsights(credStore)
        self.fun = Functions(credStore)

    def testDatasetGeneration(self):
        """Check id generation and that the stored dataset matches a golden CSV."""
        DATASET_ID = "4234f0f1b6fcc17f6458696a6cdf5101"
        dataset_desc_file = root_dir + "../../../marketinsights-data/datasets/WallSt-FinalTradingHour.json"
        with open(dataset_desc_file) as data_file:
            dataset_desc = json.load(data_file)["dataset_desc"]

        # Test id generation
        self.assertEqual(Dataset.generateId(dataset_desc, "DOW"), DATASET_ID)

        ds, _ = self.mi.get_dataset_by_id(DATASET_ID)
        ds = ds["2013-01-01":"2017-05-18"]

        # round_trip precision so the CSV floats compare exactly to the API's.
        expectedDS = pandas.read_csv(root_dir + "testDataset_DOW.csv",
                                     index_col=0,
                                     parse_dates=True,
                                     float_precision='round_trip')
        # Golden file is stored naive-UTC; the API returns US/Eastern timestamps.
        expectedDS.index = expectedDS.index.tz_localize("UTC").tz_convert("US/Eastern")
        expectedDS.columns = ds.columns

        # Test correct dataset output
        self.assertTrue(expectedDS.equals(ds))

    def testPipelineAPI(self):
        """Feed raw OHLC data through the remote 'marketdirection' function."""
        dataset_desc_file = root_dir + "../../../marketinsights-data/datasets/WallSt-FinalTradingHour.json"
        with open(dataset_desc_file) as data_file:
            dataset_desc = json.load(data_file)["dataset_desc"]
        ppl_desc = dataset_desc["pipeline"]["pipeline_desc"]

        with open(root_dir + "data/testRawData.json") as data_file:
            testRawData = json.load(data_file)

        data = Dataset.jsontocsv(testRawData)
        data.columns = ["Open", "High", "Low", "Close"]

        csvData = {
            "data": json.loads(data.to_json(orient='split', date_format="iso")),
            "dataset": ppl_desc
        }

        ds = self.fun.call_function("marketdirection", csvData)

        # Test correct pipeline output
        self.assertTrue(np.allclose(ds[0][0], 0.568182))
def __init__(self, name, env, credstore, mi_models, aggMethod, threshold=0, barOnly=False, debug=False):
    """Initialise the model wrapper.

    Args:
        name: Model name, forwarded to Model.__init__.
        env: Environment identifier, forwarded to Model.__init__.
        credstore: Credentials used to build the MarketInsights and Functions clients.
        mi_models: Model configuration(s), stored as self.modelConfig.
        aggMethod: Aggregation method applied to predictions downstream.
        threshold: Signal threshold (default 0).
        barOnly: Stored flag -- assumed to restrict output to bar signals; TODO confirm.
        debug: Enables debug behaviour in downstream calls.
    """
    Model.__init__(self, name, env)
    # Single assembly built from API + functions clients sharing one credential store.
    self.miassembly = MIAssembly(MarketInsights(credstore), Functions(credstore))
    self.modelConfig = mi_models
    self.aggMethod = aggMethod
    self.threshold = threshold
    self.barOnly = barOnly
    self.debug = debug
    # (Removed the redundant bare `return` that ended the original __init__.)
def __init__(self, name, env, credstore, dataset_id, training_run_id, threshold=0, barOnly=False, debug=False):
    """Initialise the model wrapper for a specific dataset / training run.

    Args:
        name: Model name, forwarded to Model.__init__.
        env: Environment identifier, forwarded to Model.__init__.
        credstore: Credentials used to build the MarketInsights and Functions clients.
        dataset_id: Id of the dataset to score against.
        training_run_id: Id of the training run providing the trained weights.
        threshold: Signal threshold (default 0).
        barOnly: Stored flag -- assumed to restrict output to bar signals; TODO confirm.
        debug: Enables debug behaviour in downstream calls.
    """
    Model.__init__(self, name, env)
    # Single assembly built from API + functions clients sharing one credential store.
    self.miassembly = MIAssembly(MarketInsights(credstore), Functions(credstore))
    self.dataset_id = dataset_id
    self.training_run_id = training_run_id
    self.threshold = threshold
    self.barOnly = barOnly
    self.debug = debug
    # (Removed the redundant bare `return`; attribute order matched to the
    # sibling constructor for readability.)
def setUp(self):
    """Load two market datasets, interleave them, and prepare train/test splits."""
    mi = MarketInsights(cred)
    fun = Functions(cred)
    self.miassembly = MIAssembly(mi, fun)

    # Register the training run under a deterministic key derived from its inputs.
    TRAINING_RUN["id"] = cos.generateKey(
        [str(TRAINING_RUN["datasets"]), str(TRAINING_RUN["model_id"])])
    mi.put_training_run(TRAINING_RUN)

    self.CONFIG = mi.get_model(MODEL_ID)
    train_cfg = self.CONFIG["training"]

    print("Creating model...")
    # Create ML model
    self.ffnn = Model(NUM_FEATURES, NUM_LABELS, self.CONFIG)

    mkt1, _ = mi.get_dataset_by_id(DATASET_ID1)
    mkt2, _ = mi.get_dataset_by_id(DATASET_ID2)

    # Restrict both markets to the configured training date range, when present.
    if "training_end_date" in train_cfg:
        start = train_cfg["training_start_date"]
        end = train_cfg["training_end_date"]
        mkt1 = mkt1[start:end]
        mkt2 = mkt2[start:end]

    # Interleave (part of the "added insight" for this model)
    self.mkt1, self.mkt2, self.isect = ppl.intersect(mkt1, mkt2)
    self.dataset = ppl.interleave(self.mkt1, self.mkt2)

    self.TRAINING_SET_SIZE = train_cfg["training_window_size"]
    self.TEST_SET_SIZE = len(self.dataset) - self.TRAINING_SET_SIZE
    self.WINDOW_SIZE = self.TRAINING_SET_SIZE

    # Labels for the held-out tail of the interleaved dataset.
    _, self.test_y = ppl.splitCol(self.dataset[self.TRAINING_SET_SIZE:],
                                  NUM_FEATURES)
def __init__(self, cred):
    """Build the object-store and MarketInsights API clients from one credential set."""
    self.mi = MarketInsights(cred)
    self.cos = CloudObjectStore(cred)
class MIModelClient():
    """Scores datasets against trained model weights held in cloud object storage.

    Caches the most recently constructed model instance keyed by model id.
    """

    # Cache of the last model built (one entry): its id and instance.
    modelId = None
    modelInstance = None

    def __init__(self, cred):
        self.cos = CloudObjectStore(cred)
        self.mi = MarketInsights(cred)

    def score(self, training_id, dataset):
        """Run `dataset` through the model trained in `training_id`.

        Returns a JSON-decoded predictions table, or an error string when the
        training run or its weights cannot be found.
        """
        training_run = self.mi.get_training_run(training_id)
        if (not training_run):
            return "No Training Id found"
        if (not self.cos.keyExists(COS_BUCKET, training_id)):
            return "No trained weights found for this training id"
        model_id = training_run["model_id"]
        _, dataset_desc = self.mi.get_dataset_by_id(
            training_run["datasets"][0])  # TODO this is too heavyweight just to get the desc
        weights = self.cos.get_csv(COS_BUCKET, training_id)
        model = self.getModelInstance(model_id, dataset_desc["features"],
                                      dataset_desc["labels"])
        index = pd.DatetimeIndex(dataset["index"], tz=pytz.timezone(dataset["tz"]))
        # Timestamps are passed to the predictor as epoch seconds.
        predictions = self.getPredictions(model, index.astype(np.int64) // 10**9,
                                          np.array(dataset["data"]), weights)
        return json.loads(
            Dataset.csvtojson(pd.DataFrame(predictions, index), None, None,
                              createId=False))

    def getModelInstance(self, model_id, features, labels):
        """Return a cached model for `model_id`, building a new one on miss."""
        # BUG FIX: the original used `is not`, comparing string ids by identity.
        # That only worked when CPython happened to intern both strings; compare
        # by value instead.
        if model_id != self.modelId:
            self.modelInstance = self.createModelInstance(model_id, features, labels)
            self.modelId = model_id
        return self.modelInstance

    def createModelInstance(self, model_id, features, labels):
        """Fetch the model config from the API and construct the ML model."""
        model_config = self.mi.get_model(model_id)
        return Model(features, labels, model_config)

    def getPredictions(self, model, timestamps, dataset, weights):
        """Predict each dataset row using the latest weights available at its timestamp.

        `weights` is a table whose first column "timestamp" marks the training
        period; the remaining columns are the weight values. Rows dated before
        the earliest weight period score as zeros.
        WARNING: only works for single-label data (see reshape below).
        """
        wPeriods = weights["timestamp"].values
        # Number of weight rows per training period (assumed constant).
        tsPerPeriod = np.sum(wPeriods == wPeriods[0])

        # For each dataset timestamp, find the latest weight period at or before it;
        # 0 means "no weights available yet".
        latestPeriods = np.zeros(len(timestamps))
        uniqueWPeriods = np.unique(wPeriods)
        mask = timestamps >= np.min(uniqueWPeriods)
        latestPeriods[mask] = [
            uniqueWPeriods[uniqueWPeriods <= s][-1] for s in timestamps[mask]
        ]

        # Score each group of rows sharing a weight period in one batch.
        results = np.empty((len(dataset), tsPerPeriod))
        for x in np.unique(latestPeriods):
            mask = latestPeriods == x
            if (x == 0):
                # No trained weights precede these rows.
                predictions = np.zeros((len(dataset[mask]), tsPerPeriod))
            else:
                predictions = model.predict(weights[wPeriods == x].values[:, 1:],
                                            dataset[mask])
                predictions = predictions.reshape(
                    tsPerPeriod, len(dataset[mask])
                ).T  # WARNING : ONLY WORKS FOR SINGLE LABEL DATA
            results[mask] = predictions
        return results

    # This version of the function returns predictions from each trained model
    # that is associated with all previous time periods - rather than a single
    # prediction for the current (or most recent) time period.
    # The results from using this tend to be far more stable, but ultimately
    # lower than predictions from models associated with recent time periods.
    def _getPredictions_Historical(self, model, timestamps, dataset, weights):
        """Deprecated variant: score each row under every weight period it postdates."""
        wPeriods = weights["timestamp"].values
        tsPerPeriod = np.sum(wPeriods == wPeriods[0])

        # Latest weight period at or before each timestamp (0 = none).
        latestPeriods = np.zeros(len(timestamps))
        uniqueWPeriods = np.unique(wPeriods)
        mask = timestamps >= np.min(uniqueWPeriods)
        latestPeriods[mask] = [
            uniqueWPeriods[uniqueWPeriods <= s][-1] for s in timestamps[mask]
        ]

        # Each row accumulates scores from every period it is eligible for.
        results = [np.array([])] * len(dataset)
        for x in np.unique(latestPeriods):
            mask = [i for i in range(len(results)) if latestPeriods[i] >= x]
            predictions = model.predict(weights[wPeriods == x].values[:, 1:],
                                        dataset[mask])
            scores = predictions.reshape(
                tsPerPeriod, len(dataset[mask])
            ).T  # WARNING : ONLY WORKS FOR SINGLE LABEL DATA
            for i in range(0, len(mask)):
                results[mask[i]] = np.append(results[mask[i]], scores[i])
        return results
def setUp(self):
    """Wire up an MIAssembly backed by a shared credentials store."""
    credentials = CredentialsStore()
    self.cred = credentials
    self.mi = MarketInsights(credentials)
    self.miassembly = MIAssembly(self.mi, Functions(credentials))
class APITest(unittest.TestCase):
    # End-to-end tests for the MarketInsights prediction API. These hit the
    # live service and assert against previously recorded aggregate values.

    def setUp(self):
        self.cred = CredentialsStore()
        self.mi = MarketInsights(self.cred)
        fun = Functions(self.cred)
        self.miassembly = MIAssembly(self.mi, fun)

    def testEndToEndPredictionFromDataset(self):
        # Fetch predictions for a known dataset / training-run pair and check
        # the mean-aggregated sum against a recorded value.
        TRAINING_RUN_ID = "94b227b9d7b22c920333aa36d23669c8"
        DATASET_ID = "4234f0f1b6fcc17f6458696a6cdf5101"
        # Alternative local-scoring path, kept for reference:
        #mc = MIModelClient(self.cred)
        #results = self.miassembly.get_local_predictions_with_dataset_id(mc, DATASET_ID, TRAINING_RUN_ID, start="2016-07-01", end="2016-07-15", debug=True)
        #results = pd.DataFrame(results["data"], results["index"])
        results = self.miassembly.get_predictions_with_dataset_id(
            DATASET_ID,
            TRAINING_RUN_ID,
            start="2016-07-01",
            end="2016-07-15",
            debug=True)
        results = mlutils.aggregatePredictions([results], "mean_all")
        '''
        # Results should look like this
        Date_Time
        2016-07-01 15:00:00-04:00    0.000000e+00
        2016-07-05 15:00:00-04:00    0.000000e+00
        2016-07-06 15:00:00-04:00    0.000000e+00
        2016-07-07 15:00:00-04:00    6.174025e-03
        2016-07-08 15:00:00-04:00    8.180070e-01
        2016-07-11 15:00:00-04:00    1.000000e+00
        2016-07-12 15:00:00-04:00    3.874419e-06
        2016-07-13 15:00:00-04:00    9.999999e-01
        2016-07-14 15:00:00-04:00    3.974110e-11
        2016-07-15 15:00:00-04:00    3.007612e-01
        '''
        self.assertEqual(np.nansum(results), 3.124945995554477)

    def testEndToEndPredictionFromRawData(self):
        # Same training run, but predictions requested from raw OHLC data
        # rather than a stored dataset id; expected aggregate is identical.
        TRAINING_RUN_ID = "94b227b9d7b22c920333aa36d23669c8"
        with open(root_dir + "data/testRawData.json") as data_file:
            testRawData = json.load(data_file)
        data = Dataset.jsontocsv(testRawData)
        data.columns = ["Open", "High", "Low", "Close"]
        results = self.miassembly.get_predictions_with_raw_data(
            data, TRAINING_RUN_ID)
        results = mlutils.aggregatePredictions([results], "mean_all")
        '''
        Date_Time
        2016-07-01 15:00:00-04:00    0.000000e+00
        2016-07-05 15:00:00-04:00    0.000000e+00
        2016-07-06 15:00:00-04:00    0.000000e+00
        2016-07-07 15:00:00-04:00    6.174025e-03
        2016-07-08 15:00:00-04:00    8.180070e-01
        2016-07-11 15:00:00-04:00    1.000000e+00
        2016-07-12 15:00:00-04:00    3.874419e-06
        2016-07-13 15:00:00-04:00    9.999999e-01
        2016-07-14 15:00:00-04:00    3.974110e-11
        2016-07-15 15:00:00-04:00    3.007612e-01
        '''
        self.assertEqual(np.nansum(results), 3.124945995554477)

    # NOTE(review): DeprecationWarning is not a decorator; applying it rebinds
    # the attribute to a warning instance. Harmless here only because the
    # leading underscore already hides the method from unittest discovery --
    # confirm intent before relying on it.
    @DeprecationWarning
    def _test_predictions(self):
        # Round-trips predictions through put/get/delete for a test market.
        predictions = pd.read_csv(root_dir + 'data/testPredictions.csv',
                                  index_col=0,
                                  parse_dates=True,
                                  header=None)

        # Clean up
        print("Cleaning up")
        resp = self.mi.delete_predictions("testMkt", "testModelId", debug=False)

        print("Posting predictions")
        resp = self.mi.put_predictions(predictions,
                                       "testMkt",
                                       "testModelId",
                                       debug=False)
        self.assertTrue('success' in resp)

        resp = self.mi.get_predictions("testMkt", "testModelId")
        self.assertTrue(predictions.index.equals(resp.index))
        self.assertTrue(np.allclose(predictions.values, resp.values))

        # Shuffle values and update stored predictions
        predictions2 = ppl.shuffle(predictions)
        predictions2.index = predictions.index
        predictions = predictions2

        print("Updating predictions")
        resp = self.mi.put_predictions(predictions,
                                       "testMkt",
                                       "testModelId",
                                       update=True)
        self.assertTrue('success' in resp)

        resp = self.mi.get_predictions("testMkt", "testModelId")
        self.assertTrue(predictions.index.equals(resp.index))
        self.assertTrue(np.allclose(predictions.values, resp.values))

        print("Cleaning up")
        resp = self.mi.delete_predictions("testMkt", "testModelId")
        resp = self.mi.get_predictions("testMkt", "testModelId")
        self.assertTrue(resp.empty)
def setUp(self):
    """Create the API and Functions clients from a single shared credentials store.

    (The original constructed two independent CredentialsStore instances;
    one shared store matches the pattern used by the other test fixtures.)
    """
    credStore = CredentialsStore()
    self.mi = MarketInsights(credStore)
    self.fun = Functions(credStore)