Example #1
class PipelineTest(unittest.TestCase):
    def setUp(self):
        self.mi = MarketInsights(CredentialsStore())
        credStore = CredentialsStore()
        self.fun = Functions(credStore)

    def testDatasetGeneration(self):

        DATASET_ID = "4234f0f1b6fcc17f6458696a6cdf5101"

        dataset_desc_file = root_dir + "../../../marketinsights-data/datasets/WallSt-FinalTradingHour.json"
        with open(dataset_desc_file) as data_file:
            dataset_desc = json.load(data_file)["dataset_desc"]

        # Test id generation
        self.assertEqual(Dataset.generateId(dataset_desc, "DOW"), DATASET_ID)

        ds, _ = self.mi.get_dataset_by_id(DATASET_ID)
        ds = ds["2013-01-01":"2017-05-18"]

        expectedDS = pandas.read_csv(root_dir + "testDataset_DOW.csv",
                                     index_col=0,
                                     parse_dates=True,
                                     float_precision='round_trip')
        expectedDS.index = expectedDS.index.tz_localize("UTC").tz_convert(
            "US/Eastern")
        expectedDS.columns = ds.columns

        # Test correct dataset output
        self.assertTrue(expectedDS.equals(ds))

    def testPipelineAPI(self):

        dataset_desc_file = root_dir + "../../../marketinsights-data/datasets/WallSt-FinalTradingHour.json"
        with open(dataset_desc_file) as data_file:
            dataset_desc = json.load(data_file)["dataset_desc"]

        ppl_desc = dataset_desc["pipeline"]["pipeline_desc"]

        with open(root_dir + "data/testRawData.json") as data_file:
            testRawData = json.load(data_file)

        data = Dataset.jsontocsv(testRawData)
        data.columns = ["Open", "High", "Low", "Close"]

        csvData = {
            "data": json.loads(data.to_json(orient='split',
                                            date_format="iso")),
            "dataset": ppl_desc
        }
        ds = self.fun.call_function("marketdirection", csvData)

        # Test correct pipeline output
        self.assertTrue(np.allclose(ds[0][0], 0.568182))
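The call_function payload above pairs a split-oriented JSON dump of the OHLC frame with the pipeline description, so the remote function receives index, columns and values as separate keys. A minimal sketch of assembling that payload by hand, assuming an arbitrary OHLC DataFrame; the frame contents and ppl_desc below are placeholders:

import json

import pandas as pd

# Placeholder OHLC frame standing in for Dataset.jsontocsv(testRawData)
data = pd.DataFrame(
    [[100.0, 101.5, 99.5, 101.0],
     [101.0, 102.0, 100.5, 101.8]],
    index=pd.to_datetime(["2016-07-01 15:00", "2016-07-05 15:00"]),
    columns=["Open", "High", "Low", "Close"])

# Placeholder; the test reads dataset_desc["pipeline"]["pipeline_desc"]
ppl_desc = {}

payload = {
    # orient='split' serialises index, columns and values separately
    # and round-trips cleanly through json.loads
    "data": json.loads(data.to_json(orient="split", date_format="iso")),
    "dataset": ppl_desc,
}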
Example #2
    def __init__(self,
                 name,
                 env,
                 credstore,
                 mi_models,
                 aggMethod,
                 threshold=0,
                 barOnly=False,
                 debug=False):
        Model.__init__(self, name, env)

        self.miassembly = MIAssembly(MarketInsights(credstore),
                                     Functions(credstore))
        self.modelConfig = mi_models
        self.aggMethod = aggMethod
        self.threshold = threshold
        self.barOnly = barOnly
        self.debug = debug
Example #3
    def __init__(self,
                 name,
                 env,
                 credstore,
                 dataset_id,
                 training_run_id,
                 threshold=0,
                 barOnly=False,
                 debug=False):
        Model.__init__(self, name, env)

        self.miassembly = MIAssembly(MarketInsights(credstore),
                                     Functions(credstore))
        self.dataset_id = dataset_id
        self.training_run_id = training_run_id
        self.threshold = threshold
        self.debug = debug
        self.barOnly = barOnly
Example #4
    def setUp(self):

        # Get dataset from the MI API

        print("Loading data...")

        mi = MarketInsights(cred)
        fun = Functions(cred)
        self.miassembly = MIAssembly(mi, fun)

        TRAINING_RUN["id"] = cos.generateKey(
            [str(TRAINING_RUN["datasets"]),
             str(TRAINING_RUN["model_id"])])

        mi.put_training_run(TRAINING_RUN)

        self.CONFIG = mi.get_model(MODEL_ID)
        TRN_CNF = self.CONFIG["training"]

        print("Creating model...")
        # Create ML model
        self.ffnn = Model(NUM_FEATURES, NUM_LABELS, self.CONFIG)

        mkt1, mkt1_desc = mi.get_dataset_by_id(DATASET_ID1)
        mkt2, mkt2_desc = mi.get_dataset_by_id(DATASET_ID2)

        # Crop training dates
        if "training_end_date" in TRN_CNF:
            mkt1 = mkt1[
                TRN_CNF["training_start_date"]:TRN_CNF["training_end_date"]]
            mkt2 = mkt2[
                TRN_CNF["training_start_date"]:TRN_CNF["training_end_date"]]

        # Interleave (part of the "added insight" for this model)
        self.mkt1, self.mkt2, self.isect = ppl.intersect(mkt1, mkt2)
        self.dataset = ppl.interleave(self.mkt1, self.mkt2)

        self.TRAINING_SET_SIZE = TRN_CNF["training_window_size"]
        self.TEST_SET_SIZE = len(self.dataset) - self.TRAINING_SET_SIZE
        self.WINDOW_SIZE = self.TRAINING_SET_SIZE

        _, self.test_y = ppl.splitCol(self.dataset[self.TRAINING_SET_SIZE:],
                                      NUM_FEATURES)
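ppl.intersect and ppl.interleave are not shown in this example. A minimal sketch of the behaviour the setUp appears to rely on, assuming intersect restricts both frames to their shared timestamps and interleave alternates their rows; both are assumptions about ppl, not its actual code:

import pandas as pd

def intersect(a, b):
    # Assumed: keep only the timestamps present in both frames
    isect = a.index.intersection(b.index)
    return a.loc[isect], b.loc[isect], isect

def interleave(a, b):
    # Assumed: take one row from each aligned frame in turn
    rows = [row for pair in zip(a.values, b.values) for row in pair]
    return pd.DataFrame(rows, columns=a.columns)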
Example #5
    def __init__(self, cred):
        self.cos = CloudObjectStore(cred)
        self.mi = MarketInsights(cred)
Example #6
class MIModelClient:

    # Cache the most recently built model instance, keyed by model id
    modelId = None
    modelInstance = None

    def __init__(self, cred):
        self.cos = CloudObjectStore(cred)
        self.mi = MarketInsights(cred)

    def score(self, training_id, dataset):

        training_run = self.mi.get_training_run(training_id)
        if not training_run:
            return "No Training Id found"
        if not self.cos.keyExists(COS_BUCKET, training_id):
            return "No trained weights found for this training id"

        model_id = training_run["model_id"]
        _, dataset_desc = self.mi.get_dataset_by_id(
            training_run["datasets"]
            [0])  # TODO this is too heavyweight just to get the desc
        weights = self.cos.get_csv(COS_BUCKET, training_id)
        model = self.getModelInstance(model_id, dataset_desc["features"],
                                      dataset_desc["labels"])
        index = pd.DatetimeIndex(dataset["index"],
                                 tz=pytz.timezone(dataset["tz"]))
        predictions = self.getPredictions(model,
                                          index.astype(np.int64) // 10**9,
                                          np.array(dataset["data"]), weights)
        return json.loads(
            Dataset.csvtojson(pd.DataFrame(predictions, index),
                              None,
                              None,
                              createId=False))

    def getModelInstance(self, model_id, features, labels):
        # Rebuild the cached model only when the model id changes
        if model_id != self.modelId:
            self.modelInstance = self.createModelInstance(
                model_id, features, labels)
            self.modelId = model_id
        return self.modelInstance

    def createModelInstance(self, model_id, features, labels):
        model_config = self.mi.get_model(model_id)
        # Create ML model
        return Model(features, labels, model_config)

    # Score each dataset row against the weights trained for its time period
    def getPredictions(self, model, timestamps, dataset, weights):

        # The weights frame holds one row per trained model instance, grouped
        # by timestamp; count how many rows make up a single period
        wPeriods = weights["timestamp"].values
        tsPerPeriod = np.sum(wPeriods == wPeriods[0])

        # For each dataset timestamp, find the latest weight timestamp at or
        # before it (0 where no weights exist yet)
        latestPeriods = np.zeros(len(timestamps))
        uniqueWPeriods = np.unique(wPeriods)
        mask = timestamps >= np.min(uniqueWPeriods)
        latestPeriods[mask] = [
            uniqueWPeriods[uniqueWPeriods <= s][-1] for s in timestamps[mask]
        ]

        # For each distinct weight period, score the dataset rows mapped to it
        results = np.empty((len(dataset), tsPerPeriod))
        for x in np.unique(latestPeriods):
            mask = latestPeriods == x
            if x == 0:
                # No weights exist yet for these timestamps
                predictions = np.zeros((len(dataset[mask]), tsPerPeriod))
            else:
                # run dataset entries matching that timestamp through model, save results against original timestamps
                predictions = model.predict(
                    weights[wPeriods == x].values[:, 1:], dataset[mask])
                predictions = predictions.reshape(
                    tsPerPeriod, len(dataset[mask])
                ).T  # WARNING : ONLY WORKS FOR SINGLE LABEL DATA
            results[mask] = predictions

        #results = np.nanmean(results, axis=0)

        return results

    # This version returns predictions from every trained model associated
    # with all previous time periods, rather than a single prediction from
    # the model for the current (or most recent) period. The results tend to
    # be far more stable, but generally lower than those from models tied to
    # recent periods.
    def _getPredictions_Historical(self, model, timestamps, dataset, weights):

        # The weights frame holds one row per trained model instance, grouped
        # by timestamp; count how many rows make up a single period
        wPeriods = weights["timestamp"].values
        tsPerPeriod = np.sum(wPeriods == wPeriods[0])

        # For each dataset timestamp, find the latest weight timestamp at or
        # before it (0 where no weights exist yet)
        latestPeriods = np.zeros(len(timestamps))
        uniqueWPeriods = np.unique(wPeriods)
        mask = timestamps >= np.min(uniqueWPeriods)
        latestPeriods[mask] = [
            uniqueWPeriods[uniqueWPeriods <= s][-1] for s in timestamps[mask]
        ]

        # For each distinct weight period, score every dataset row whose
        # matched period is at or after it
        results = [np.array([])] * len(dataset)
        for x in np.unique(latestPeriods):
            mask = [i for i in range(len(results)) if latestPeriods[i] >= x]
            predictions = model.predict(weights[wPeriods == x].values[:, 1:],
                                        dataset[mask])
            scores = predictions.reshape(tsPerPeriod, len(
                dataset[mask])).T  # WARNING : ONLY WORKS FOR SINGLE LABEL DATA

            for i in range(0, len(mask)):
                results[mask[i]] = np.append(results[mask[i]], scores[i])

            #results[mask] = mlutils.aggregatePredictions([pd.DataFrame(scores)], "vote_unanimous_pred").values

        #results = np.nanmean(results, axis=0)

        return results
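From the way score reads its dataset argument, it expects a dict carrying ISO timestamps under "index", a timezone name under "tz", and one row of feature values per timestamp under "data". A minimal usage sketch, assuming valid credentials in cred and a stored training run; the feature rows are placeholders:

client = MIModelClient(cred)

dataset = {
    "index": ["2016-07-01T15:00:00", "2016-07-05T15:00:00"],
    "tz": "US/Eastern",
    "data": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],  # placeholder feature rows
}

# Returns JSON predictions, or an error string when the training run
# or its stored weights cannot be found
result = client.score("94b227b9d7b22c920333aa36d23669c8", dataset)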
Example #7
    def setUp(self):

        self.cred = CredentialsStore()
        self.mi = MarketInsights(self.cred)
        fun = Functions(self.cred)
        self.miassembly = MIAssembly(self.mi, fun)
Example #8
class APITest(unittest.TestCase):
    def setUp(self):

        self.cred = CredentialsStore()
        self.mi = MarketInsights(self.cred)
        fun = Functions(self.cred)
        self.miassembly = MIAssembly(self.mi, fun)

    def testEndToEndPredictionFromDataset(self):

        TRAINING_RUN_ID = "94b227b9d7b22c920333aa36d23669c8"
        DATASET_ID = "4234f0f1b6fcc17f6458696a6cdf5101"

        #mc = MIModelClient(self.cred)
        #results = self.miassembly.get_local_predictions_with_dataset_id(mc, DATASET_ID, TRAINING_RUN_ID, start="2016-07-01", end="2016-07-15", debug=True)
        #results = pd.DataFrame(results["data"], results["index"])
        results = self.miassembly.get_predictions_with_dataset_id(
            DATASET_ID,
            TRAINING_RUN_ID,
            start="2016-07-01",
            end="2016-07-15",
            debug=True)
        results = mlutils.aggregatePredictions([results], "mean_all")
        '''
        Results should look like this:

        Date_Time
        2016-07-01 15:00:00-04:00  0.000000e+00
        2016-07-05 15:00:00-04:00  0.000000e+00
        2016-07-06 15:00:00-04:00  0.000000e+00
        2016-07-07 15:00:00-04:00  6.174025e-03
        2016-07-08 15:00:00-04:00  8.180070e-01
        2016-07-11 15:00:00-04:00  1.000000e+00
        2016-07-12 15:00:00-04:00  3.874419e-06
        2016-07-13 15:00:00-04:00  9.999999e-01
        2016-07-14 15:00:00-04:00  3.974110e-11
        2016-07-15 15:00:00-04:00  3.007612e-01
        '''

        self.assertAlmostEqual(np.nansum(results), 3.124945995554477)

    def testEndToEndPredictionFromRawData(self):

        TRAINING_RUN_ID = "94b227b9d7b22c920333aa36d23669c8"

        with open(root_dir + "data/testRawData.json") as data_file:
            testRawData = json.load(data_file)

        data = Dataset.jsontocsv(testRawData)
        data.columns = ["Open", "High", "Low", "Close"]

        results = self.miassembly.get_predictions_with_raw_data(
            data, TRAINING_RUN_ID)

        results = mlutils.aggregatePredictions([results], "mean_all")
        '''
        Results should look like this:

        Date_Time
        2016-07-01 15:00:00-04:00  0.000000e+00
        2016-07-05 15:00:00-04:00  0.000000e+00
        2016-07-06 15:00:00-04:00  0.000000e+00
        2016-07-07 15:00:00-04:00  6.174025e-03
        2016-07-08 15:00:00-04:00  8.180070e-01
        2016-07-11 15:00:00-04:00  1.000000e+00
        2016-07-12 15:00:00-04:00  3.874419e-06
        2016-07-13 15:00:00-04:00  9.999999e-01
        2016-07-14 15:00:00-04:00  3.974110e-11
        2016-07-15 15:00:00-04:00  3.007612e-01
        '''

        self.assertAlmostEqual(np.nansum(results), 3.124945995554477)

    # Deprecated; the leading underscore keeps unittest from collecting it
    @DeprecationWarning
    def _test_predictions(self):
        predictions = pd.read_csv(root_dir + 'data/testPredictions.csv',
                                  index_col=0,
                                  parse_dates=True,
                                  header=None)

        # Clean up
        print("Cleaning up")
        resp = self.mi.delete_predictions("testMkt",
                                          "testModelId",
                                          debug=False)

        print("Posting predictions")
        resp = self.mi.put_predictions(predictions,
                                       "testMkt",
                                       "testModelId",
                                       debug=False)
        self.assertTrue('success' in resp)

        resp = self.mi.get_predictions("testMkt", "testModelId")
        self.assertTrue(predictions.index.equals(resp.index))
        self.assertTrue(np.allclose(predictions.values, resp.values))

        # Shuffle values and update stored predictions
        predictions2 = ppl.shuffle(predictions)
        predictions2.index = predictions.index
        predictions = predictions2

        print("Updating predictions")
        resp = self.mi.put_predictions(predictions,
                                       "testMkt",
                                       "testModelId",
                                       update=True)
        self.assertTrue('success' in resp)

        resp = self.mi.get_predictions("testMkt", "testModelId")
        self.assertTrue(predictions.index.equals(resp.index))
        self.assertTrue(np.allclose(predictions.values, resp.values))

        print("Cleaning up")
        resp = self.mi.delete_predictions("testMkt", "testModelId")

        resp = self.mi.get_predictions("testMkt", "testModelId")
        self.assertTrue(resp.empty)
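Both end-to-end tests above collapse the per-model prediction columns with mlutils.aggregatePredictions(..., "mean_all") before summing. A sketch of what that aggregation plausibly computes, assuming "mean_all" is a row-wise mean across every prediction for each timestamp; this is an assumption about mlutils, not its actual code:

import pandas as pd

def aggregate_mean_all(frames):
    # Assumed: concatenate the prediction frames column-wise and
    # average across all predictions for each timestamp
    combined = pd.concat(frames, axis=1)
    return combined.mean(axis=1)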
Example #9
    def setUp(self):
        self.mi = MarketInsights(CredentialsStore())
        credStore = CredentialsStore()
        self.fun = Functions(credStore)