Example #1
    def read_id(self, id, start=None, end=None, form=None):
        if form is None:
            form = 'csv'

        if not self.has_id(id):
            raise ValueError("Id %s doesn't exist!" % id)

        meta = db.loadMetadata(self.store, id)
        if (start is not None and start < meta['start']) or \
           (end is not None and end > meta['end']):
            raise ValueError("Given interval (%s, %s) does not fit in the property's interval (%s, %s)." % \
                (str(start), str(end), str(meta['start']), str(meta['end'])))

        data = db.loadData(self.store, id, start, end, filter=True)

        #due to database limitations, the values of the dataframe MAY be pickled (if it's a distribution). Let's unpickle them
        data = decodeDataframe(data)

        #debug & info
        print(data.head(3))
        print(data.tail(3))

        if form == 'csv':
            out = data.to_csv()
        elif form == 'json':
            out = data.to_json()
        elif form == 'pickle':
            out = pickle.dumps(data, pickle.HIGHEST_PROTOCOL)
        else:
            raise ValueError("Given format %s is unsupported!" % form)

        return out
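
The decodeDataframe helper is not shown on this page; below is a minimal sketch of what it might look like, assuming cells were encoded with the same base64+pickle scheme as the Arctic workaround in Example #2 (the string-cell heuristic is an assumption, not the original implementation):

import codecs
import pickle

def decodeDataframe(data):
    #sketch: reverse the base64+pickle encoding applied to complex cells
    decode = lambda s: pickle.loads(codecs.decode(s.encode(), "base64"))
    for col in data.columns:
        #assumption: a string-typed first cell marks an encoded column
        if len(data) > 0 and isinstance(data[col].iloc[0], str):
            data[col] = data[col].apply(decode)
    return data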
Example #2
def generateDataset(modelName, propertyNames, labelsType, start=None, end=None):
	print("Generating dataset for properties ", propertyNames, "and using model", modelName, "for range", start, end)

	model = None

	#get the model instance
	for mod in models:
		if mod.name == modelName:
			model = mod

	if model is None:
		print("Error: Couldn't find model ", modelName)
		return

	properties = []

	#make sure we don't go off bounds for any property
	start, end = db.getMasterInterval(chunkStore, propertyNames, start, end)

	#load the needed properties
	for prop in propertyNames:
		data = db.loadData(chunkStore, prop, start, end, True, CLOSED_OPEN)

		if prop == 'balanceDistribution':
			print("Running numpy array Arctic workaround for prop %s..." % prop)
			data[prop] = data[prop].apply(lambda x: pickle.loads(codecs.decode(x.encode(), "base64")))
		properties.append(data)

	for prop in properties:
		if len(properties[0]) != len(prop):
			print("Error: Length mismatch in the data properties.")
			return

	#feed the model the properties and let it generate
	dataset, dates, nextPrices = model.generate(properties)

	labels, dates = generateLabels(dates, nextPrices, db.loadData(chunkStore, labelKey, start, None, True), labelsType)

	if len(dataset) != len(labels): #if we have a length mismatch, probably due to insufficient data for the last label
		print("Mismatch in lengths of dataset and labels, removing excessive entries")
		dataset = dataset[:len(labels)] #remove dataframes for which we have no labels

	return (dataset, labels, dates)
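
For reference, a hedged usage sketch of this function; the model name "matrix" and the "boolean" label type mirror the call in Example #3, and chunkStore is assumed to be initialized at module level:

dataset, labels, dates = generateDataset("matrix",
                                         ["openPrice", "closePrice", "gasPrice"],
                                         "boolean")  #None bounds default to the full range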
Example #3
def test_labels():
    lib = db.getChunkstore()

    #get the whole history of the course
    course = db.loadData(lib, db.dbKeys['tick'], None, None, False)

    filename = "data/dataset_test.pickle"

    start = None
    end = None

    #generate a sample dataset
    ds.run("matrix", "openPrice,closePrice,gasPrice", start, end, filename,
           "boolean", '1', False)

    sleep(1)  #delay between saving and reading

    with open(filename, 'rb') as f:
        res = pickle.load(f)  #load the result
    for j, date in enumerate(res['dates']):
        #find the course row whose date matches this label's date
        i = course.index[course['date'] == date]

        currPrice = course.at[i[0], 'close']  #.at replaces the deprecated get_value
        nextPrice = course.at[i[0] + 1, 'close']

        if res['labels'][j] != (nextPrice > currPrice):
            print(
                "Debug info: Date %s at course index %s (len=%s). Curr / next prices are %s and %s."
                % (date, i, len(course), currPrice, nextPrice))
            assert False

    assert True
Example #4
def loadDataForTick(lib, start, end):
    return [db.loadData(lib, key, start, end, True) for key in db.dbKeys]
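
A minimal usage sketch, reusing the chunkstore accessor seen in Example #3; passing None for both bounds loads the full history of every key in db.dbKeys:

lib = db.getChunkstore()
tickData = loadDataForTick(lib, None, None)  #one dataframe per database key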
Example #5
def generateDataset(modelName,
                    propertyNames,
                    targetNames,
                    labelsType='full',
                    start=None,
                    end=None,
                    args={},
                    preprocess={}):
    print("Generating dataset for properties %s, targets %s, model %s and range from %s to %s."
          % (str(propertyNames), str(targetNames), modelName, str(start), str(end)))

    for arr in [propertyNames, targetNames]:
        while '' in arr:
            arr.remove('')

    model = None

    #get the model instance
    for mod in models:
        if mod.name == modelName:
            model = mod

    if model is None:
        print("Error: Couldn't find model ", modelName)
        return

    properties = []
    targets = []

    #make sure we don't go off bounds for any property
    start, end = db.getMasterInterval(chunkStore, propertyNames + targetNames,
                                      start, end)

    #load the needed properties
    for dataType, inputData in [('property', propertyNames),
                                ('target', targetNames)]:
        for prop in inputData:
            data = db.loadData(chunkStore, prop, start, end, True, CLOSED_OPEN)

            if isinstance(data.iloc[0][prop], str):  #if the property values have been encoded, decode them
                print("Running numpy array Arctic workaround for prop %s..." %
                      prop)
                data[prop] = data[prop].apply(lambda x: db.decodeObject(x))

            if prop in preprocess:
                settings = preprocess[prop]
                if 'scale' in settings:
                    if settings['scale'] == 'log2':
                        scaleF = np.log2
                    elif settings['scale'] == 'log10':
                        scaleF = np.log10
                    else:
                        raise ValueError(
                            "Unsupported scale type %s for preprocessing of property %s!"
                            % (settings['scale'], prop))

                    def scale(val):
                        global globalMin

                        if globalMin < 0:  #if we have relative values
                            val -= globalMin  #turn all negatives to positives

                        val = scaleF(val)
                        val[val < 0] = 0  #log of 0 is -inf; clamp negative results to 0

                        return val
                else:
                    scale = lambda x: x  #no scaling

                xAxis = ':'
                yAxis = ':'

                if 'slices' in settings:
                    xAxis, yAxis = settings['slices']

                def strToSlice(string):
                    #parse a string like "1:5" or ":" into a slice object
                    return slice(*map(
                        lambda x: int(x.strip()) if x.strip() else None,
                        string.split(':')))

                xAxis = strToSlice(xAxis)
                yAxis = strToSlice(yAxis)

                print("Slicing data by %s and %s." % (str(xAxis), str(yAxis)))

                data[prop] = data[prop].apply(lambda x: x[yAxis, xAxis])  #trim to the requested slices

                global globalMin  #we need the minimum single value, to see if the property is relative or not
                globalMin = 0

                def findMin(x):
                    global globalMin
                    globalMin = min(globalMin, np.min(x))
                    return x

                data[prop].apply(findMin)

                data[prop] = data[prop].apply(lambda x: scale(x))  # scale

            if dataType == 'property':
                properties.append(data)
            if dataType == 'target':
                targets.append(data)

    for prop in properties:
        if len(properties[0]) != len(prop):
            raise ValueError("Error: Length mismatch in the data properties.")

    #feed the model the properties and let it generate
    dataset, dates, nextPrices, targetNorms = model.generate(
        properties, targets, args)

    labels, dates = generateLabels(dates, nextPrices,
                                   db.loadData(chunkStore, labelKey, start, None, True),
                                   labelsType)

    if len(dataset) != len(labels):  #if we have a length mismatch, probably due to insufficient data for the last label
        print("Mismatch in lengths of dataset and labels, removing excessive entries")
        dataset = dataset[:len(labels)]  #remove dataframes for which we have no labels

    package = {
        'dataset': dataset,
        'dates': dates,
        'labels': nextPrices,
        'normalization': targetNorms
    }

    return package
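
The slice-string parser in the preprocessing branch above is dense, so here is a standalone illustration of its behavior; the assertions are illustrative and not part of the original code:

def strToSlice(string):
    return slice(*map(lambda x: int(x.strip()) if x.strip() else None,
                      string.split(':')))

assert strToSlice(':') == slice(None, None)  #full range
assert strToSlice('5:10') == slice(5, 10)    #explicit bounds
assert strToSlice(':-1') == slice(None, -1)  #open start, negative end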
Example #6
    parser.add_argument('--end',
                        default=None,
                        help='The end date. YYYY-MM-DD-HH')

    args, _ = parser.parse_known_args()

    start = dateutil.parser.parse(
        args.start) if args.start is not None else None
    end = dateutil.parser.parse(args.end) if args.end is not None else None

    if args.type == 'file':
        with open(args.data, 'rb') as f:
            data = pickle.load(f)
            if type(data) != list:
                data = [data]  #turn to single element list

            for dataset in data:
                values = dataset['dataset'][:, -1, args.index]
                dates = dataset['dates']

                print(values, dates)

                plot(values, dates, 'Value of ' + args.data)
                plot(dataset['labels'], dataset['dates'], 'Correct labels')

    elif args.type == 'key':
        data = db.loadData(db.getChunkstore(), args.data, start, end, True)
        values = data[args.data].values
        dates = data['date'].values

        plot(values, dates, 'Value of ' + args.data)
Example #7
                    frame = dataset['dataset'][-1, -1, :, :]  #shape is samples, layers, width, height
                    print(frame)

                    print(dataset['dataset'].shape)

                    print(dataset['dates'][-1])

                    plotImage(frame)
                    if args.target:
                        plot(dataset['labels'], dataset['dates'],
                             'Correct labels')
    elif args.type == 'key':
        prop = args.data

        data = db.loadData(db.getChunkstore(), prop, start, end, True)

        if isinstance(data.iloc[0][prop], str):  #if the property values have been encoded, decode them
            print("Running numpy array Arctic workaround for prop %s..." %
                  prop)
            data[prop] = data[prop].apply(lambda x: db.decodeObject(x))

        values = data[prop].values
        dates = data['date'].values

        if type(values[0]) != np.ndarray:
            plot(values, dates, 'Value of ' + prop)
        else:  #if we are dealing with a complex property, visualize it
            if args.renderTimelapse is not None: