def read_id(self, id, start=None, end=None, form=None):
    if form is None:
        form = 'csv'
    if not self.has_id(id):
        raise ValueError("Id %s doesn't exist!" % id)

    meta = db.loadMetadata(self.store, id)

    # None means "from the beginning" / "to the end", so only check bounds that were given
    if (start is not None and start < meta['start']) or \
       (end is not None and end > meta['end']):
        raise ValueError(
            "Given interval (%s, %s) does not fit in the property's interval (%s, %s)." %
            (str(start), str(end), str(meta['start']), str(meta['end'])))

    data = db.loadData(self.store, id, start, end, filter=True)

    # due to database limitations, the values of the dataframe may be pickled
    # (if it's a distribution), so unpickle them
    data = decodeDataframe(data)

    # debug & info
    print(data.head(3))
    print(data.tail(3))

    if form == 'csv':
        out = data.to_csv()
    elif form == 'json':
        out = data.to_json()
    elif form == 'pickle':
        out = pickle.dumps(data, pickle.HIGHEST_PROTOCOL)
    else:
        raise ValueError("Given format %s is unsupported!" % form)

    return out
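# Standalone sketch (not part of the source) showing the three export formats
# read_id supports, on a toy dataframe; only pandas and pickle behavior is assumed.
import pickle
import pandas as pd

_df = pd.DataFrame({'date': ['2017-01-01', '2017-01-02'], 'value': [1.0, 2.5]})
print(_df.to_csv())                                  # form == 'csv'
print(_df.to_json())                                 # form == 'json'
_blob = pickle.dumps(_df, pickle.HIGHEST_PROTOCOL)   # form == 'pickle'
print(pickle.loads(_blob).equals(_df))               # True: lossless round-trip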
def generateDataset(modelName, propertyNames, labelsType, start=None, end=None):
    print("Generating dataset for properties", propertyNames,
          "using model", modelName, "for range", start, end)

    # get the model instance
    model = None
    for mod in models:
        if mod.name == modelName:
            model = mod
    if model is None:
        print("Error: Couldn't find model", modelName)
        return

    properties = []

    # make sure we don't go off bounds for any property
    start, end = db.getMasterInterval(chunkStore, propertyNames, start, end)

    # load the needed properties
    for prop in propertyNames:
        data = db.loadData(chunkStore, prop, start, end, True, CLOSED_OPEN)

        if prop == 'balanceDistribution':
            print("Running numpy array Arctic workaround for prop %s..." % prop)
            data[prop] = data[prop].apply(
                lambda x: pickle.loads(codecs.decode(x.encode(), "base64")))

        properties.append(data)

    for prop in properties:
        if len(properties[0]) != len(prop):
            print("Error: Length mismatch in the data properties.")
            return

    # feed the model the properties and let it generate
    dataset, dates, nextPrices = model.generate(properties)

    labels, dates = generateLabels(
        dates, nextPrices,
        db.loadData(chunkStore, labelKey, start, None, True), labelsType)

    if len(dataset) != len(labels):
        # a length mismatch is probably due to insufficient data for the last label
        print("Mismatch in lengths of dataset and labels, removing excessive entries")
        dataset = dataset[:len(labels)]  # remove dataframes for which we have no labels

    return (dataset, labels, dates)
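# Standalone sketch (not part of the source) of the Arctic workaround decoded in
# generateDataset above: numpy arrays are stored as base64-encoded pickle strings,
# so the (assumed) encode side is the mirror image of the decode call.
import codecs
import pickle
import numpy as np

arr = np.arange(6).reshape(2, 3)
encoded = codecs.encode(pickle.dumps(arr), "base64").decode()      # array -> str for storage
decoded = pickle.loads(codecs.decode(encoded.encode(), "base64"))  # str -> array on load
print(np.array_equal(arr, decoded))  # True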
def test_labels():
    lib = db.getChunkstore()

    # get the whole history of the course
    course = db.loadData(lib, db.dbKeys['tick'], None, None, False)

    filename = "data/dataset_test.pickle"
    start = None
    end = None

    # generate a sample dataset
    ds.run("matrix", "openPrice,closePrice,gasPrice", start, end, filename,
           "boolean", '1', False)

    sleep(1)  # delay between saving and reading

    with open(filename, 'rb') as f:
        res = pickle.load(f)  # load the result

    for j, date in enumerate(res['dates']):
        # find the course rows whose date matches this sample's date
        i = course.index[course['date'] == date]

        currPrice = course.get_value(i[0], 'close')
        nextPrice = course.get_value(i[0] + 1, 'close')

        if res['labels'][j] != (nextPrice > currPrice):
            print("Debug info: Date %s at course index %s (len=%s). Curr / next prices are %s and %s."
                  % (date, i, len(course), currPrice, nextPrice))
            assert False
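# Minimal sketch of the boolean labeling rule this test verifies (generateLabels
# itself isn't shown here, so this is an assumption based on the assertion above):
# a label is True iff the close price rises on the next tick.
def booleanLabels(closePrices):
    return [closePrices[k + 1] > closePrices[k] for k in range(len(closePrices) - 1)]

print(booleanLabels([1.0, 2.0, 1.5]))  # [True, False]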
def loadDataForTick(lib, start, end):
    return [db.loadData(lib, key, start, end, True) for key in db.dbKeys]
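# Hypothetical usage (not part of the source); db.dbKeys and the chunkstore
# come from the surrounding module:
# tickData = loadDataForTick(db.getChunkstore(), start, end)
# for key, frame in zip(db.dbKeys, tickData):
#     print(key, len(frame))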
def generateDataset(modelName, propertyNames, targetNames, labelsType='full',
                    start=None, end=None, args={}, preprocess={}):
    print("Generating dataset for properties %s, targets %s, model %s and range from %s to %s."
          % (str(propertyNames), str(targetNames), modelName, str(start), str(end)))

    # drop empty names left over from splitting the argument strings
    for arr in [propertyNames, targetNames]:
        while '' in arr:
            arr.remove('')

    # get the model instance
    model = None
    for mod in models:
        if mod.name == modelName:
            model = mod
    if model is None:
        print("Error: Couldn't find model", modelName)
        return

    properties = []
    targets = []

    # make sure we don't go off bounds for any property
    start, end = db.getMasterInterval(chunkStore, propertyNames + targetNames, start, end)

    # load the needed properties and targets
    for dataType, inputData in [('property', propertyNames), ('target', targetNames)]:
        for prop in inputData:
            data = db.loadData(chunkStore, prop, start, end, True, CLOSED_OPEN)

            # if the property values have been encoded, decode them
            if type(data.iloc[0][prop]) == str:
                print("Running numpy array Arctic workaround for prop %s..." % prop)
                data[prop] = data[prop].apply(lambda x: db.decodeObject(x))

            if prop in preprocess:
                settings = preprocess[prop]

                if 'scale' in settings:
                    if settings['scale'] == 'log2':
                        scaleF = np.log2
                    elif settings['scale'] == 'log10':
                        scaleF = np.log10
                    else:
                        raise ValueError(
                            "Unsupported scale type %s for preprocessing of property %s!"
                            % (settings['scale'], prop))

                    def scale(val):
                        global globalMin
                        if globalMin < 0:     # if we have relative values
                            val -= globalMin  # turn all negatives to positives
                        val = scaleF(val)
                        val[val < 0] = 0      # log of 0 is -inf
                        return val
                else:
                    scale = lambda x: x  # no scaling

                xAxis = ':'
                yAxis = ':'
                if 'slices' in settings:
                    xAxis, yAxis = settings['slices']

                # turn "start:end" strings into slice objects
                strToSlice = lambda string: slice(*map(
                    lambda x: int(x.strip()) if x.strip() else None,
                    string.split(':')))
                xAxis = strToSlice(xAxis)
                yAxis = strToSlice(yAxis)
                print("Slicing data by %s and %s." % (str(xAxis), str(yAxis)))

                data[prop] = data[prop].apply(lambda x: x[yAxis, xAxis])  # trim

                # we need the minimum single value, to see if the property is relative or not
                global globalMin
                globalMin = 0

                def findMin(x):
                    global globalMin
                    globalMin = min(globalMin, np.min(x))
                    return x

                data[prop].apply(findMin)

                data[prop] = data[prop].apply(lambda x: scale(x))  # scale

            if dataType == 'property':
                properties.append(data)
            if dataType == 'target':
                targets.append(data)

    for prop in properties:
        if len(properties[0]) != len(prop):
            raise ValueError("Error: Length mismatch in the data properties.")

    # feed the model the properties and targets and let it generate
    dataset, dates, nextPrices, targetNorms = model.generate(properties, targets, args)

    labels, dates = generateLabels(
        dates, nextPrices,
        db.loadData(chunkStore, labelKey, start, None, True), labelsType)

    if len(dataset) != len(labels):
        # a length mismatch is probably due to insufficient data for the last label
        print("Mismatch in lengths of dataset and labels, removing excessive entries")
        dataset = dataset[:len(labels)]  # remove dataframes for which we have no labels

    package = {
        'dataset': dataset,
        'dates': dates,
        'labels': labels,
        'normalization': targetNorms
    }

    return package
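# Standalone demo (not part of the source) of the "start:end" string-to-slice
# conversion used in generateDataset, plus a hypothetical preprocess argument
# (the property name and bounds are made up) in the structure it expects.
strToSliceDemo = lambda string: slice(*map(
    lambda x: int(x.strip()) if x.strip() else None, string.split(':')))
print(strToSliceDemo('0:20'))  # slice(0, 20, None)
print(strToSliceDemo(':'))     # slice(None, None, None)

# preprocess = {'balanceDistribution': {'scale': 'log2',
#                                       'slices': ('0:20', ':')}}  # (xAxis, yAxis)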
parser.add_argument('--end',  # flag name inferred from the args.end usage below
                    default=None,
                    help='The end date. YYYY-MM-DD-HH')

args, _ = parser.parse_known_args()

start = dateutil.parser.parse(args.start) if args.start is not None else None
end = dateutil.parser.parse(args.end) if args.end is not None else None

if args.type == 'file':
    with open(args.data, 'rb') as f:
        data = pickle.load(f)

    if type(data) != list:
        data = [data]  # turn into a single-element list

    for dataset in data:
        values = dataset['dataset'][:, -1, args.index]
        dates = dataset['dates']
        print(values, dates)
        plot(values, dates, 'Value of ' + args.data)
        plot(dataset['labels'], dataset['dates'], 'Correct labels')
elif args.type == 'key':
    data = db.loadData(db.getChunkstore(), args.data, start, end, True)
    values = data[args.data].values
    dates = data['date'].values
    plot(values, dates, 'Value of ' + args.data)
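# Hypothetical invocations (the script name and exact flag spellings are
# assumptions based on the args.* attributes used above):
#   python visualize.py --type key  --data openPrice --start 2017-01-01-00
#   python visualize.py --type file --data data/dataset_test.pickle --index 0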
    frame = dataset['dataset'][-1, -1, :, :]  # shape is samples, layers, width, height
    print(frame)
    print(dataset['dataset'].shape)
    print(dataset['dates'][-1])
    plotImage(frame)

    if args.target:
        plot(dataset['labels'], dataset['dates'], 'Correct labels')
elif args.type == 'key':
    prop = args.data
    data = db.loadData(db.getChunkstore(), prop, start, end, True)

    # if the property values have been encoded, decode them
    if type(data.iloc[0][prop]) == str:
        print("Running numpy array Arctic workaround for prop %s..." % prop)
        data[prop] = data[prop].apply(lambda x: db.decodeObject(x))

    values = data[prop].values
    dates = data['date'].values

    if type(values[0]) != np.ndarray:
        plot(values, dates, 'Value of ' + prop)
    else:
        # if we are dealing with a complex (array-valued) property, visualize it
        if args.renderTimelapse is not None:
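# Minimal sketch of what plotImage might look like (an assumption; the real
# helper is defined elsewhere in the repo), rendering one 2D frame:
# import matplotlib.pyplot as plt
#
# def plotImage(frame):
#     plt.imshow(frame, cmap='viridis')  # frame is a 2D numpy array
#     plt.colorbar()
#     plt.show()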