class BatFrame(object):
    """Lazy, pandas-like view over a remote dataset.

    A BatFrame holds the dataset URL plus a ``Query`` object. Indexing,
    sorting and ``head`` never modify the frame they are called on: each
    works on a copy, adjusts the copy's query and returns it. The query is
    only executed by ``toPandas``, ``rows`` and ``__repr__``.

    Parameters
    ----------
    dataset_url: string
        Base url of the dataset. It is handed to ``Query`` and ``Time`` and
        used directly for the ``/columns`` and status REST calls.
    """

    # Maximum number of rows fetched and rendered by ``__repr__``.
    _REPR_MAX_ROWS = 40

    def __init__(self, dataset_url):
        self.dataset_url = dataset_url
        self.query = Query(dataset_url)
        self._time = Time(dataset_url)
        self._index = Index(self)

    @staticmethod
    def _slice_to_offset_limit(key):
        """Translate a slice into an ``(offset, limit)`` pair.

        ``offset`` is ``key.start`` unchanged. ``limit`` is the number of
        rows between start and stop (never negative), or ``None`` when the
        slice has no stop. The step is ignored and negative indices are not
        translated.
        """
        start, stop = key.start, key.stop
        if stop is None:
            limit = None
        elif start is None:
            limit = stop
        else:
            # NOTE(review): assumes Query.setLIMIT takes a row *count* (SQL
            # LIMIT semantics), so [10:20] must request 10 rows, not 20.
            limit = max(stop - start, 0)
        return start, limit

    def __getitem__(self, val):
        """Select columns or rows, returning a new object.

        Parameters
        ----------
        val: str, Query, slice, list or Column
            * str: returns the ``Column`` with that name, with this frame's
              query merged into it.
            * Query: returns a copy of the frame with ``val`` merged into
              its query.
            * slice: returns a copy limited to rows ``start`` to ``stop``
              (the step is ignored).
            * list: returns a copy selecting the listed column names.
            * Column: returns a copy filtered on the column's expression.

        Raises
        ------
        TypeError
            If ``val`` is of any other type.
        """
        if isinstance(val, str):
            col = Column(val, self.dataset_url)
            col.query.mergeQuery(self.query)
            return col

        if isinstance(val, Query):
            bf = self.copy()
            bf.query.mergeQuery(val)
            return bf

        if isinstance(val, slice):
            offset, limit = self._slice_to_offset_limit(val)
            bf = self.copy()
            if offset is not None:
                bf.query.setOFFSET(offset)
            if limit is not None:
                bf.query.setLIMIT(limit)
            return bf

        if isinstance(val, list):
            bf = self.copy()
            for column_name in val:
                bf.query.addSELECT("\"{}\"".format(column_name))
            return bf

        if isinstance(val, Column):
            bf = self.copy()
            bf.query.addWHERE("({})".format(val.execution_name))
            return bf

        # Falling through used to return None silently, which only surfaced
        # later as a confusing AttributeError on the caller's side.
        raise TypeError(
            "BatFrame indices must be str, Query, slice, list or Column, "
            "not {}".format(type(val).__name__))

    @property
    def columns(self):
        """Returns the decoded JSON body of ``<dataset_url>/columns``"""
        return requests.get(self.dataset_url + '/columns').json()

    @property
    def rows(self):
        """Returns the ``_rowName`` entry of the executed query result"""
        bf = self.copy()
        result = bf.query.executeQuery(format="soa")
        return result["_rowName"]

    @property
    def time(self):
        """Returns a copy of the Time helper carrying this frame's query"""
        copy_time = self._time.copy()
        # Same pattern as the str branch of __getitem__: merge, then return
        # the object that owns the merged query.
        copy_time.query.mergeQuery(self.query)
        return copy_time

    @property
    def ix(self):
        """Returns a copy of the Index helper bound to this frame"""
        return self._index.copy()

    def copy(self):
        """Returns a new BatFrame on the same dataset with a copied query"""
        bf = BatFrame(self.dataset_url)
        bf.query = self.query.copy()
        return bf

    def toPandas(self):
        """Executes the query and returns the result as a pandas DataFrame

        The frame is indexed by ``_rowName``. An empty result gives an empty
        DataFrame.
        """
        result = self.query.executeQuery(format="aos")
        if len(result) == 0:
            return pd.DataFrame()
        return pd.DataFrame.from_records(result, index="_rowName")

    def head(self, num_rows=5):
        """Returns a copy of the frame limited to ``num_rows`` rows"""
        bf = self.copy()
        bf.query.setLIMIT(num_rows)
        return bf

    # NOTE: on instances this method is shadowed by the ``query`` attribute
    # assigned in __init__; it is kept so the class-level name still exists.
    def query(self, query):
        raise NotImplementedError()

    def sort(self, value, ascending=True):
        """Returns a copy of the frame ordered by one or more columns.

        Parameters
        ----------
        value: column name or list of column names
            Column(s) to order by, in priority order.
        ascending: bool or list of bool
            Sort direction. A single bool applies to every column; a list
            must have one entry per column.

        Raises
        ------
        RuntimeError
            If ``value`` and ``ascending`` do not have the same length.
        """
        keys = value if isinstance(value, list) else [value]
        if isinstance(ascending, list):
            directions = ascending
        else:
            directions = [ascending] * len(keys)

        # Validate before copying so bad arguments fail without side work.
        if len(keys) != len(directions):
            raise RuntimeError("len(value) != len(ascending)")

        bf = self.copy()
        for by, asc in zip(keys, directions):
            direction = "ASC" if asc else "DESC"
            bf.query.addORDERBY("\"{}\" {}".format(by, direction))
        return bf

    @property
    def shape(self):
        """
        Returns (rowCount, valueCount)
        """
        status = requests.get(self.dataset_url).json()['status']
        return (status['rowCount'], status['valueCount'])

    def __repr__(self):
        """Returns the first rows as text, plus the total row count.

        At most ``_REPR_MAX_ROWS`` rows are fetched. When the dataset status
        reports more rows than that, a ``"<n> rows"`` line is appended.
        """
        bf = self.copy()
        bf.query.setLIMIT(self._REPR_MAX_ROWS)
        text = str(bf.toPandas())

        response = requests.get(bf.dataset_url).json()
        try:
            row_count = response['status']['rowCount']
        except (KeyError, IndexError, TypeError):
            # The status payload does not expose a row count.
            row_count = None

        if row_count is not None and row_count > self._REPR_MAX_ROWS:
            text += "\n{} rows".format(row_count)
        return text
class Column(object):
    """A single, possibly derived, column of a remote dataset.

    A Column carries a quoted ``name``, an ``execution_name`` (the SQL
    expression currently selected) and its own query object. Operators never
    modify the column they are applied to:

    * comparisons and ``&`` / ``|`` return a query carrying the new WHERE
      clause;
    * arithmetic operators return a new Column whose ``execution_name`` is
      the combined expression.
    """

    # Maximum number of rows fetched and rendered by ``__repr__``.
    _REPR_MAX_ROWS = 40

    def __init__(self, name, dataset_url):
        """
        Parameters
        ----------
        name: string
            Name of the column. No check is actually done to see if the
            column exists.
        dataset_url: string
            The base url where the dataset is located.
            e.g. localhost:8888/v1/datasets/<dataset_name>
        """
        logging.debug("Instanciating Column with {}".format(name))
        self.name = "\"{}\"".format(name)
        self.execution_name = "\"{}\"".format(name)
        self.dataset_url = dataset_url
        self.query = Query(dataset_url)
        self.query.addSELECT(self.name)

    #############
    #  Helpers  #
    #############

    @staticmethod
    def _quote_literal(text):
        """Returns ``text`` as a single-quoted SQL string literal.

        Embedded single quotes are doubled so a value such as ``it's`` does
        not terminate the literal early.
        """
        return "'{}'".format(text.replace("'", "''"))

    @staticmethod
    def _operand_sql(operand, binary):
        """Returns the SQL text of one arithmetic operand.

        Raises
        ------
        AttributeError
            If the operand is not an integer, a float or a Column.
        """
        if isinstance(operand, Column):
            return operand.execution_name
        if isinstance(operand, (int, float)):
            text = "{}".format(operand)
            # A bare negative literal after '-' would read as '--', which
            # starts a SQL comment; parenthesize it.
            if text.startswith("-"):
                text = "({})".format(text)
            return text
        raise AttributeError(
            "{} can only be used ".format(binary) +
            "with integer, float or column")

    def _fetch_single_column(self):
        """Executes the query and returns ``(result, values)``.

        ``result`` is the raw "soa" result and ``values`` is the entry of
        its only key other than ``_rowName``.

        Raises
        ------
        RuntimeError
            If the result holds more than two entries.
        """
        result = self.query.executeQuery(format="soa")
        if len(result) > 2:
            raise RuntimeError("Only one column should be returned")
        col_name = [x for x in result.keys() if x != "_rowName"][0]
        return result, result[col_name]

    @staticmethod
    def _slice_to_offset_limit(key):
        """Translate a slice into an ``(offset, limit)`` pair.

        ``offset`` is ``key.start`` unchanged. ``limit`` is the number of
        rows between start and stop (never negative), or ``None`` when the
        slice has no stop. The step is ignored.
        """
        start, stop = key.start, key.stop
        if stop is None:
            limit = None
        elif start is None:
            limit = stop
        else:
            # NOTE(review): assumes Query.setLIMIT takes a row *count* (SQL
            # LIMIT semantics), so [10:20] must request 10 rows, not 20.
            limit = max(stop - start, 0)
        return start, limit

    @property
    def values(self):
        """Executes the query and returns the values as a numpy array"""
        _, values = self._fetch_single_column()
        return np.array(values)

    def __getitem__(self, val):
        """Restrict the column, returning a new Column.

        Parameters
        ----------
        val: slice, Query or str
            * slice: rows ``start`` to ``stop`` (the step is ignored).
            * Query: ``val`` is merged into the copy's query.
            * str: only the row whose name equals ``val``.

        Raises
        ------
        TypeError
            If ``val`` is of any other type.
        """
        if isinstance(val, slice):
            offset, limit = self._slice_to_offset_limit(val)
            col = self.copy()
            if offset is not None:
                col.query.setOFFSET(offset)
            if limit is not None:
                col.query.setLIMIT(limit)
            return col

        if isinstance(val, Query):
            col = self.copy()
            col.query.mergeQuery(val)
            return col

        if isinstance(val, str):
            col = self.copy()
            col.query.addWHERE(
                "(rowName()={})".format(self._quote_literal(val)))
            return col

        # Falling through used to return None silently.
        raise TypeError(
            "Column indices must be slice, Query or str, "
            "not {}".format(type(val).__name__))

    ####################
    #  Rich comparison #
    ####################

    def _comparison(self, value, operator):
        """
        Parameters
        ----------
        value: Column object or base type
            The value against which to compare the column. It can either be
            another column or a base type value (e.g. int)
        operator: string
            SQL comparison operator placed between the two operands.

        Returns
        -------
        The query of a copy of this column, with the comparison added as a
        WHERE clause and this column's expression removed from the SELECT.

        Notes
        -----
        The clause is added to the copy only. Adding it to ``self.query``
        would leave every later use of this column filtered by a comparison
        that was merely evaluated once.
        """
        if isinstance(value, Column):
            operand = "({})".format(value.execution_name)
        elif isinstance(value, str):
            operand = self._quote_literal(value)
        else:
            operand = "({})".format(value)

        result = self.copy()
        result.query.addWHERE("(({}){}{})".format(
            self.execution_name, operator, operand))
        result.query.removeSELECT(result.execution_name)
        return result.query

    def __eq__(self, value):
        return self._comparison(value, '=')

    def __ne__(self, value):
        return self._comparison(value, '!=')

    def __gt__(self, value):
        return self._comparison(value, '>')

    def __ge__(self, value):
        return self._comparison(value, '>=')

    def __lt__(self, value):
        return self._comparison(value, '<')

    def __le__(self, value):
        return self._comparison(value, '<=')

    ##################################
    #  Binary arithmetic operations  #
    ##################################

    def _binary_arithemtic(self, left, binary, right):
        """
        Parameters
        ----------
        left, right: Column object, integer or float
            Operands, in the order they appear in the expression. One of
            them is normally this column.
        binary: char
            binary arithmetic operator (-, +, *, /, ^, %)

        Returns
        -------
        A copy of this column whose execution name, and selected
        expression, is the combined expression.

        Raises
        ------
        AttributeError
            If an operand is not an integer, a float or a Column.

        Notes
        -----
        The combined expression is wrapped in parentheses so that chaining
        operators keeps the Python evaluation order, e.g. ``(a + b) * c``.
        """
        # Same validation order as before: right operand first.
        right_sql = self._operand_sql(right, binary)
        left_sql = self._operand_sql(left, binary)

        result = self.copy()
        result.query.removeSELECT(result.execution_name)
        if binary == '^':
            # POWER needs a different treatment
            result.execution_name = "pow({},{})".format(left_sql, right_sql)
        else:
            result.execution_name = "({}{}{})".format(
                left_sql, binary, right_sql)
        result.query.addSELECT(result.execution_name)
        return result

    def __mul__(self, value):
        return self._binary_arithemtic(self, '*', value)

    def __rmul__(self, value):
        return self._binary_arithemtic(value, '*', self)

    def __truediv__(self, value):
        if isinstance(value, (int, float)) and value == 0:
            raise ValueError(
                "Cannot divide by zero. "
                "Do you really want to explode the planet?")
        return self._binary_arithemtic(self, '/', value)

    def __rtruediv__(self, value):
        return self._binary_arithemtic(value, '/', self)

    # Python 2 spelling of the division operators.
    __div__ = __truediv__
    __rdiv__ = __rtruediv__

    def __sub__(self, value):
        return self._binary_arithemtic(self, '-', value)

    def __rsub__(self, value):
        return self._binary_arithemtic(value, '-', self)

    def __add__(self, value):
        return self._binary_arithemtic(self, '+', value)

    def __radd__(self, value):
        return self._binary_arithemtic(value, '+', self)

    def __pow__(self, value):
        return self._binary_arithemtic(self, '^', value)

    def __rpow__(self, value):
        return self._binary_arithemtic(value, '^', self)

    def __mod__(self, value):
        return self._binary_arithemtic(self, '%', value)

    def __rmod__(self, value):
        return self._binary_arithemtic(value, '%', self)

    ########################
    #  Logical operations  #
    ########################

    def _logical(self, value, keyword, reflected=False):
        """Combine this column with ``value`` using AND / OR.

        Parameters
        ----------
        value: Column, Query or any other value
            A Column contributes its execution name, a Query contributes
            its ``WHERE`` attribute and anything else is formatted as is.
        keyword: string
            ``"AND"`` or ``"OR"``.
        reflected: bool
            When True, ``value`` is written on the left-hand side.

        Returns
        -------
        The query of a copy of this column, carrying the new WHERE clause.
        """
        col = self.copy()
        mine = self.execution_name
        col.query.removeSELECT(mine)

        other = value
        if isinstance(value, Column):
            other = value.execution_name
            col.query.removeSELECT(other)
        elif isinstance(value, Query):
            other = value.WHERE

        if reflected:
            left, right = other, mine
        else:
            left, right = mine, other
        col.query.addWHERE('(({}) {} ({}))'.format(left, keyword, right))
        return col.query

    def __or__(self, value):
        return self._logical(value, 'OR')

    def __and__(self, value):
        return self._logical(value, 'AND')

    def __rand__(self, value):
        return self._logical(value, 'AND', reflected=True)

    def __ror__(self, value):
        return self._logical(value, 'OR', reflected=True)

    #################################
    #  Unary arithmetic operations  #
    #################################

    def _unary_arithmetic(self, unary):
        """
        Parameters
        ----------
        unary: char
            Unary arithmetic operator (-, +) to be applied to this column

        Returns
        -------
        A copy of this column whose execution name, and selected
        expression, is the operator applied to the current expression.
        """
        result = self.copy()
        result.query.removeSELECT(result.execution_name)
        # Outer parentheses keep e.g. ``x - (-col)`` from rendering as '--'.
        result.execution_name = "({}({}))".format(unary, self.execution_name)
        result.query.addSELECT(result.execution_name)
        return result

    def __neg__(self):
        return self._unary_arithmetic('-')

    def __pos__(self):
        raise NotImplementedError()

    def __invert__(self):
        """Returns a copy whose execution name is the negated expression.

        Only ``execution_name`` changes; the copy's SELECT is left as is.
        """
        result = self.copy()
        result.execution_name = "NOT ({})".format(result.execution_name)
        return result

    def __abs__(self):
        raise NotImplementedError()

    #############
    #  Casting  #
    #############

    def __float__(self):
        raise NotImplementedError()

    def __int__(self):
        raise NotImplementedError()

    def __long__(self):
        raise NotImplementedError()

    ###########
    #  Other  #
    ###########

    def __iter__(self):
        """Executes the query and yields the values one by one"""
        _, values = self._fetch_single_column()
        for item in values:
            yield item

    def _aggregate(self, function):
        """Runs ``function(<expression>)`` remotely and returns the value.

        The value is read from position ``[1][1]`` of the "table" result.
        """
        agg = self.copy()
        agg.query.removeSELECT(agg.execution_name)
        agg.execution_name = "{}({})".format(function, self.execution_name)
        agg.query.addSELECT(agg.execution_name)
        agg.query.addGROUPBY(1)
        result = agg.query.executeQuery(format="table")
        return result[1][1]

    def max(self):
        """Returns the result of the remote ``max`` aggregate"""
        return self._aggregate("max")

    def min(self):
        """Returns the result of the remote ``min`` aggregate"""
        return self._aggregate("min")

    def copy(self):
        """Returns an independent copy of this column.

        The copy shares ``name``, ``execution_name`` and ``dataset_url`` and
        owns a copy of the query. Instance state is copied directly rather
        than going through ``__init__``, which would build a Query only to
        throw it away on every operator call.
        """
        col = Column.__new__(Column)
        col.__dict__.update(self.__dict__)
        col.query = self.query.copy()
        return col

    def count(self):
        """Return number of non-NA/null observations in the Series"""
        raise NotImplementedError()

    def head(self, n=5):
        """Returns first n rows, as a pandas Series"""
        col = self.copy()
        col.query.setLIMIT(n)
        return col.toPandas()

    def isnull(self):
        raise NotImplementedError()

    def isin(self, values):
        raise NotImplementedError()

    def value_counts(self):
        raise NotImplementedError()

    def unique(self):
        """Returns the distinct values of the column.

        A plain column (name equal to execution name) is answered by the
        ``/columns/<name>/values`` endpoint and returns its decoded JSON
        body. A derived column is computed by executing the query and
        returns a ``set``.
        """
        if self.name == self.execution_name:
            # NOTE(review): this branch does not go through self.query, so
            # any filter held by the query is presumably not applied.
            url = self.dataset_url + '/columns/{}/values'.format(
                self.name[1:-1])
            logging.debug("Getting values at {}".format(url))
            return requests.get(url).json()

        _, values = self._fetch_single_column()
        return set(values)

    def sort(self, ascending=True):
        """Returns a copy of the column ordered by its own expression"""
        col = self.copy()
        direction = "ASC" if ascending else "DESC"
        col.query.addORDERBY("{} {}".format(col.execution_name, direction))
        return col

    def toPandas(self):
        """Executes the query and returns a Series indexed by row name

        An empty result gives an empty Series.
        """
        result, values = self._fetch_single_column()
        row_names = result["_rowName"]
        if len(values) > 0:
            return pd.Series(values, index=row_names)
        return pd.Series()

    def __repr__(self):
        """Returns the first values as text, plus the total row count.

        At most ``_REPR_MAX_ROWS`` rows are fetched. When the dataset status
        reports more rows than that, a ``"<n> rows"`` line is appended.
        """
        col = self.copy()
        col.query.setLIMIT(self._REPR_MAX_ROWS)
        text = str(col.toPandas())

        response = requests.get(col.dataset_url).json()
        try:
            row_count = response['status']['rowCount']
        except (KeyError, IndexError, TypeError):
            # The status payload does not expose a row count.
            row_count = None

        if row_count is not None and row_count > self._REPR_MAX_ROWS:
            text += "\n{} rows".format(row_count)
        return text