def blz_llvec_fun(act, pred, output_vector_type="numpy"):
    """Evaluate the per-observation binary log-loss through a BLZ btable.

    Parameters
    ----------
    act : array_like
        Actual values; they become column ``y`` of a temporary btable.
    pred : array
        Predicted values; they become column ``py``.  The object must
        support ``.copy()`` and boolean-mask assignment.
        NOTE(review): presumably a float numpy array of probabilities --
        confirm against the callers.
    output_vector_type : str
        When equal to ``"numpy"`` the expression is evaluated with
        ``out_flavor="numpy"``; any other value uses the default output
        flavor of ``btable.eval``.

    Returns
    -------
    The result of evaluating ``-y*log(py) - (1-y)*log(1-py)``.
    """
    epsilon = 1e-15
    # Clip a private copy: the original code overwrote the caller's
    # predictions as a hidden side effect.
    pred = pred.copy()
    # Keep the values strictly inside (0, 1) so both logarithms are finite.
    pred[pred < epsilon] = epsilon
    pred[pred > (1 - epsilon)] = 1 - epsilon
    bt = blz.btable([act, pred], names=["y", "py"])
    expression = "-y*log(py) - (1-y)*log(1-py)"
    if output_vector_type == "numpy":
        return bt.eval(expression, out_flavor="numpy")
    return bt.eval(expression)
def test07(self):
    """Testing `whereblocks` with both a `limit` and a `skip` parameter"""
    N, M = int(1e4), 101
    rows = ((i, i*2., i*3) for i in xrange(N))
    table = blz.btable(np.fromiter(rows, dtype='i4,f8,i8'))
    nrows = 0
    f0_total = 0
    matches = blz.whereblocks(table, 'f1 < f2', limit=N-M-2, skip=M)
    for chunk in matches:
        nrows += len(chunk)
        f0_total += chunk['f0'].sum()
    self.assertTrue(nrows == N - M - 2)
    # Row 0 does not satisfy the condition, so after skipping M hits the
    # selected rows are M+1 .. N-2.
    self.assertTrue(f0_total == np.arange(M+1, N-1).sum())
def test05(self):
    """Testing `whereblocks` with a `limit` parameter"""
    N, M = int(1e4), 101
    rows = ((i, i*2., i*3) for i in xrange(N))
    table = blz.btable(np.fromiter(rows, dtype='i4,f8,i8'))
    nrows = 0
    f0_total = 0
    for chunk in blz.whereblocks(table, 'f1 < f2', limit=M):
        nrows += len(chunk)
        f0_total += chunk['f0'].sum()
    self.assertTrue(nrows == M)
    # Gauss summation formula
    expected_total = M * ((M + 1) / 2)
    self.assertTrue(f0_total == expected_total)
def test00(self):
    """Testing `whereblocks` with only an expression"""
    N = int(1e4)
    rows = ((i, i*2., i*3) for i in xrange(N))
    table = blz.btable(np.fromiter(rows, dtype='i4,f8,i8'))
    nrows = 0
    f0_total = 0
    for chunk in blz.whereblocks(table, 'f1 < f2'):
        nrows += len(chunk)
        f0_total += chunk['f0'].sum()
    self.assertTrue(nrows == N - 1)
    # Gauss summation formula
    expected_total = (N - 1) * (N / 2)
    self.assertTrue(f0_total == expected_total)
def test00(self):
    """Testing `whereblocks` with only an expression"""
    N = int(1e4)
    source = np.fromiter(
        ((i, i * 2., i * 3) for i in xrange(N)), dtype='i4,f8,i8')
    table = blz.btable(source)
    counted, summed = 0, 0
    for chunk in blz.whereblocks(table, 'f1 < f2'):
        counted += len(chunk)
        summed += chunk['f0'].sum()
    self.assertTrue(counted == N - 1)
    # Gauss summation formula
    self.assertTrue(summed == (N - 1) * (N / 2))
def test07(self):
    """Testing `whereblocks` with both a `limit` and a `skip` parameter"""
    N, M = int(1e4), 101
    source = np.fromiter(
        ((i, i * 2., i * 3) for i in xrange(N)), dtype='i4,f8,i8')
    table = blz.btable(source)
    counted, summed = 0, 0
    selection = blz.whereblocks(
        table, 'f1 < f2', limit=N - M - 2, skip=M)
    for chunk in selection:
        counted += len(chunk)
        summed += chunk['f0'].sum()
    self.assertTrue(counted == N - M - 2)
    # Row 0 does not satisfy the condition, so after skipping M hits the
    # selected rows are M+1 .. N-2.
    self.assertTrue(summed == np.arange(M + 1, N - 1).sum())
def test03(self):
    """Testing `whereblocks` with an `outfields` of 1 field"""
    N = int(1e4)
    source = np.fromiter(
        ((i, i, i * 3) for i in xrange(N)), dtype='i4,f8,i8')
    table = blz.btable(source)
    wanted = ('f1', )
    counted, summed = 0, 0
    for chunk in blz.whereblocks(table, 'f1 < f2', outfields=wanted):
        # Only the requested field may show up in the output blocks
        self.assertTrue(chunk.dtype.names == ('f1', ))
        counted += len(chunk)
        summed += chunk['f1'].sum()
    self.assertTrue(counted == N - 1)
    # Gauss summation formula
    self.assertTrue(summed == (N - 1) * (N / 2))
def test02(self):
    """Testing `whereblocks` with an `outfields` of 2 fields"""
    N = int(1e4)
    rows = ((i, i, i*3) for i in xrange(N))
    table = blz.btable(np.fromiter(rows, dtype='i4,f8,i8'))
    wanted = ('f1', 'f2')
    nrows = 0
    f1_total = 0
    for chunk in blz.whereblocks(table, 'f1 < f2', outfields=wanted):
        # Only the requested fields may show up in the output blocks
        self.assertTrue(chunk.dtype.names == ('f1', 'f2'))
        nrows += len(chunk)
        f1_total += chunk['f1'].sum()
    self.assertTrue(nrows == N - 1)
    # Gauss summation formula
    expected_total = (N - 1) * (N / 2)
    self.assertTrue(f1_total == expected_total)
def test05(self):
    """Testing `whereblocks` with a `limit` parameter"""
    N, M = int(1e4), 101
    source = np.fromiter(
        ((i, i * 2., i * 3) for i in xrange(N)), dtype='i4,f8,i8')
    table = blz.btable(source)
    counted, summed = 0, 0
    for chunk in blz.whereblocks(table, 'f1 < f2', limit=M):
        counted += len(chunk)
        summed += chunk['f0'].sum()
    self.assertTrue(counted == M)
    # Gauss summation formula
    self.assertTrue(summed == M * ((M + 1) / 2))
def modelPredictor(modelsPath_modelIndex_dataPath_colNames_tuple):
    """Predict with one pickled model over on-disk BLZ columns.

    Input:
        A tuple with the following four attributes (in this order):
        modelsPath: string, the path to the trained models (pickle file).
        modelIndex: integer, the index of the model to predict with.
        dataPath: string, the path to the data.
        colNames: a list of strings, the names of the stored columns.
                  It should be like ["Id", "V1", ...]: the first name is
                  the Id column, the remaining ones are the model inputs.

    Output:
        None.  The result is stored as a btable with the columns "Id" and
        "Predict"; nothing is returned to the caller.

    Notes:
        modelPredictor will create the following directory for you:
        1. Model_No{modelIndex}_predicted_array: it will be under the
           dataPath.  A directory with that name left over from a previous
           run is removed first.
    """
    # Local import: only needed to drop a stale result directory.
    import shutil

    # Number of observations that are predicted per batch.
    divideN = 300000
    modelsPath, modelIndex, dataPath, colNames = \
        modelsPath_modelIndex_dataPath_colNames_tuple

    # NOTE(review): unpickling runs arbitrary code -- only load model files
    # that come from a trusted source.
    with open(modelsPath, "rb") as rf:
        models = pickle.load(rf)
    model = models[modelIndex]
    del models

    # Open every column once instead of re-opening all of them per batch.
    Id = blz.open(os.path.join(dataPath, colNames[0]))
    totalN = len(Id)
    feature_columns = [blz.open(os.path.join(dataPath, colname))
                       for colname in colNames[1:]]

    # Prediction, one batch of at most `divideN` observations at a time.
    y_predict = np.zeros(totalN)
    print("[Model No.{modelIndex}] Prediction process begins.".format(
        modelIndex=modelIndex))
    for begin in range(0, totalN, divideN):
        end = min(begin + divideN, totalN)
        print("[Model No.{modelIndex}] Processing {begin} ~ {end} observations.".format(
            modelIndex=modelIndex, begin=begin + 1, end=end))
        X = np.column_stack([column[begin:end] for column in feature_columns])
        y_predict[begin:end] = model.predict(X)

    columns = [Id, blz.barray(y_predict)]
    data_rootdir = os.path.join(
        dataPath,
        "Model_No{modelIndex}_predicted_array".format(modelIndex=modelIndex))
    # The original test looked for the joined path among the bare names
    # returned by os.listdir(), so a stale directory was not detected.
    if os.path.isdir(data_rootdir):
        print("Removing Old result_table directory for new btable.")
        shutil.rmtree(data_rootdir)
    blz.btable(columns=columns, names=["Id", "Predict"], rootdir=data_rootdir)
    print("The result_table btable rootdir is under {path}".format(
        path=data_rootdir))
def create_kaggle_submit_csv(self, submit_format="%d,%.6f"):
    """Write the testing `predict_proba` results to a timestamped CSV file.

    The file gets an "Id,Predicted" header followed by one line per row,
    each rendered with `submit_format`.  It is stored under SUBMITS_PATH
    and named after `self.model_id` plus the current date and time.

    Parameters
    ----------
    submit_format : str
        %-style format applied to every (Id, Predicted) row.
    """
    assert "predict_proba" in self.list_all_predictions["testing"]
    probabilities = self.load_prediction_blz(datatype="testing",
                                             valuetype="predict_proba")
    # Only the column at index 1 of the stored prediction is submitted
    second_column = probabilities[:, 1]
    ids_path = os.path.join(tools.TESTING_BLZ_PATH, TESTING_COLUMN_NAMES[0])
    ids_barray = blz.open(ids_path)
    bt = blz.btable(columns=[ids_barray, second_column],
                    names=["Id", "Predicted"])

    body_lines = []
    for row in bt.iter():
        body_lines.append(submit_format % tuple(row))
    header = ",".join(bt.names)
    all_results_string = "\n".join([header] + body_lines)

    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    submit_filename = "%s_%s.csv" % (self.model_id, timestamp)
    submit_filepath = os.path.join(SUBMITS_PATH, submit_filename)
    with open(submit_filepath, "w") as wf:
        wf.write(all_results_string)
def test01(self):
    """Testing `whereblocks` with a `blen`"""
    N = int(1e4)
    rows = ((i, i*2., i*3) for i in xrange(N))
    table = blz.btable(np.fromiter(rows, dtype='i4,f8,i8'))
    nrows = 0
    f0_total = 0
    for chunk in blz.whereblocks(table, 'f0 <= f1', blen=100):
        chunk_len = len(chunk)
        nrows += chunk_len
        # All blocks should be of length 100, except the last one,
        # which should be 0
        self.assertTrue(chunk_len in (0, 100))
        f0_total += chunk['f0'].sum()
    self.assertTrue(nrows == N)
    # Gauss summation formula
    expected_total = (N - 1) * (N / 2)
    self.assertTrue(f0_total == expected_total)
def test01(self):
    """Testing `whereblocks` with a `blen`"""
    N = int(1e4)
    source = np.fromiter(
        ((i, i * 2., i * 3) for i in xrange(N)), dtype='i4,f8,i8')
    table = blz.btable(source)
    counted, summed = 0, 0
    for chunk in blz.whereblocks(table, 'f0 <= f1', blen=100):
        counted += len(chunk)
        # All blocks should be of length 100, except the last one,
        # which should be 0
        self.assertTrue(len(chunk) in (0, 100))
        summed += chunk['f0'].sum()
    self.assertTrue(counted == N)
    # Gauss summation formula
    self.assertTrue(summed == (N - 1) * (N / 2))
def groupby(sreader, key, val, dtype, path=None, lines_per_chunk=LPC):
    """Group the `val` field in `sreader` stream of lines by `key` index.

    Parameters
    ----------
    sreader : iterator
        Iterator over a stream of CSV lines.
    key : string
        The name of the field to be grouped by.
    val : string
        The field name with the values that have to be grouped.  When it
        is None the complete records are grouped instead of one field.
        NOTE(review): whether `get_nptype` accepts None is not visible
        here -- confirm before relying on it.
    dtype : dynd dtype
        The DyND data type with all the fields of the CSV lines,
        including the `key` and `val` names.
    path : string
        The path of the file where the BLZ array with the final
        grouping will be stored.  If None (default), the BLZ will be
        stored in-memory (and hence non-persistent).
    lines_per_chunk : int
        The number of lines that have to be read to be grouped by
        in-memory.  For optimal performance, some experimentation
        should be needed.  The default value should work reasonably
        well, though.

    Returns
    -------
    output : BLZ table
        Returns a BLZ table with column names that are the groups
        resulting from the groupby operation.  The columns are filled
        with the `val` field of the lines delivered by `sreader`.

    Raises
    ------
    ValueError
        If `get_nptype` rejects `val`, or if `sreader` delivers no line
        at all (there is no group to build a table from).
    """
    try:
        nptype = get_nptype(dtype, val)
    except ValueError:
        raise ValueError("`val` should be a valid field")

    # The table is created by the first non-empty chunk; it stays None
    # when the stream is empty.
    ssby = None
    prev_keys = set()
    # Start reading chunks
    while True:
        ndbuf = nd.array(islice(sreader, lines_per_chunk), dtype)
        if len(ndbuf) == 0:
            break   # CSV data exhausted

        # Do the groupby for this chunk
        keys = getattr(ndbuf, key)
        vals = ndbuf if val is None else getattr(ndbuf, val)
        sby = nd.groupby(vals, keys)
        lkeys = nd.as_py(sby.groups)
        skeys = set(lkeys)
        # BLZ does not understand dynd objects (yet)
        sby = nd.as_py(sby.eval())

        if not prev_keys:
            # Add the initial keys to a BLZ table
            columns = [np.array(sby[i], nptype) for i in range(len(lkeys))]
            ssby = blz.btable(columns=columns, names=lkeys,
                              rootdir=path, mode='w')
        else:
            # Map every key to its first position once, instead of a
            # linear `lkeys.index()` search per key.
            position = {}
            for idx, chunk_key in enumerate(lkeys):
                position.setdefault(chunk_key, idx)
            # Have we new keys?  Add their values as new columns
            for new_key in skeys.difference(prev_keys):
                ssby.addcol(sby[position[new_key]], new_key, dtype=nptype)
            # Now fill the pre-existing keys by appending the values
            for existing_key in skeys.intersection(prev_keys):
                ssby[existing_key].append(sby[position[existing_key]])

        # Add the new keys to the existing ones
        prev_keys |= skeys

    if ssby is None:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError("`sreader` did not deliver any line to group")

    # Before returning, flush all data into disk
    if path is not None:
        ssby.flush()
    return ssby
# Benchmark set-up: time element-by-element retrieval from a numpy array
# and from a compressed barray holding the same data.
# N must be an integer because it is used as an array shape below.
N = int(1e7)    # the number of elements in x
M = 100000      # the elements to get
clevel = 1      # the compression level

print("Creating inputs with %d elements..." % N)

bparams = blz.bparams(clevel)

#x = np.arange(N)
x = np.zeros(N, dtype="f8")
y = x.copy()
z = x.copy()
cx = blz.barray(x, bparams=bparams)
cy = cx.copy()
cz = cx.copy()
ct = blz.btable((cx, cy, cz), names=['x','y','z'])
t = ct[:]

print("Starting benchmark now for getting %d elements..." % M)
# Retrieve from a ndarray
t0 = time()
vals = [x[i] for i in xrange(0, M, 3)]
print("Time for array--> %.3f" % (time()-t0,))
print("vals--> %d" % len(vals))

#blz.set_num_threads(blz.ncores//2)

# Retrieve from a barray
t0 = time()
cvals = [cx[i] for i in xrange(0, M, 3)]
#cvals = cx[:M:3][:].tolist()
# Benchmark set-up: time element-by-element retrieval from a numpy array
# and from a compressed barray holding the same data.
# N must be an integer because it is used as an array shape below.
N = int(1e7)    # the number of elements in x
M = 100000      # the elements to get
clevel = 1      # the compression level

print("Creating inputs with %d elements..." % N)

bparams = blz.bparams(clevel)

#x = np.arange(N)
x = np.zeros(N, dtype="f8")
y = x.copy()
z = x.copy()
cx = blz.barray(x, bparams=bparams)
cy = cx.copy()
cz = cx.copy()
ct = blz.btable((cx, cy, cz), names=['x', 'y', 'z'])
t = ct[:]

print("Starting benchmark now for getting %d elements..." % M)
# Retrieve from a ndarray
t0 = time()
vals = [x[i] for i in xrange(0, M, 3)]
print("Time for array--> %.3f" % (time() - t0, ))
print("vals--> %d" % len(vals))

#blz.set_num_threads(blz.ncores//2)

# Retrieve from a barray
t0 = time()
cvals = [cx[i] for i in xrange(0, M, 3)]
#cvals = cx[:M:3][:].tolist()
def array(obj, dshape=None, ddesc=None):
    """Create a Blaze array.

    Parameters
    ----------
    obj : array_like
        Initial contents for the array.

    dshape : datashape
        The datashape for the resulting array. By default the
        datashape will be inferred from data. If an explicit dshape is
        provided, the input data will be coerced into the provided
        dshape.

    ddesc : data descriptor instance
        This comes with the necessary info for storing the data.  If
        None, a DyND_DDesc will be used.

    Returns
    -------
    out : a concrete blaze array.

    Raises
    ------
    ValueError
        If neither `obj` nor `ddesc` is given, if `obj` is a DDesc and
        `ddesc` is given too, if `obj` is given together with a DyND
        `ddesc`, or if no dynd array can be built from `obj`.

    """
    dshape = _normalize_dshape(dshape)

    # Coerce concrete (non-generator) input into the requested datashape
    if ((obj is not None) and
            (not inspect.isgenerator(obj)) and
            (dshape is not None)):
        dt = ndt.type(str(dshape))
        if dt.ndim > 0:
            obj = nd.array(obj, type=dt, access='rw')
        else:
            obj = nd.array(obj, dtype=dt, access='rw')

    if obj is None and ddesc is None:
        raise ValueError('you need to specify at least `obj` or `ddesc`')

    if isinstance(obj, Array):
        return obj
    elif isinstance(obj, DDesc):
        if ddesc is None:
            ddesc = obj
            return Array(ddesc)
        else:
            raise ValueError(('you cannot specify `ddesc` when `obj` '
                              'is already a DDesc instance'))

    if ddesc is None:
        # Use a dynd ddesc by default
        try:
            dynd_arr = nd.asarray(obj, access='rw')
        except Exception:
            # `Exception` rather than a bare except, so KeyboardInterrupt
            # and SystemExit are not turned into a ValueError.  The
            # one-element tuple keeps the formatting valid when `obj` is
            # itself a tuple.
            raise ValueError(('failed to construct a dynd array from '
                              'object %r') % (obj,))
        ddesc = DyND_DDesc(dynd_arr)
        return Array(ddesc)

    # The DDesc has been specified
    if isinstance(ddesc, DyND_DDesc):
        if obj is not None:
            raise ValueError(('you cannot specify simultaneously '
                              '`obj` and a DyND `ddesc`'))
        return Array(ddesc)
    elif isinstance(ddesc, BLZ_DDesc):
        if inspect.isgenerator(obj):
            dt = None if dshape is None else to_numpy_dtype(dshape)
            # TODO: Generator logic could go inside barray
            ddesc.blzarr = blz.fromiter(obj, dtype=dt, count=-1,
                                        rootdir=ddesc.path, mode=ddesc.mode,
                                        **ddesc.kwargs)
        else:
            if isinstance(obj, nd.array):
                obj = nd.as_numpy(obj)
            # Record measures are stored as a btable, anything else as
            # a barray
            if dshape and isinstance(dshape.measure, datashape.Record):
                ddesc.blzarr = blz.btable(obj, rootdir=ddesc.path,
                                          mode=ddesc.mode, **ddesc.kwargs)
            else:
                ddesc.blzarr = blz.barray(obj, rootdir=ddesc.path,
                                          mode=ddesc.mode, **ddesc.kwargs)
    elif isinstance(ddesc, HDF5_DDesc):
        if isinstance(obj, nd.array):
            obj = nd.as_numpy(obj)
        with tb.open_file(ddesc.path, mode=ddesc.mode) as f:
            where, name = split_path(ddesc.datapath)
            if dshape and isinstance(dshape.measure, datashape.Record):
                # Convert the structured array to unaligned dtype
                # We need that because PyTables only accepts unaligned types,
                # which are the default in NumPy
                obj = np.array(obj, datashape.to_numpy_dtype(dshape.measure))
                f.create_table(where, name, filters=ddesc.filters, obj=obj)
            else:
                f.create_earray(where, name, filters=ddesc.filters, obj=obj)
        ddesc.mode = 'a'  # change into 'a'ppend mode for further operations

    return Array(ddesc)
def array(obj, dshape=None, ddesc=None):
    """Create a Blaze array.

    Parameters
    ----------
    obj : array_like
        Initial contents for the array.

    dshape : datashape
        The datashape for the resulting array. By default the
        datashape will be inferred from data. If an explicit dshape is
        provided, the input data will be coerced into the provided
        dshape.

    ddesc : data descriptor instance
        This comes with the necessary info for storing the data.  If
        None, a DyND_DDesc will be used.

    Returns
    -------
    out : a concrete blaze array.

    Raises
    ------
    ValueError
        If neither `obj` nor `ddesc` is given, if `obj` is a DDesc and
        `ddesc` is given too, if `obj` is given together with a DyND
        `ddesc`, or if no dynd array can be built from `obj`.

    """
    dshape = _normalize_dshape(dshape)

    # Coerce concrete (non-generator) input into the requested datashape
    if ((obj is not None) and
            (not inspect.isgenerator(obj)) and
            (dshape is not None)):
        dt = ndt.type(str(dshape))
        if dt.ndim > 0:
            obj = nd.array(obj, type=dt, access='rw')
        else:
            obj = nd.array(obj, dtype=dt, access='rw')

    if obj is None and ddesc is None:
        raise ValueError('you need to specify at least `obj` or `ddesc`')

    if isinstance(obj, Array):
        return obj
    elif isinstance(obj, DDesc):
        if ddesc is None:
            ddesc = obj
            return Array(ddesc)
        else:
            raise ValueError(('you cannot specify `ddesc` when `obj` '
                              'is already a DDesc instance'))

    if ddesc is None:
        # Use a dynd ddesc by default
        try:
            dynd_arr = nd.asarray(obj, access='rw')
        except Exception:
            # `Exception` rather than a bare except, so KeyboardInterrupt
            # and SystemExit are not turned into a ValueError.  The
            # one-element tuple keeps the formatting valid when `obj` is
            # itself a tuple.
            raise ValueError(('failed to construct a dynd array from '
                              'object %r') % (obj,))
        ddesc = DyND_DDesc(dynd_arr)
        return Array(ddesc)

    # The DDesc has been specified
    if isinstance(ddesc, DyND_DDesc):
        if obj is not None:
            raise ValueError(('you cannot specify simultaneously '
                              '`obj` and a DyND `ddesc`'))
        return Array(ddesc)
    elif isinstance(ddesc, BLZ_DDesc):
        if inspect.isgenerator(obj):
            dt = None if dshape is None else to_numpy_dtype(dshape)
            # TODO: Generator logic could go inside barray
            ddesc.blzarr = blz.fromiter(obj, dtype=dt, count=-1,
                                        rootdir=ddesc.path, mode=ddesc.mode,
                                        **ddesc.kwargs)
        else:
            if isinstance(obj, nd.array):
                obj = nd.as_numpy(obj)
            # Record measures are stored as a btable, anything else as
            # a barray
            if dshape and isinstance(dshape.measure, datashape.Record):
                ddesc.blzarr = blz.btable(
                    obj, rootdir=ddesc.path, mode=ddesc.mode, **ddesc.kwargs)
            else:
                ddesc.blzarr = blz.barray(
                    obj, rootdir=ddesc.path, mode=ddesc.mode, **ddesc.kwargs)
    elif isinstance(ddesc, HDF5_DDesc):
        if isinstance(obj, nd.array):
            obj = nd.as_numpy(obj)
        with tb.open_file(ddesc.path, mode=ddesc.mode) as f:
            where, name = split_path(ddesc.datapath)
            if dshape and isinstance(dshape.measure, datashape.Record):
                # Convert the structured array to unaligned dtype
                # We need that because PyTables only accepts unaligned types,
                # which are the default in NumPy
                obj = np.array(obj, datashape.to_numpy_dtype(dshape.measure))
                f.create_table(where, name, filters=ddesc.filters, obj=obj)
            else:
                f.create_earray(where, name, filters=ddesc.filters, obj=obj)
        ddesc.mode = 'a'  # change into 'a'ppend mode for further operations

    return Array(ddesc)