def blz_llvec_fun(act, pred, output_vector_type="numpy"):
    """Return the element-wise log-loss between actuals and predictions."""
    epsilon = 1e-15
    # Clip predictions away from 0 and 1 so that log() stays finite.
    # Note: this mutates `pred` in place.
    pred[pred < epsilon] = epsilon
    pred[pred > (1 - epsilon)] = 1 - epsilon
    bt = blz.btable([act, pred], names=["y", "py"])
    if output_vector_type == "numpy":
        return bt.eval("-y*log(py) - (1-y)*log(1-py)", out_flavor="numpy")
    else:
        return bt.eval("-y*log(py) - (1-y)*log(1-py)")
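A quick usage sketch for the function above (the input values are illustrative; `blz` must already be imported, as in the snippet). Because the clipping step mutates `pred` in place, pass a copy if you need the original predictions afterwards:

import numpy as np

act = np.array([1., 0., 1.])
pred = np.array([0.9, 0.2, 0.6])
losses = blz_llvec_fun(act, pred)  # NumPy vector of per-row log-losses
print losses.mean()                # overall mean log-loss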
Example #2
 def test07(self):
     """Testing `wherechunks` method with a `limit`, `skip` parameter"""
     N, M = int(1e4), 101
     ra = np.fromiter(((i, i*2., i*3) for i in xrange(N)), dtype='i4,f8,i8')
     t = blz.btable(ra)
     l, s = 0, 0
     for block in blz.whereblocks(t, 'f1 < f2', limit=N-M-2, skip=M):
         l += len(block)
         s += block['f0'].sum()
     self.assert_(l == N - M - 2)
     self.assert_(s == np.arange(M+1, N-1).sum())
Example #3
 def test05(self):
     """Testing `wherechunks` method with a `limit` parameter"""
     N, M = int(1e4), 101
     ra = np.fromiter(((i, i*2., i*3) for i in xrange(N)), dtype='i4,f8,i8')
     t = blz.btable(ra)
     l, s = 0, 0
     for block in blz.whereblocks(t, 'f1 < f2', limit=M):
         l += len(block)
         s += block['f0'].sum()
     self.assert_(l == M)
     self.assert_(s == M * ((M + 1) / 2))  # Gauss summation formula
Example #4
 def test00(self):
     """Testing `wherechunks` method with only an expression"""
     N = int(1e4)
     ra = np.fromiter(((i, i*2., i*3) for i in xrange(N)), dtype='i4,f8,i8')
     t = blz.btable(ra)
     l, s = 0, 0
     for block in blz.whereblocks(t, 'f1 < f2'):
         l += len(block)
         s += block['f0'].sum()
     self.assert_(l == N - 1)
     self.assert_(s == (N - 1) * (N / 2))  # Gauss summation formula
Example #7
 def test03(self):
     """Testing `wherechunks` method with a `outfields` with 1 field"""
     N = int(1e4)
     ra = np.fromiter(((i, i, i * 3) for i in xrange(N)), dtype='i4,f8,i8')
     t = blz.btable(ra)
     l, s = 0, 0
     for block in blz.whereblocks(t, 'f1 < f2', outfields=('f1', )):
         self.assert_(block.dtype.names == ('f1', ))
         l += len(block)
         s += block['f1'].sum()
     self.assert_(l == N - 1)
     self.assert_(s == (N - 1) * (N / 2))  # Gauss summation formula
Example #8
 def test02(self):
     """Testing `wherechunks` method with a `outfields` with 2 fields"""
     N = int(1e4)
     ra = np.fromiter(((i, i, i*3) for i in xrange(N)), dtype='i4,f8,i8')
     t = blz.btable(ra)
     l, s = 0, 0
     for block in blz.whereblocks(t, 'f1 < f2', outfields=('f1','f2')):
         self.assert_(block.dtype.names == ('f1','f2'))
         l += len(block)
         s += block['f1'].sum()
     self.assert_(l == N - 1)
     self.assert_(s == (N - 1) * (N / 2))  # Gauss summation formula
Example #10
def modelPredictor(modelsPath_modelIndex_dataPath_colNames_tuple):
    """
    Input: A tuple, with following two attributes (with order):
            modelsPath: string, the path to the trained models. (pickle file)
            modelIndex: integer, the index of the model to predict.
            dataPath: string, the path to the data.
            colNames: a list of strings, column names of the output table. It should be like ["Id", "V1", ...]
    Output: A btable, consists of Id column, Predicted column and the data.
    
    Notes:
    modelPredictor will create following directories for you if they do not exist.
            1. Model_No{modelIndex}_predicted_array: it will be under the dataPath.
    """
    # Set up the necessary constants.
    divideN = 300000
    modelsPath, modelIndex, dataPath, colNames = modelsPath_modelIndex_dataPath_colNames_tuple
    def data_abspath(colname):
        return os.path.abspath(os.path.join(dataPath, colname))
    with open(modelsPath, "rb") as rf:
        models = pickle.load(rf)
    model = models[modelIndex]
    del models
    
    # Read in data with btable.
    Id = blz.open(os.path.join(dataPath, colNames[0]))
    totalN = len(Id)
    # NB: Python 2 integer division; the nodes mark chunk boundaries from 0 to totalN.
    if totalN % divideN == 0:
        nodes_list = [i * divideN for i in range(totalN / divideN + 1)]
    else:
        nodes_list = [i * divideN for i in range(totalN / divideN + 1)] + [totalN]
    nodes_pair_list = zip(nodes_list[:-1], nodes_list[1:])
    
    # Prediction.
    y_predict = np.zeros(totalN)
    print "[Model No.{modelIndex}] Prediction process begins.".format(modelIndex = modelIndex)
    for begin, end in nodes_pair_list:
        print "[Model No.{modelIndex}] Processing {begin} ~ {end} observations.".format(modelIndex=modelIndex, begin = begin + 1, end = end)
        columns = [blz.open(os.path.join(dataPath, colname))[begin:end] for colname in colNames[1:]]
        X = np.column_stack(columns)
        temp = model.predict(X)
        y_predict[begin:end] = temp
    
    columns = [Id, blz.barray(y_predict)]
    data_rootdir = os.path.join(dataPath, "Model_No{modelIndex}_predicted_array".format(modelIndex = modelIndex))
    if os.path.exists(data_rootdir):
        print "Removing old result_table directory for a new btable."
        shutil.rmtree(data_rootdir)  # needs `import shutil` at the top of the module
    final_table = blz.btable(columns = columns, names = ["Id", "Predict"], rootdir = data_rootdir)
    print "The result_table btable rootdir is under {path}".format(path=data_rootdir)
Example #11
 def create_kaggle_submit_csv(self, submit_format="%d,%.6f"):
     assert "predict_proba" in self.list_all_predictions["testing"]
     prediction_prob = self.load_prediction_blz(datatype="testing", valuetype="predict_proba")[:,1]
     ids_barray = blz.open(os.path.join(tools.TESTING_BLZ_PATH,TESTING_COLUMN_NAMES[0]))
     bt = blz.btable(columns=[ids_barray,prediction_prob], names=["Id","Predicted"])
     all_results = [submit_format % tuple(xx) for xx in bt.iter()]
     all_results_string = "\n".join([",".join(bt.names)] + all_results)
     
     submit_filename = "%s_%s.csv" % (self.model_id, datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
     
     submit_filepath = os.path.join(SUBMITS_PATH,submit_filename)
     
     with open(submit_filepath,"w") as wf:
         wf.write(all_results_string)
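With the default `submit_format`, the written file begins with the joined column names followed by one formatted row per id; an illustrative excerpt:

Id,Predicted
1,0.532100
2,0.104387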
Example #12
 def test01(self):
     """Testing `wherechunks` method with a `blen`"""
     N = int(1e4)
     ra = np.fromiter(((i, i*2., i*3) for i in xrange(N)), dtype='i4,f8,i8')
     t = blz.btable(ra)
     l, s = 0, 0
     for block in blz.whereblocks(t, 'f0 <= f1', blen=100):
         l += len(block)
         # All blocks should be of length 100, except the last one,
         # which should be 0
         self.assert_(len(block) in (0, 100))
         s += block['f0'].sum()
     self.assert_(l == N)
     self.assert_(s == (N - 1) * (N / 2))  # Gauss summation formula
Example #14
def groupby(sreader, key, val, dtype, path=None, lines_per_chunk=LPC):
    """Group the `val` field in `sreader` stream of lines by `key` index.

    Parameters
    ----------
    sreader : iterator
        Iterator over a stream of CSV lines.
    key : string
        The name of the field to be grouped by.
    val : string
        The field name with the values that have to be grouped.
    dtype : dynd dtype
        The DyND data type with all the fields of the CSV lines,
        including the `key` and `val` names.
    path : string
        The path of the file where the BLZ array with the final
        grouping will be stored.  If None (default), the BLZ will be
        stored in-memory (and hence non-persistent).
    lines_per_chunk : int
        The number of lines that are read and grouped by in-memory
        at a time.  For optimal performance some experimentation
        may be needed.  The default value should work reasonably
        well, though.

    Returns
    -------
    output : BLZ table
        Returns a BLZ table with column names that are the groups
        resulting from the groupby operation.  The columns are filled
        with the `val` field of the lines delivered by `sreader`.

    """

    try:
        nptype = get_nptype(dtype, val)
    except ValueError:
        raise ValueError("`val` should be a valid field")

    # Start reading chunks
    prev_keys = set()
    ssby = None  # holds the output btable; stays None if `sreader` is empty
    while True:
        ndbuf = nd.array(islice(sreader, lines_per_chunk), dtype)
        if len(ndbuf) == 0: break   # CSV data exhausted

        # Do the groupby for this chunk
        keys = getattr(ndbuf, key)
        if val is None:
            vals = ndbuf
        else:
            vals = getattr(ndbuf, val)
        sby = nd.groupby(vals, keys)
        lkeys = nd.as_py(sby.groups)
        skeys = set(lkeys)
        # BLZ does not understand dynd objects (yet)
        sby = nd.as_py(sby.eval())

        if len(prev_keys) == 0:
            # Add the initial keys to a BLZ table
            columns = [np.array(sby[i], nptype) for i in range(len(lkeys))]
            ssby = blz.btable(columns=columns, names=lkeys, rootdir=path,
                              mode='w')
        else:
            # Do we have new keys?
            new_keys = skeys.difference(prev_keys)
            for new_key in new_keys:
                # Get the index of the new key
                idx = lkeys.index(new_key)
                # and add the values as a new column
                ssby.addcol(sby[idx], new_key, dtype=nptype)
            # Now fill the pre-existing keys
            existing_keys = skeys.intersection(prev_keys)
            for existing_key in existing_keys:
                # Get the index of the existing key
                idx = lkeys.index(existing_key)
                # and append the values here
                ssby[existing_key].append(sby[idx])

        # Add the new keys to the existing ones
        prev_keys |= skeys

    # Before returning, flush all data to disk
    if path is not None and ssby is not None:
        ssby.flush()
    return ssby
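A hedged usage sketch for `groupby` (the CSV layout, the field names, and the dynd struct type are illustrative assumptions; it relies on the same `nd`, `blz`, and `islice` imports as the function above):

import csv
from dynd import ndt

# Hypothetical CSV with a `key` field to group by and a `val` field to collect.
dt = ndt.type('{key: string, val: float64}')
with open('data.csv') as f:
    sreader = csv.reader(f)
    next(sreader)  # skip the header line
    ssby = groupby(sreader, key='key', val='val', dtype=dt, path=None)
print ssby.names  # one column name per distinct key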
Example #16
N = int(1e7)  # the number of elements in x
M = 100000    # the elements to get
clevel = 1    # the compression level

print "Creating inputs with %d elements..." % N

bparams = blz.bparams(clevel)

#x = np.arange(N)
x = np.zeros(N, dtype="f8")
y = x.copy()
z = x.copy()
cx = blz.barray(x, bparams=bparams)
cy = cx.copy()
cz = cx.copy()
ct = blz.btable((cx, cy, cz), names=['x','y','z'])
t = ct[:]

print "Starting benchmark now for getting %d elements..." % M
# Retrieve from a ndarray
t0 = time()
vals = [x[i] for i in xrange(0, M, 3)]
print "Time for array--> %.3f" % (time()-t0,)
print "vals-->", len(vals)

#blz.set_num_threads(blz.ncores//2)

# Retrieve from a barray
t0 = time()
cvals = [cx[i] for i in xrange(0, M, 3)]
#cvals = cx[:M:3][:].tolist()
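The snippet above is cut off before it reports the barray timing; a minimal completion mirroring the ndarray timing lines earlier (the exact wording of the original output is an assumption):

print "Time for barray--> %.3f" % (time() - t0,)
print "cvals-->", len(cvals)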
Example #18
def array(obj, dshape=None, ddesc=None):
    """Create a Blaze array.

    Parameters
    ----------
    obj : array_like
        Initial contents for the array.

    dshape : datashape
        The datashape for the resulting array. By default the
        datashape will be inferred from data. If an explicit dshape is
        provided, the input data will be coerced into the provided
        dshape.

    ddesc : data descriptor instance
        This comes with the necessary info for storing the data.  If
        None, a DyND_DDesc will be used.

    Returns
    -------
    out : a concrete blaze array.

    """
    dshape = _normalize_dshape(dshape)

    if ((obj is not None) and (not inspect.isgenerator(obj))
            and (dshape is not None)):
        dt = ndt.type(str(dshape))
        if dt.ndim > 0:
            obj = nd.array(obj, type=dt, access='rw')
        else:
            obj = nd.array(obj, dtype=dt, access='rw')

    if obj is None and ddesc is None:
        raise ValueError('you need to specify at least `obj` or `ddesc`')

    if isinstance(obj, Array):
        return obj
    elif isinstance(obj, DDesc):
        if ddesc is None:
            ddesc = obj
            return Array(ddesc)
        else:
            raise ValueError(('you cannot specify `ddesc` when `obj` '
                              'is already a DDesc instance'))

    if ddesc is None:
        # Use a dynd ddesc by default
        try:
            array = nd.asarray(obj, access='rw')
        except Exception:
            raise ValueError(('failed to construct a dynd array from '
                              'object %r') % obj)
        ddesc = DyND_DDesc(array)
        return Array(ddesc)

    # The DDesc has been specified
    if isinstance(ddesc, DyND_DDesc):
        if obj is not None:
            raise ValueError(('you cannot specify simultaneously '
                              '`obj` and a DyND `ddesc`'))
        return Array(ddesc)
    elif isinstance(ddesc, BLZ_DDesc):
        if inspect.isgenerator(obj):
            dt = None if dshape is None else to_numpy_dtype(dshape)
            # TODO: Generator logic could go inside barray
            ddesc.blzarr = blz.fromiter(obj,
                                        dtype=dt,
                                        count=-1,
                                        rootdir=ddesc.path,
                                        mode=ddesc.mode,
                                        **ddesc.kwargs)
        else:
            if isinstance(obj, nd.array):
                obj = nd.as_numpy(obj)
            if dshape and isinstance(dshape.measure, datashape.Record):
                ddesc.blzarr = blz.btable(obj,
                                          rootdir=ddesc.path,
                                          mode=ddesc.mode,
                                          **ddesc.kwargs)
            else:
                ddesc.blzarr = blz.barray(obj,
                                          rootdir=ddesc.path,
                                          mode=ddesc.mode,
                                          **ddesc.kwargs)
    elif isinstance(ddesc, HDF5_DDesc):
        if isinstance(obj, nd.array):
            obj = nd.as_numpy(obj)
        with tb.open_file(ddesc.path, mode=ddesc.mode) as f:
            where, name = split_path(ddesc.datapath)
            if dshape and isinstance(dshape.measure, datashape.Record):
                # Convert the structured array to unaligned dtype
                # We need that because PyTables only accepts unaligned types,
                # which are the default in NumPy
                obj = np.array(obj, datashape.to_numpy_dtype(dshape.measure))
                f.create_table(where, name, filters=ddesc.filters, obj=obj)
            else:
                f.create_earray(where, name, filters=ddesc.filters, obj=obj)
        ddesc.mode = 'a'  # change into 'a'ppend mode for further operations

    return Array(ddesc)
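A hedged usage sketch for this `array` constructor (the `BLZ_DDesc` import path and the on-disk path are assumptions based on the code above):

import blaze
from blaze.datadescriptor import BLZ_DDesc  # import path is an assumption

# In-memory array; the datashape is inferred from the data.
a = blaze.array([1, 2, 3])

# Persistent, BLZ-backed array with an explicit dshape.
ddesc = BLZ_DDesc(path='my_array.blz', mode='w')
b = blaze.array([[1, 2], [3, 4]], dshape='2 * 2 * int32', ddesc=ddesc)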