Example No. 1
def tables(docompute, dowrite, complib, verbose):

    # Filenames
    ifilename = os.path.join(OUT_DIR, "expression-inputs.h5")
    ofilename = os.path.join(OUT_DIR, "expression-outputs.h5")

    # Filters
    shuffle = True
    if complib == 'blosc':
        filters = tb.Filters(complevel=1, complib='blosc', shuffle=shuffle)
    elif complib == 'lzo':
        filters = tb.Filters(complevel=1, complib='lzo', shuffle=shuffle)
    elif complib == 'zlib':
        filters = tb.Filters(complevel=1, complib='zlib', shuffle=shuffle)
    else:
        filters = tb.Filters(complevel=0, shuffle=False)
    if verbose:
        print("Will use filters:", filters)

    if dowrite:
        f = tb.open_file(ifilename, 'w')

        # Build input arrays
        t0 = time()
        root = f.root
        a = f.create_carray(root, 'a', tb.Float32Atom(),
                            shape, filters=filters)
        b = f.create_carray(root, 'b', tb.Float32Atom(),
                            shape, filters=filters)
        if verbose:
            print("chunkshape:", a.chunkshape)
            print("chunksize:", np.prod(a.chunkshape) * a.dtype.itemsize)
        #row = np.linspace(0, 1, ncols)
        row = np.arange(0, ncols, dtype='float32')
        for i in range(nrows):
            a[i] = row * (i + 1)
            b[i] = row * (i + 1) * 2
        f.close()
        print("[tables.Expr] Time for creating inputs:", round(time() - t0, 3))

    if docompute:
        f = tb.open_file(ifilename, 'r')
        fr = tb.open_file(ofilename, 'w')
        a = f.root.a
        b = f.root.b
        r1 = fr.create_carray(fr.root, 'r1', tb.Float32Atom(), shape,
                              filters=filters)
        # The expression
        e = tb.Expr(expr)
        e.set_output(r1)
        t0 = time()
        e.eval()
        if verbose:
            print("First ten values:", r1[0, :10])
        f.close()
        fr.close()
        print("[tables.Expr] Time for computing & save:",
              round(time() - t0, 3))
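The driver above assumes several module-level names (OUT_DIR, shape, nrows, ncols, expr) defined elsewhere in the benchmark script. A minimal sketch of that setup, with hypothetical values, could look like this:

# Hypothetical module-level setup assumed by the tables() driver above.
import os
from time import time

import numpy as np
import tables as tb

OUT_DIR = "/tmp"               # where the HDF5 files go (assumed)
shape = (1000, 10000)          # (nrows, ncols) of the input carrays
nrows, ncols = shape
expr = "a * b + 1"             # any numexpr-compatible expression over 'a' and 'b'

# Write the inputs, then evaluate the expression out of core:
tables(docompute=True, dowrite=True, complib='blosc', verbose=True)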
Example No. 2
def compute_up(expr, data, **kwargs):
    if len(expr._children) != 1:
        raise ValueError("Only one child in Broadcast allowed")

    s = expr._scalars[0]
    cols = [s[field] for field in s.fields]
    expr_str = print_numexpr(cols, expr._scalar_expr)
    uservars = dict((c, getattr(data.cols, c)) for c in s.fields)
    e = tb.Expr(expr_str, uservars=uservars, truediv=True)
    return e.eval()
Example No. 3
def _filter_inds(col_dict, query):

    match_string, uservars = _build_search_string(query)
    # link user vars to existing columns in memory
    for name in list(uservars.keys()):
        uservars[name] = col_dict[uservars[name]]

    # run filtering and subselect columns
    inds = tb.Expr(match_string, uservars=uservars).eval()

    return inds
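Because _build_search_string is not shown, the key point here is simply that tb.Expr accepts a boolean expression over plain NumPy arrays passed via uservars. A self-contained sketch of that pattern (column names and query are made up):

# Minimal sketch of the uservars filtering pattern used by _filter_inds.
import numpy as np
import tables as tb

col_dict = {
    'mass': np.random.rand(1000),
    'z': np.random.rand(1000),
}

# An expression string referencing user-defined names...
match_string = "(v0 > 0.5) & (v1 < 0.2)"
# ...and a mapping from those names to in-memory columns.
uservars = {'v0': col_dict['mass'], 'v1': col_dict['z']}

inds = tb.Expr(match_string, uservars=uservars).eval()  # boolean NumPy array
print(inds.sum(), "rows match")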
Example No. 4
def multiply_table_column_by():
    path = "/skynet3_rech1/huziy/hdf_store/quebec_0.1_crcm5-r_spinup.hdf"
    var_name = "AV"
    h = tb.open_file(path, mode="a")
    var_table = h.get_node("/", var_name)
    coef = 3 * 60 * 60  # output step
    expr = tb.Expr("c * m", uservars={"c": var_table.cols.field, "m": coef})
    column = var_table.cols.field
    expr.set_output(column)
    expr.eval()

    var_table.flush()
    h.close()
Example No. 5
def find_a_model(z0, z1, h0, h1):
    """ Find a SAM galaxy within the range of redshift and h band magnitude of the hubble image """
    fir, d, p, h = readhdf5()
    z_sam = fir.root.data.col('z')
    h_sam = d.root.data.col('wfc3f160w')
    bc = tables.Expr(
        '(z_sam>%.3f) & (z_sam<%.3f) & (h_sam>%.3f) & (h_sam<%.3f)' %
        (z0, z1, h0, h1)).eval()
    igal = np.nonzero(bc)
    try:
        rr = np.random.randint(0, len(igal[0]))
    except ValueError:
        return -99, len(igal[0])
    return igal[0][rr], len(igal[0])
Example No. 6
def compute_tables():
    """Compute the polynomial with tables.Expr."""
    f = tb.openFile(h5fname, "a")
    x = f.root.x  # get the x input
    # Create container for output
    atom = tb.Atom.from_dtype(dtype)
    filters = tb.Filters(complib=clib, complevel=clevel)
    r = f.createCArray(f.root, "r", atom=atom, shape=(N, ), filters=filters)

    # Do the actual computation and store in output
    ex = tb.Expr(expr)  # parse the expression
    ex.setOutput(r)  # where is stored the result?
    # when commented out, the result goes in-memory
    ex.eval()  # evaluate!

    f.close()
    print_filesize(h5fname)
    return N
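compute_tables() likewise relies on globals (h5fname, dtype, clib, clevel, N, expr), a print_filesize helper, and an existing /x array in the file. A hedged sketch of that setup, assuming the input array is created beforehand:

# Hypothetical globals and input array assumed by compute_tables() above.
import numpy as np
import tables as tb

h5fname = "poly.h5"
N = 10 * 1000 * 1000
dtype = np.dtype('float64')
clib, clevel = 'zlib', 1
expr = "0.25*x**3 + 0.75*x**2 - 1.5*x - 2"   # the polynomial to evaluate

f = tb.open_file(h5fname, "w")
f.create_carray(f.root, "x", obj=np.linspace(0, 10, N),
                filters=tb.Filters(complib=clib, complevel=clevel))
f.close()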
Example No. 7
def get_expression_data(self, expression, table_loc=None, filename=None):
    import tables
    if table_loc is None:
        table_loc = self.data_path
    target_table = self.chest.get_node(table_loc)
    uv = target_table.colinstances
    # apply any shortcuts/macros
    expression = self.remap_distance_expressions(expression)
    # evaluate the math expression
    data = tables.Expr(expression, uv).eval()
    if filename is None:
        filename = self.get_active_name()
    elif filename == "all":
        return data
    # pick out the indices for only the active image
    indices = target_table.get_where_list(
        #'(omit==False) & (filename == "%s")' % self.get_active_name())
        '(filename == "%s")' % filename)
    # access the array data for those indices
    data = data[indices]
    return data
Example No. 8
def process_file(kind, prec, clevel, synth):

    if kind == "numpy":
        lib = "none"
    else:
        lib = kind
    if synth:
        prefix = 'synth/synth-'
    else:
        prefix = 'cellzome/cellzome-'
    iname = '%s/%s-%s%d-%s.h5' % (dirname, prefix, kind, clevel, prec)
    f = tb.open_file(iname, "r")
    a_ = f.root.maxarea
    b_ = f.root.mascotscore

    oname = '%s/%s-%s%d-%s-r.h5' % (dirname, prefix, kind, clevel, prec)
    f2 = tb.open_file(oname, "w")
    if lib == "none":
        filters = None
    else:
        filters = tb.Filters(complib=lib, complevel=clevel, shuffle=shuffle)
    if prec == "single":
        type_ = tb.Float32Atom()
    else:
        type_ = tb.Float64Atom()
    r = f2.create_carray('/', 'r', type_, a_.shape, filters=filters)

    if kind == "numpy":
        a2, b2 = a_[:], b_[:]
        t0 = time()
        r = eval(expression, {'a': a2, 'b': b2})
        print "%5.2f" % round(time() - t0, 3)
    else:
        expr = tb.Expr(expression, {'a': a_, 'b': b_})
        expr.set_output(r)
        expr.eval()
    f.close()
    f2.close()
    size = float(os.stat(iname)[6]) + float(os.stat(oname)[6])
    return size
Example No. 9
def MCLA(hdf5_file_name, cluster_runs, verbose=False, N_clusters_max=None):
    """Meta-CLustering Algorithm for a consensus function.
    
    Parameters
    ----------
    hdf5_file_name : file handle or string
    
    cluster_runs : array of shape (n_partitions, n_samples)
    
    verbose : bool, optional (default = False)
    
    N_clusters_max : int, optional (default = None)
    
    Returns
    -------
    A vector specifying the cluster label to which each sample has been assigned
    by the MCLA approximation algorithm for consensus clustering.

    Reference
    ---------
    A. Strehl and J. Ghosh, "Cluster Ensembles - A Knowledge Reuse Framework
    for Combining Multiple Partitions".
    In: Journal of Machine Learning Research, 3, pp. 583-617. 2002
    """

    print('\n*****')
    print('INFO: Cluster_Ensembles: MCLA: consensus clustering using MCLA.')

    if N_clusters_max is None:
        N_clusters_max = int(np.nanmax(cluster_runs)) + 1

    N_runs = cluster_runs.shape[0]
    N_samples = cluster_runs.shape[1]

    print(
        "INFO: Cluster_Ensembles: MCLA: preparing graph for meta-clustering.")

    hypergraph_adjacency = load_hypergraph_adjacency(hdf5_file_name)
    w = hypergraph_adjacency.sum(axis=1)

    N_rows = hypergraph_adjacency.shape[0]

    print(
        "INFO: Cluster_Ensembles: MCLA: done filling hypergraph adjacency matrix. "
        "Starting computation of Jaccard similarity matrix.")

    # Next, obtain a matrix of pairwise Jaccard similarity scores between the rows of the hypergraph adjacency matrix.
    with tables.open_file(hdf5_file_name, 'r+') as fileh:
        FILTERS = get_compression_filter(4 * (N_rows**2))

        similarities_MCLA = fileh.create_carray(fileh.root.consensus_group,
                                                'similarities_MCLA',
                                                tables.Float32Atom(),
                                                (N_rows, N_rows),
                                                "Matrix of pairwise Jaccard "
                                                "similarity scores",
                                                filters=FILTERS)

        scale_factor = 100.0

        print("INFO: Cluster_Ensembles: MCLA: "
              "starting computation of Jaccard similarity matrix.")

        squared_MCLA = hypergraph_adjacency.dot(
            hypergraph_adjacency.transpose())

        squared_sums = hypergraph_adjacency.sum(axis=1)
        squared_sums = np.squeeze(np.asarray(squared_sums))

        chunks_size = get_chunk_size(N_rows, 7)
        for i in range(0, N_rows, chunks_size):
            n_dim = min(chunks_size, N_rows - i)

            temp = squared_MCLA[i:min(i + chunks_size, N_rows), :].todense()
            temp = np.squeeze(np.asarray(temp))

            x = squared_sums[i:min(i + chunks_size, N_rows)]
            x = x.reshape(-1, 1)
            x = np.dot(x, np.ones((1, squared_sums.size)))

            y = np.dot(np.ones((n_dim, 1)), squared_sums.reshape(1, -1))

            temp = np.divide(temp, x + y - temp)
            temp *= scale_factor

            Jaccard_matrix = np.rint(temp)
            similarities_MCLA[i:min(i + chunks_size, N_rows)] = Jaccard_matrix

            del Jaccard_matrix, temp, x, y
            gc.collect()

    # Done computing the matrix of pairwise Jaccard similarity scores.
    print("INFO: Cluster_Ensembles: MCLA: done computing the matrix of "
          "pairwise Jaccard similarity scores.")

    cluster_labels = cmetis(hdf5_file_name, N_clusters_max, w)
    cluster_labels = one_to_max(cluster_labels)
    # After 'cmetis' returns, we are done with clustering hyper-edges

    # We are now ready to start the procedure meant to collapse meta-clusters.
    N_consensus = np.amax(cluster_labels) + 1

    fileh = tables.open_file(hdf5_file_name, 'r+')

    FILTERS = get_compression_filter(4 * N_consensus * N_samples)

    clb_cum = fileh.create_carray(
        fileh.root.consensus_group,
        'clb_cum',
        tables.Float32Atom(), (N_consensus, N_samples),
        'Matrix of mean memberships, forming meta-clusters',
        filters=FILTERS)

    chunks_size = get_chunk_size(N_samples, 7)
    for i in range(0, N_consensus, chunks_size):
        x = min(chunks_size, N_consensus - i)
        matched_clusters = np.where(cluster_labels == np.reshape(
            np.arange(i, min(i + chunks_size, N_consensus)), newshape=(x, 1)))
        M = np.zeros((x, N_samples))
        for j in range(x):
            coord = np.where(matched_clusters[0] == j)[0]
            M[j] = np.asarray(
                hypergraph_adjacency[matched_clusters[1][coord], :].mean(
                    axis=0))
        clb_cum[i:min(i + chunks_size, N_consensus)] = M

    # Done with collapsing the hyper-edges into a single meta-hyper-edge,
    # for each of the (N_consensus - 1) meta-clusters.

    del hypergraph_adjacency
    gc.collect()

    # Each object will now be assigned to its most associated meta-cluster.
    chunks_size = get_chunk_size(N_consensus, 4)
    N_chunks, remainder = divmod(N_samples, chunks_size)
    if N_chunks == 0:
        null_columns = np.where(clb_cum[:].sum(axis=0) == 0)[0]
    else:
        szumsz = np.zeros(0)
        for i in range(N_chunks):
            M = clb_cum[:, i * chunks_size:(i + 1) * chunks_size]
            szumsz = np.append(szumsz, M.sum(axis=0))
        if remainder != 0:
            M = clb_cum[:, N_chunks * chunks_size:N_samples]
            szumsz = np.append(szumsz, M.sum(axis=0))
        null_columns = np.where(szumsz == 0)[0]

    if null_columns.size != 0:
        print(
            "INFO: Cluster_Ensembles: MCLA: {} objects with all zero associations "
            "in 'clb_cum' matrix of meta-clusters.".format(null_columns.size))
        clb_cum[:, null_columns] = np.random.rand(N_consensus,
                                                  null_columns.size)

    random_state = np.random.RandomState()

    tmp = fileh.create_carray(fileh.root.consensus_group,
                              'tmp',
                              tables.Float32Atom(), (N_consensus, N_samples),
                              "Temporary matrix to help with "
                              "collapsing to meta-hyper-edges",
                              filters=FILTERS)

    chunks_size = get_chunk_size(N_samples, 2)
    N_chunks, remainder = divmod(N_consensus, chunks_size)
    if N_chunks == 0:
        tmp[:] = random_state.rand(N_consensus, N_samples)
    else:
        for i in range(N_chunks):
            tmp[i * chunks_size:(i + 1) * chunks_size] = random_state.rand(
                chunks_size, N_samples)
        if remainder != 0:
            tmp[N_chunks * chunks_size:N_consensus] = random_state.rand(
                remainder, N_samples)

    expr = tables.Expr("clb_cum + (tmp / 10000)")
    expr.set_output(clb_cum)
    expr.eval()

    expr = tables.Expr("abs(tmp)")
    expr.set_output(tmp)
    expr.eval()

    chunks_size = get_chunk_size(N_consensus, 2)
    N_chunks, remainder = divmod(N_samples, chunks_size)
    if N_chunks == 0:
        sum_diag = tmp[:].sum(axis=0)
    else:
        sum_diag = np.empty(0)
        for i in range(N_chunks):
            M = tmp[:, i * chunks_size:(i + 1) * chunks_size]
            sum_diag = np.append(sum_diag, M.sum(axis=0))
        if remainder != 0:
            M = tmp[:, N_chunks * chunks_size:N_samples]
            sum_diag = np.append(sum_diag, M.sum(axis=0))

    fileh.remove_node(fileh.root.consensus_group, "tmp")
    # The corresponding disk space will be freed after a call to 'fileh.close()'.

    inv_sum_diag = np.reciprocal(sum_diag.astype(float))

    if N_chunks == 0:
        clb_cum *= inv_sum_diag
        max_entries = np.amax(clb_cum, axis=0)
    else:
        max_entries = np.zeros(N_samples)
        for i in range(N_chunks):
            clb_cum[:, i * chunks_size:(i + 1) *
                    chunks_size] *= inv_sum_diag[i * chunks_size:(i + 1) *
                                                 chunks_size]
            max_entries[i * chunks_size:(i + 1) * chunks_size] = np.amax(
                clb_cum[:, i * chunks_size:(i + 1) * chunks_size], axis=0)
        if remainder != 0:
            clb_cum[:, N_chunks * chunks_size:N_samples] *= inv_sum_diag[
                N_chunks * chunks_size:N_samples]
            max_entries[N_chunks * chunks_size:N_samples] = np.amax(
                clb_cum[:, N_chunks * chunks_size:N_samples], axis=0)

    cluster_labels = np.zeros(N_samples, dtype=int)
    winner_probabilities = np.zeros(N_samples)

    chunks_size = get_chunk_size(N_samples, 2)
    for i in reversed(range(0, N_consensus, chunks_size)):
        ind = np.where(
            np.tile(max_entries, (
                min(chunks_size, N_consensus - i),
                1)) == clb_cum[i:min(i + chunks_size, N_consensus)])
        cluster_labels[ind[1]] = i + ind[0]
        winner_probabilities[ind[1]] = clb_cum[(ind[0] + i, ind[1])]

    # Done with competing for objects.

    cluster_labels = one_to_max(cluster_labels)

    print("INFO: Cluster_Ensembles: MCLA: delivering "
          "{} clusters.".format(np.unique(cluster_labels).size))
    print("INFO: Cluster_Ensembles: MCLA: average posterior "
          "probability is {}".format(np.mean(winner_probabilities)))
    if cluster_labels.size <= 7:
        print(
            "INFO: Cluster_Ensembles: MCLA: the winning posterior probabilities are:"
        )
        print(winner_probabilities)
        print(
            "'INFO: Cluster_Ensembles: MCLA: the full posterior probabilities are:"
        )
        print(clb_cum)

    fileh.remove_node(fileh.root.consensus_group, "clb_cum")
    fileh.close()

    return cluster_labels
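One detail worth noting in MCLA above: tables.Expr can write its result back into one of its own operands, which is how the random tie-breaking noise is added to clb_cum on disk. A tiny sketch of that in-place pattern, with made-up small arrays:

# Sketch of the in-place update pattern used by MCLA (hypothetical file and data).
import numpy as np
import tables as tb

with tb.open_file("inplace_demo.h5", "w") as f:
    clb = f.create_carray(f.root, "clb", obj=np.ones((4, 5), dtype='float32'))
    noise = f.create_carray(f.root, "noise",
                            obj=np.random.rand(4, 5).astype('float32'))

    e = tb.Expr("clb + (noise / 10000)")
    e.set_output(clb)   # the result is written back into the 'clb' operand
    e.eval()
    print(clb[0])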
Example No. 10
def CSPA(hdf5_file_name, cluster_runs, verbose=False, N_clusters_max=None):
    """Cluster-based Similarity Partitioning Algorithm for a consensus function.
    
    Parameters
    ----------
    hdf5_file_name : file handle or string
    
    cluster_runs : array of shape (n_partitions, n_samples)
    
    verbose : bool, optional (default = False)
    
    N_clusters_max : int, optional (default = None)
    
    Returns
    -------
    A vector specifying the cluster label to which each sample has been assigned
    by the CSPA heuristics for consensus clustering.

    Reference
    ---------
    A. Strehl and J. Ghosh, "Cluster Ensembles - A Knowledge Reuse Framework
    for Combining Multiple Partitions".
    In: Journal of Machine Learning Research, 3, pp. 583-617. 2002
    """

    print('*****')
    print("INFO: Cluster_Ensembles: CSPA: consensus clustering using CSPA.")

    if N_clusters_max is None:
        N_clusters_max = int(np.nanmax(cluster_runs)) + 1

    N_runs = cluster_runs.shape[0]
    N_samples = cluster_runs.shape[1]
    if N_samples > 20000:
        raise ValueError(
            "\nERROR: Cluster_Ensembles: CSPA: cannot efficiently "
            "deal with too large a number of cells.")

    hypergraph_adjacency = load_hypergraph_adjacency(hdf5_file_name)

    s = scipy.sparse.csr_matrix.dot(hypergraph_adjacency.transpose().tocsr(),
                                    hypergraph_adjacency)
    s = np.squeeze(np.asarray(s.todense()))

    del hypergraph_adjacency
    gc.collect()

    checks(np.divide(s, float(N_runs)), verbose)

    e_sum_before = s.sum()
    sum_after = 100000000.0
    scale_factor = sum_after / float(e_sum_before)

    with tables.open_file(hdf5_file_name, 'r+') as fileh:
        atom = tables.Float32Atom()
        FILTERS = get_compression_filter(4 * (N_samples**2))

        S = fileh.create_carray(fileh.root.consensus_group,
                                'similarities_CSPA',
                                atom, (N_samples, N_samples),
                                "Matrix of similarities arising "
                                "in Cluster-based Similarity Partitioning",
                                filters=FILTERS)

        expr = tables.Expr("s * scale_factor")
        expr.set_output(S)
        expr.eval()

        chunks_size = get_chunk_size(N_samples, 3)
        for i in range(0, N_samples, chunks_size):
            tmp = S[i:min(i + chunks_size, N_samples)]
            S[i:min(i + chunks_size, N_samples)] = np.rint(tmp)

    return metis(hdf5_file_name, N_clusters_max)
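Note that both CSPA and MCLA build expressions such as tables.Expr("s * scale_factor") without passing a uservars dict: when no mapping is given, Expr looks the names up in the caller's local and global namespaces. A minimal sketch of that resolution rule:

# Minimal sketch: names are resolved from the calling scope when uservars is omitted.
import numpy as np
import tables as tb

s = np.arange(12, dtype='float32').reshape(3, 4)   # stand-in for the similarity matrix
scale_factor = 10.0

result = tb.Expr("s * scale_factor").eval()        # evaluated with numexpr under the hood
print(result)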
Example No. 11
# Assumed setup: the original fragment uses h5, tb, np and time without defining them.
import time
import numpy as np
import tables as tb

h5 = tb.open_file('earray.h5', 'w')  # hypothetical output file

# Create a row-extendable array (EArray) with 1000 columns
n = 1000
ear = h5.create_earray(h5.root, 'ear', atom=tb.Float64Atom(), shape=(0, n))

# Populate chunkwise
time1 = time.time()
rand = np.random.standard_normal((n, n))
for i in range(750):
    ear.append(rand)
ear.flush()
print(time.time() - time1)
# 135.915999889 seconds

# Check the size logically and physically
print(ear)
print(ear.size_on_disk)

# Create target output array
out = h5.create_earray(h5.root, 'out', atom=tb.Float64Atom(), shape=(0, n))

# Create the expression, i.e. y = 3*sin(x) + sqrt(abs(x))
expr = tb.Expr('3*sin(ear)+sqrt(abs(ear))')
expr.set_output(out, append_mode=True)

time1 = time.time()
expr.eval()
print(time.time() - time1)
# 187.100000143 seconds

h5.close()
Example No. 12
def MCLA(cluster_runs, verbose=False, N_clusters_max=None):

    cluster_ensemble = []
    score = np.empty(0)

    if N_clusters_max is None:
        N_clusters_max = int(np.nanmax(cluster_runs)) + 1

    N_runs = cluster_runs.shape[0]
    N_samples = cluster_runs.shape[1]

    #Cluster_Ensembles: MCLA: preparing graph for meta-clustering.
    hypergraph_adjacency = build_hypergraph_adjacency(cluster_runs)
    w = hypergraph_adjacency.sum(axis=1)

    N_rows = hypergraph_adjacency.shape[0]

    # Next, obtain a matrix of pairwise Jaccard similarity scores between the rows of the hypergraph adjacency matrix.

    scale_factor = 100.0

    #starting computation of Jaccard similarity matrix.
    squared_MCLA = hypergraph_adjacency.dot(hypergraph_adjacency.transpose())

    squared_sums = hypergraph_adjacency.sum(axis=1)
    squared_sums = np.squeeze(np.asarray(squared_sums))

    chunks_size = get_chunk_size(N_rows, 7)
    for i in range(0, N_rows, chunks_size):
        n_dim = min(chunks_size, N_rows - i)

        temp = squared_MCLA[i:min(i + chunks_size, N_rows), :].todense()
        temp = np.squeeze(np.asarray(temp))

        x = squared_sums[i:min(i + chunks_size, N_rows)]
        x = x.reshape(-1, 1)
        x = np.dot(x, np.ones((1, squared_sums.size)))

        y = np.dot(np.ones((n_dim, 1)), squared_sums.reshape(1, -1))

        temp = np.divide(temp, x + y - temp)
        temp *= scale_factor

        Jaccard_matrix = np.rint(temp)
        # print(Jaccard_matrix)

        # del Jaccard_matrix, temp, x, y
        del temp, x, y
        gc.collect()

    # Done computing the matrix of pairwise Jaccard similarity scores.

    ####################################################################################################

    e_mat = Jaccard_matrix  # note: only the Jaccard block from the last chunk survives the loop above
    # print(e_mat[0])

    # print(N_rows)
    N_cols = e_mat.shape[1]

    w *= scale_factor
    w = np.rint(w)
    vwgt = []
    for sublist in w.tolist():
        for item in sublist:
            vwgt.append(int(item))

    # print(vwgt)

    diag_ind = np.diag_indices(N_rows)
    e_mat[diag_ind] = 0

    adjncy = []
    adjwgt = []
    xadj = []
    xadjind = 0
    xadj.append(0)  #first element always starts with 0

    chunks_size = get_chunk_size(N_cols, 7)
    for i in range(0, N_rows, chunks_size):
        M = e_mat[i:min(i + chunks_size, N_rows)]

        for j in range(M.shape[0]):
            edges = np.where(M[j] > 0)[0]
            weights = M[j, edges]

            xadjind += edges.size
            xadj.append(xadjind)

            adjncy.extend(edges)
            adjwgt.extend(weights)

    adjwgt = list(map(int, adjwgt))

    # max_w = np.max(vwgt)
    # min_w = np.min(vwgt)
    # vwgt_norm = (vwgt-min_w)/(max_w-min_w)
    # print("vwgt : ", vwgt)
    # print("vwgt_norm : ", vwgt_norm+1)

    # print("adjwgt : ", adjwgt)

    # N_rows = 12
    # N_clusters_max = 10
    # xadj = [0, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, ]
    # adjncy = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ]
    # vwgt = [90300, 200, 11600, 11400, 9600, 11600, 7200, 8600, 9700, 5800, 7100, 7900, ]
    # adjwgt = [13, 13, 11, 13, 8, 10, 11, 6, 8, 9, 2, 13, 13, 11, 13, 2, 8, 10, 11, 6, 8, 9, ]

    # print(cluster_runs)
    # print("xadj: ", xadj)
    # print("adjncy : ", adjncy)
    # print("vwgt : ", vwgt)
    # print("adjwgt : ", adjwgt)
    # print("\n")

    xadj = (idx_t * len(xadj))(*xadj)
    adjncy = (idx_t * len(adjncy))(*adjncy)
    adjwgt = (idx_t * len(adjwgt))(*adjwgt)
    vwgt = (idx_t * len(vwgt))(*vwgt)

    ncon = idx_t(1)

    G = METIS_Graph(idx_t(N_rows), ncon, xadj, adjncy, vwgt, None, adjwgt)
    # print(G)

    (edgecuts, parts) = metis.part_graph(G, N_clusters_max)
    cluster_labels = parts

    # print("parts")
    # print(parts)

    ##########################################################################################################

    cluster_labels = one_to_max(cluster_labels)
    # print(cluster_labels)
    # After 'metis' returns, we are done with clustering hyper-edges

    # We are now ready to start the procedure meant to collapse meta-clusters.
    N_consensus = np.amax(cluster_labels) + 1

    clb_cum = np.zeros(shape=(N_consensus, N_samples))

    chunks_size = get_chunk_size(N_samples, 7)
    for i in range(0, N_consensus, chunks_size):
        x = min(chunks_size, N_consensus - i)
        matched_clusters = np.where(cluster_labels == np.reshape(
            np.arange(i, min(i + chunks_size, N_consensus)), newshape=(x, 1)))
        M = np.zeros((x, N_samples))
        for j in range(x):
            coord = np.where(matched_clusters[0] == j)[0]
            M[j] = np.asarray(
                hypergraph_adjacency[matched_clusters[1][coord], :].mean(
                    axis=0))
        clb_cum[i:min(i + chunks_size, N_consensus)] = M

    # Done with collapsing the hyper-edges into a single meta-hyper-edge,
    # for each of the (N_consensus - 1) meta-clusters.

    del hypergraph_adjacency
    gc.collect()

    # print(clb_cum[10])

    # Each object will now be assigned to its most associated meta-cluster.
    chunks_size = get_chunk_size(N_consensus, 4)
    N_chunks, remainder = divmod(N_samples, chunks_size)
    if N_chunks == 0:
        null_columns = np.where(clb_cum[:].sum(axis=0) == 0)[0]
    else:
        szumsz = np.zeros(0)
        for i in range(N_chunks):
            M = clb_cum[:, i * chunks_size:(i + 1) * chunks_size]
            szumsz = np.append(szumsz, M.sum(axis=0))
        if remainder != 0:
            M = clb_cum[:, N_chunks * chunks_size:N_samples]
            szumsz = np.append(szumsz, M.sum(axis=0))
        null_columns = np.where(szumsz == 0)[0]

    if null_columns.size != 0:
        # print("INFO: Cluster_Ensembles: MCLA: {} objects with all zero associations "
        #       "in 'clb_cum' matrix of meta-clusters.".format(null_columns.size))
        clb_cum[:, null_columns] = np.random.rand(N_consensus,
                                                  null_columns.size)

    random_state = np.random.RandomState()

    tmp = np.zeros(shape=(N_consensus, N_samples))

    chunks_size = get_chunk_size(N_samples, 2)
    N_chunks, remainder = divmod(N_consensus, chunks_size)
    if N_chunks == 0:
        tmp[:] = random_state.rand(N_consensus, N_samples)
    else:
        for i in range(N_chunks):
            tmp[i * chunks_size:(i + 1) * chunks_size] = random_state.rand(
                chunks_size, N_samples)
        if remainder != 0:
            tmp[N_chunks * chunks_size:N_consensus] = random_state.rand(
                remainder, N_samples)

    expr = tables.Expr("clb_cum + (tmp / 10000)")
    expr.set_output(clb_cum)
    expr.eval()

    expr = tables.Expr("abs(tmp)")
    expr.set_output(tmp)
    expr.eval()

    chunks_size = get_chunk_size(N_consensus, 2)
    N_chunks, remainder = divmod(N_samples, chunks_size)
    if N_chunks == 0:
        sum_diag = tmp[:].sum(axis=0)
    else:
        sum_diag = np.empty(0)
        for i in range(N_chunks):
            M = tmp[:, i * chunks_size:(i + 1) * chunks_size]
            sum_diag = np.append(sum_diag, M.sum(axis=0))
        if remainder != 0:
            M = tmp[:, N_chunks * chunks_size:N_samples]
            sum_diag = np.append(sum_diag, M.sum(axis=0))

    inv_sum_diag = np.reciprocal(sum_diag.astype(float))

    if N_chunks == 0:
        clb_cum *= inv_sum_diag
        max_entries = np.amax(clb_cum, axis=0)
    else:
        max_entries = np.zeros(N_samples)
        for i in range(N_chunks):
            clb_cum[:, i * chunks_size:(i + 1) *
                    chunks_size] *= inv_sum_diag[i * chunks_size:(i + 1) *
                                                 chunks_size]
            max_entries[i * chunks_size:(i + 1) * chunks_size] = np.amax(
                clb_cum[:, i * chunks_size:(i + 1) * chunks_size], axis=0)
        if remainder != 0:
            clb_cum[:, N_chunks * chunks_size:N_samples] *= inv_sum_diag[
                N_chunks * chunks_size:N_samples]
            max_entries[N_chunks * chunks_size:N_samples] = np.amax(
                clb_cum[:, N_chunks * chunks_size:N_samples], axis=0)

    cluster_labels = np.zeros(N_samples, dtype=int)
    winner_probabilities = np.zeros(N_samples)

    chunks_size = get_chunk_size(N_samples, 2)
    for i in reversed(range(0, N_consensus, chunks_size)):
        ind = np.where(
            np.tile(max_entries, (
                min(chunks_size, N_consensus - i),
                1)) == clb_cum[i:min(i + chunks_size, N_consensus)])
        cluster_labels[ind[1]] = i + ind[0]
        winner_probabilities[ind[1]] = clb_cum[(ind[0] + i, ind[1])]

    # Done with competing for objects.

    cluster_labels = one_to_max(cluster_labels)

    return cluster_labels
Example No. 13
def pandas_io():
    data = np.random.standard_normal((1000, 5)).round(5)
    # sample data set
    filename = path + 'numbs'
    # query = 'CREATE TABLE numbers (No1 real, No2 real,\
    #     No3 real, No4 real, No5 real)'
    con = sq3.Connection(filename + '.db')
    
    # Don't want to do these every run
    # con.execute(query)
    # con.executemany('INSERT INTO numbers VALUES (?, ?, ?, ?, ?)', data)
    # con.commit()
    temp = con.execute('SELECT * FROM numbers').fetchall()
    print(temp[:2])
    
    query = 'SELECT * FROM numbers WHERE No1 > 0 AND No2 < 0'
    res = np.array(con.execute(query).fetchall()).round(3)
    plt.plot(res[:, 0], res[:, 1], 'ro')
    plt.grid(True); plt.xlim(-0.5, 4.5); plt.ylim(-4.5, 0.5)
    plt.savefig(PNG_PATH + 'query.png', dpi=300)
    plt.close()
    
    data = pd.read_sql('SELECT * FROM numbers', con)
    print(data.head())
    print(data[(data['No1'] > 0) & (data['No2'] < 0)].head())
    
    res = data[['No1', 'No2']][((data['No1'] > 0.5) | (data['No1'] < -0.5))
                     & ((data['No2'] < -1) | (data['No2'] > 1))]
    plt.plot(res.No1, res.No2, 'ro')
    plt.grid(True); plt.axis('tight')
    plt.savefig(PNG_PATH + 'x_scatter.png', dpi=300)
    plt.close()
    
    h5s = pd.HDFStore(filename + '.h5s', 'w')
    h5s['data'] = data
    print(h5s)
    h5s.close()
    
    h5s = pd.HDFStore(filename + '.h5s', 'r')
    temp = h5s['data']
    h5s.close()
    
    np.allclose(np.array(temp), np.array(data))
    
    data.to_csv(filename + '.csv')
    # Remember: matplotlib works directly on pandas/NumPy objects;
    # just call plot() and then savefig()
    a = pd.read_csv(filename + '.csv')[['No1', 'No2',
                                'No3', 'No4']].hist(bins=20)
    plt.plot()
    plt.savefig(PNG_PATH + 'hist.png', dpi=300)
    plt.close()
    
    data[:1000].to_excel(filename + '.xlsx')
    pd.read_excel(filename + '.xlsx', 'Sheet1').cumsum().plot()
    plt.plot()
    plt.savefig(PNG_PATH + 'excel.png', dpi=300)
    plt.close()
    
    filename = path + 'tab.h5'
    h5 = tb.open_file(filename, 'w')
    rows = 1000
    row_des = {
    'Date': tb.StringCol(26, pos=1),
    'No1': tb.IntCol(pos=2),
    'No2': tb.IntCol(pos=3),
    'No3': tb.Float64Col(pos=4),
    'No4': tb.Float64Col(pos=5)
    }
    filters = tb.Filters(complevel=0)  # no compression
    tab = h5.create_table('/', 'ints_floats', row_des,
                      title='Integers and Floats',
                      expectedrows=rows, filters=filters)
    print(tab)
    
    pointer = tab.row
    ran_int = np.random.randint(0, 10000, size=(rows, 2))
    ran_flo = np.random.standard_normal((rows, 2)).round(5)
    for i in range(rows):
        pointer['Date'] = dt.datetime.now()
        pointer['No1'] = ran_int[i, 0]
        pointer['No2'] = ran_int[i, 1] 
        pointer['No3'] = ran_flo[i, 0]
        pointer['No4'] = ran_flo[i, 1] 
        pointer.append()
        # this appends the data and
        # moves the pointer one row forward
    tab.flush()  # flush = commit in sqlite
    print(tab)
    
    
    dty = np.dtype([('Date', 'S26'), ('No1', '<i4'), ('No2', '<i4'),
                                 ('No3', '<f8'), ('No4', '<f8')])
    sarray = np.zeros(len(ran_int), dtype=dty)
    sarray['Date'] = dt.datetime.now()
    sarray['No1'] = ran_int[:, 0]
    sarray['No2'] = ran_int[:, 1]
    sarray['No3'] = ran_flo[:, 0]
    sarray['No4'] = ran_flo[:, 1]
    h5.create_table('/', 'ints_floats_from_array', sarray,
                      title='Integers and Floats',
                      expectedrows=rows, filters=filters)
    print(h5)
    h5.remove_node('/', 'ints_floats_from_array')
    print(tab[:3])
    print(tab[:4]['No4'])
    print(np.sum(tab[:]['No3']))
    print(np.sum(np.sqrt(tab[:]['No1'])))
    plt.hist(tab[:]['No3'], bins=30)
    plt.grid(True)
    print(len(tab[:]['No3']))
    plt.plot()
    plt.savefig(PNG_PATH + 'h5.png', dpi=300)
    plt.close()
    
    res = np.array([(row['No3'], row['No4']) for row in
        tab.where('((No3 < -0.05) | (No3 > 0.05)) \
                 & ((No4 < -0.1) | (No4 > 0.1))')])[::100]
    plt.plot(res.T[0], res.T[1], 'ro')
    plt.grid(True)
    plt.savefig(PNG_PATH + 'h5_x.png', dpi=300)
    plt.close()
    
    values = tab.cols.No3[:]
    print("Max %18.3f" % values.max())
    print("Ave %18.3f" % values.mean())
    print("Min %18.3f" % values.min())
    print("Std %18.3f" % values.std())
    results = [(row['No1'], row['No2']) for row in
               tab.where('((No1 > 9800) | (No1 < 200)) \
                        & ((No2 > 4500) & (No2 < 5500))')]
    for res in results[:4]:
        print(res)
        
    results = [(row['No1'], row['No2']) for row in
           tab.where('(No1 == 1234) & (No2 > 9776)')]
    for res in results:
        print(res)
    
    filename = path + 'tab.h5c'
    h5c = tb.open_file(filename, 'w')
    filters = tb.Filters(complevel=4, complib='blosc')
    tabc = h5c.create_table('/', 'ints_floats', sarray,
                            title='Integers and Floats',
                          expectedrows=rows, filters=filters)
    res = np.array([(row['No3'], row['No4']) for row in
                 tabc.where('((No3 < -0.5) | (No3 > 0.5)) \
                           & ((No4 < -1) | (No4 > 1))')])[::100]
    arr_non = tab.read()
    arr_com = tabc.read()
    h5c.close()
    
    arr_int = h5.create_array('/', 'integers', ran_int)
    arr_flo = h5.create_array('/', 'floats', ran_flo)
    print(h5)
    h5.close()
    
    filename = path + 'array.h5'
    h5 = tb.open_file(filename, 'w')
    
    n = 100
    ear = h5.create_earray(h5.root, 'ear',
                      atom=tb.Float64Atom(),
                      shape=(0, n))
    rand = np.random.standard_normal((n, n))
    for i in range(750):
        ear.append(rand)
    ear.flush()
    print(ear)
    print(ear.size_on_disk)
    out = h5.create_earray(h5.root, 'out',
                      atom=tb.Float64Atom(),
                      shape=(0, n))
    expr = tb.Expr('3 * sin(ear) + sqrt(abs(ear))')
    expr.set_output(out, append_mode=True)
    print(expr.eval())
    imarray = ear.read()
    
    import numexpr as ne
    expr = '3 * sin(imarray) + sqrt(abs(imarray))'
    ne.set_num_threads(16)
    print(ne.evaluate(expr)[0, :10])
    h5.close()
Example No. 14
def compute_up(c, t, **kwargs):
    uservars = dict((col, getattr(t.cols, col)) for col in c.active_columns())
    e = tb.Expr(str(c._expr), uservars=uservars, truediv=True)
    return e.eval()
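Across all of these examples the recurring pattern is the same: open the HDF5 file(s), bind the operands (disk leaves, table columns, or NumPy arrays), build a tb.Expr, optionally set_output() to an on-disk array, and eval(). A compact, self-contained sketch of that pattern (file name and expression are illustrative):

# End-to-end sketch of the tables.Expr pattern used throughout these examples.
import numpy as np
import tables as tb

with tb.open_file("expr_demo.h5", "w") as f:
    n = 1000
    a = f.create_carray(f.root, "a", obj=np.random.rand(n, n))
    b = f.create_carray(f.root, "b", obj=np.random.rand(n, n))
    r = f.create_carray(f.root, "r", atom=tb.Float64Atom(), shape=(n, n))

    e = tb.Expr("2 * a + b**2")   # compiled and evaluated chunk by chunk by numexpr
    e.set_output(r)               # stream the result to the on-disk carray...
    e.eval()                      # ...instead of materializing it in memory

    print(r[0, :5])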