Example #1
def standardize(x, M=None, S=None, REVERSE=None):
    """ Function that standardize the data
        Input:
            x: the data
            M: the mean vector
            V: the standard deviation vector
        Output:
            x: the standardize data
            M: the mean vector
            V: the standard deviation vector
    """
    if not sp.issubdtype(x.dtype, float):
        do_convert = 1
    else:
        do_convert = 0
    if REVERSE is None:
        if M is None:
            M = sp.mean(x, axis=0)
            S = sp.std(x, axis=0)
            if do_convert:
                xs = (x.astype("float") - M) / S
            else:
                xs = (x - M) / S
            return xs, M, S
        else:
            if do_convert:
                xs = (x.astype("float") - M) / S
            else:
                xs = (x - M) / S
            return xs
    else:
        return S * x + M
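A minimal usage sketch for the snippet above (illustrative only): it assumes the usual alias these projects use, `import scipy as sp` with an older SciPy that still re-exports the NumPy array routines; `import numpy as sp` behaves the same for every call used here.

import numpy as sp  # stand-in for the snippets' `import scipy as sp`

X = sp.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
Xs, M, S = standardize(X)                         # fit: per-column mean and std are computed and returned
Ys = standardize(sp.array([[4.0, 40.0]]), M, S)   # reuse the fitted parameters on new data
X_back = standardize(Xs, M, S, REVERSE=True)      # invert the transform: S * x + M
assert sp.allclose(X_back, X)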
Example #2
def scale(x, M=None, m=None, REVERSE=None):
    """ Function that standardize the data
        Input:
            x: the data
            M: the Max vector
            m: the Min vector
        Output:
            x: the standardize data
            M: the Max vector
            m: the Min vector
    """
    if not sp.issubdtype(x.dtype, float):
        do_convert = 1
    else:
        do_convert = 0
    if REVERSE is None:
        if M is None:
            M = sp.amax(x, axis=0)
            m = sp.amin(x, axis=0)
            if do_convert:
                xs = 2 * (x.astype("float") - m) / (M - m) - 1
            else:
                xs = 2 * (x - m) / (M - m) - 1
            return xs, M, m
        else:
            if do_convert:
                xs = 2 * (x.astype("float") - m) / (M - m) - 1
            else:
                xs = 2 * (x - m) / (M - m) - 1
            return xs
    else:
        return (1 + x) / 2 * (M - m) + m
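A short usage sketch under the same assumptions as above (`sp` aliasing SciPy/NumPy, data purely illustrative). Note that the fit branch divides by M - m, so constant columns would need the guard shown in the class-method variant below (Example #4).

X = sp.array([[0.0, 5.0], [2.0, 10.0], [4.0, 15.0]])
Xs, M, m = scale(X)                        # fit: per-column max/min, data mapped to [-1, 1]
Ys = scale(sp.array([[1.0, 7.5]]), M, m)   # reuse the fitted range on new data
X_back = scale(Xs, M, m, REVERSE=True)     # invert: (1 + x) / 2 * (M - m) + m
assert sp.allclose(X_back, X)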
Example #3
def scale(x, M=None, m=None, REVERSE=None):
    ''' Function that scales the data to the range [-1, 1]
        Input:
            x: the data
            M: the Max vector
            m: the Min vector
            REVERSE: if not None, undo the scaling
        Output:
            xs: the scaled data
            M: the Max vector (returned only when computed here)
            m: the Min vector (returned only when computed here)
    '''
    if not sp.issubdtype(x.dtype, float):
        do_convert = 1
    else:
        do_convert = 0
    if REVERSE is None:
        if M is None:
            M = sp.amax(x, axis=0)
            m = sp.amin(x, axis=0)
            if do_convert:
                xs = 2 * (x.astype('float') - m) / (M - m) - 1
            else:
                xs = 2 * (x - m) / (M - m) - 1
            return xs, M, m
        else:
            if do_convert:
                xs = 2 * (x.astype('float') - m) / (M - m) - 1
            else:
                xs = 2 * (x - m) / (M - m) - 1
            return xs
    else:
        return (1 + x) / 2 * (M - m) + m
Example #4
    def scale(self, x, M=None, m=None):  # TODO:  DO IN PLACE SCALING
        """!@brief Function that standardize the data
        
            Input:
                x: the data
                M: the Max vector
                m: the Min vector
            Output:
                x: the standardize data
                M: the Max vector
                m: the Min vector
        """
        [n, d] = x.shape
        if not sp.issubdtype(x.dtype, float):
            x = x.astype('float')

        # Initialization of the output
        xs = sp.empty_like(x)

        # get the parameters of the scaling
        if M is None:
            M, m = sp.amax(x, axis=0), sp.amin(x, axis=0)

        den = M - m
        for i in range(d):
            if den[i] != 0:
                xs[:, i] = 2 * (x[:, i] - m[i]) / den[i] - 1
            else:
                xs[:, i] = x[:, i]

        return xs
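The per-column loop exists to guard against constant columns, where M - m is zero. A sketch of that behaviour, assuming `model` is an instance of the (unshown) class that owns this method; the data is illustrative.

X = sp.array([[1.0, 7.0], [3.0, 7.0], [5.0, 7.0]])  # second column has zero range
Xs = model.scale(X)
assert sp.allclose(Xs[:, 0], sp.array([-1.0, 0.0, 1.0]))  # scaled to [-1, 1]
assert sp.allclose(Xs[:, 1], X[:, 1])                     # zero-range column passed through unchanged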
Example #5
def standardize(x, M=None, S=None, REVERSE=None):
    ''' Function that standardizes the data
        Input:
            x: the data
            M: the mean vector
            S: the standard deviation vector
            REVERSE: if not None, undo the standardization
        Output:
            xs: the standardized data
            M: the mean vector (returned only when computed here)
            S: the standard deviation vector (returned only when computed here)
    '''
    if not sp.issubdtype(x.dtype, float):
        do_convert = 1
    else:
        do_convert = 0
    if REVERSE is None:
        if M is None:
            M = sp.mean(x, axis=0)
            S = sp.std(x, axis=0)
            if do_convert:
                xs = (x.astype('float') - M) / S
            else:
                xs = (x - M) / S
            return xs, M, S
        else:
            if do_convert:
                xs = (x.astype('float') - M) / S
            else:
                xs = (x - M) / S
            return xs
    else:
        return S * x + M
Example #6
    def scale(self, x, M=None, m=None):  # TODO:  DO IN PLACE SCALING
        """!@brief Function that scales the data to the range [-1, 1]

            Input:
                x: the data
                M: the Max vector
                m: the Min vector
            Output:
                xs: the scaled data (columns with zero range are returned unchanged)
        """
        [n, d] = x.shape
        if not sp.issubdtype(x.dtype, float):
            x = x.astype('float')

        # Initialization of the output
        xs = sp.empty_like(x)

        # get the parameters of the scaling
        if M is None:
            M, m = sp.amax(x, axis=0), sp.amin(x, axis=0)

        den = M - m
        for i in range(d):
            if den[i] != 0:
                xs[:, i] = 2 * (x[:, i] - m[i]) / den[i] - 1
            else:
                xs[:, i] = x[:, i]

        return xs
Example #7
def _compare_gene(a, b):
    if sp.issubdtype(a.strain.dtype, sp.str_):
        _astrain = _codeUTF8(a.strain)
    else:
        _astrain = a.strain
    if sp.issubdtype(b.strain.dtype, sp.str_):
        _bstrain = _codeUTF8(b.strain)
    else:
        _bstrain = b.strain

    return ((a.chr == b.chr) &
            (a.strand == b.strand) &
            (sp.all(a.exons1 == b.exons1)) &
            (sp.all(a.exons2 == b.exons2)) &
            (sp.all(_astrain == _bstrain)) &
            (a.event_type == b.event_type) &
            (a.gene_idx == b.gene_idx) &
            (a.num_detected == b.num_detected))
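The function compares two event records attribute by attribute, byte-encoding the strain arrays first when they hold unicode strings (`_codeUTF8` is the project's helper and is not shown here). A minimal sketch with hypothetical records; byte strains are used so the `_codeUTF8` branch is not exercised.

from types import SimpleNamespace
import numpy as sp  # stand-in for the snippets' scipy alias

a = SimpleNamespace(chr='chr1', strand='+',
                    exons1=sp.array([[10, 20]]), exons2=sp.array([[30, 40]]),
                    strain=sp.array([b's1', b's2']),  # bytes dtype, so _codeUTF8 is skipped
                    event_type='exon_skip', gene_idx=3, num_detected=2)
b = SimpleNamespace(**vars(a))  # identical record
assert _compare_gene(a, b)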
Example #8
def _compare_gene(a, b):
    if sp.issubdtype(a.strain.dtype, sp.str_):
        _astrain = _codeUTF8(a.strain)
    else:
        _astrain = a.strain
    if sp.issubdtype(b.strain.dtype, sp.str_):
        _bstrain = _codeUTF8(b.strain)
    else:
        _bstrain = b.strain

    return ((a.chr == b.chr) &
            (a.strand == b.strand) &
            (sp.all(a.exons1 == b.exons1)) &
            (sp.all(a.exons2 == b.exons2)) &
            (sp.all(_astrain == _bstrain)) &
            (a.event_type == b.event_type) &
            (a.gene_idx == b.gene_idx) &
            (a.num_detected == b.num_detected))
Example #9
    assert sp.array_equal(sp.array([1, 2, 3], dtype=sp.int_),
                          sp.int_([1, 2, 3]))

    # Arrays with different dtypes but equal values compare as equal

    assert sp.array_equal(sp.array([1, 2, 3], dtype=sp.int_),
                          sp.array([1, 2, 3], dtype=sp.float_))

    # Get type

    v = sp.array([1, 2], dtype=sp.int32)
    assert v.dtype == sp.int32

    # Subtype:

    sp.issubdtype(sp.int32, sp.int_)

    # Convert type

    v = sp.array([1, 2], dtype=sp.int32)
    vf = v.astype(sp.float_)
    assert vf.dtype == sp.float_

    ### type_ vs dtype

    # `type_` is the same as using the dtype arg.

    # That said, *always use the sp.array* form without a dtype for uniformity,

    # and if you need an explicit type, use the dtype arg.
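A small extension of the notes above, assuming the abstract NumPy scalar types (integer, floating) that sp re-exports; issubdtype is usually checked against these abstract parents rather than a concrete type.

    assert sp.issubdtype(sp.int32, sp.integer)       # concrete type vs. abstract parent
    assert sp.issubdtype(sp.float64, sp.floating)
    assert not sp.issubdtype(sp.int32, sp.floating)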
Example #10
def count_graph_coverage_wrapper(fname_in,
                                 fname_out,
                                 options,
                                 sample_idx=None,
                                 qmode='all'):

    (genes, inserted) = pickle.load(open(fname_in, 'rb'))
    for g in genes:
        g.from_sparse()

    if genes[0].segmentgraph is None or genes[0].segmentgraph.is_empty():
        for g in genes:
            g.segmentgraph = Segmentgraph(g)
            g.to_sparse()
        pickle.dump((genes, inserted), open(fname_in, 'wb'), -1)
        for g in genes:
            g.from_sparse()

    counts = dict()
    counts['segments'] = []
    counts['seg_pos'] = []
    counts['gene_ids_segs'] = []
    counts['edges'] = []
    counts['gene_ids_edges'] = []
    counts['seg_len'] = sp.hstack([
        x.segmentgraph.segments[1, :] - x.segmentgraph.segments[0, :]
        for x in genes
    ]).T
    counts['gene_names'] = sp.array([x.name for x in genes], dtype='str')

    if not options.pyproc:
        if options.merge == 'single':
            print('\nprocessing %s' % (options.samples[sample_idx]))
            counts_tmp = count_graph_coverage(genes,
                                              options.bam_fnames[sample_idx],
                                              options)
        elif options.merge == 'merge_graphs' and qmode == 'single':
            print(
                '\nquantifying merged graph in single mode (first file only) on %s'
                % options.samples[0])
            counts_tmp = count_graph_coverage(genes, options.bam_fnames[0],
                                              options)
        else:
            for s_idx in range(options.strains.shape[0]):
                print('\n%i/%i' % (s_idx + 1, options.strains.shape[0]))
                if s_idx == 0:
                    counts_tmp = count_graph_coverage(
                        genes, options.bam_fnames[s_idx], options)
                else:
                    counts_tmp = sp.r_[
                        sp.atleast_2d(counts_tmp),
                        count_graph_coverage(genes, options.bam_fnames[s_idx],
                                             options)]

        for c in range(counts_tmp.shape[1]):
            counts['segments'].append(
                sp.hstack(
                    [sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]))
            counts['seg_pos'].append(
                sp.hstack(
                    [sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]))
            counts['gene_ids_segs'].append(
                sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1),
                        dtype='int') * c)
            tmp = [
                sp.atleast_2d(x.edges) for x in counts_tmp[:, c]
                if x.edges.shape[0] > 0
            ]
            if len(tmp) == 0:
                continue
            tmp = sp.hstack(tmp)
            if tmp.shape[0] > 0:
                counts['edges'].append(
                    sp.c_[tmp[:, 0], tmp[:, sp.arange(1, tmp.shape[1], 2)]])
                counts['gene_ids_edges'].append(
                    sp.ones((tmp.shape[0], 1), dtype='int') * c)

        ### write result data to hdf5
        for key in counts:
            counts[key] = sp.vstack(
                counts[key]) if len(counts[key]) > 0 else counts[key]
        counts['edge_idx'] = counts['edges'][:, 0] if len(
            counts['edges']) > 0 else sp.array([])
        counts['edges'] = counts['edges'][:, 1:] if len(
            counts['edges']) > 0 else sp.array([])
        h5fid = h5py.File(fname_out, 'w')
        h5fid.create_dataset(name='strains', data=codeUTF8(options.strains))
        for key in counts:
            if sp.issubdtype(counts[key].dtype, sp.str_):
                h5fid.create_dataset(name=key, data=codeUTF8(counts[key]))
            else:
                h5fid.create_dataset(name=key, data=counts[key])
        h5fid.close()
    else:
        ### have an adaptive chunk size, that takes into account the number of strains (take as many genes as it takes to have ~10K strains)
        if options.sparse_bam:
            chunksize = int(max(1, math.floor(1000000 / len(options.strains))))
        else:
            chunksize = int(max(1, math.floor(100000 / len(options.strains))))

        jobinfo = []

        PAR = dict()
        PAR['options'] = options
        if options.merge == 'single':
            PAR['options'].bam_fnames = PAR['options'].bam_fnames[sample_idx]
            PAR['options'].samples = PAR['options'].samples[sample_idx]
            PAR['options'].strains = PAR['options'].strains[sample_idx]

        #s_idx = sp.argsort([x.chr for x in genes]) # TODO
        s_idx = sp.arange(genes.shape[0])
        for c_idx in range(0, s_idx.shape[0], chunksize):
            cc_idx = min(s_idx.shape[0], c_idx + chunksize)
            fn = re.sub(r'\.hdf5$', '',
                        fname_out) + '.chunk_%i_%i.pickle' % (c_idx, cc_idx)
            if os.path.exists(fn):
                continue
            else:
                print('submitting chunk %i to %i (%i)' %
                      (c_idx, cc_idx, s_idx.shape[0]))
                PAR['genes'] = genes[s_idx][c_idx:cc_idx]
                for gg in PAR['genes']:
                    gg.to_sparse()
                PAR['fn_bam'] = options.bam_fnames
                PAR['fn_out'] = fn
                PAR['options'] = options
                jobinfo.append(
                    rp.rproc('count_graph_coverage', PAR, 15000,
                             options.options_rproc, 60 * 48))

        rp.rproc_wait(jobinfo, 30, 1.0, -1)
        del genes

        ### merge results from count chunks
        if options.verbose:
            print('\nCollecting count data from chunks ...\n')
            print('writing data to %s' % fname_out)

        ### write data to hdf5 continuously
        h5fid = h5py.File(fname_out, 'w')
        h5fid.create_dataset(name='gene_names',
                             data=codeUTF8(counts['gene_names']))
        h5fid.create_dataset(name='seg_len', data=counts['seg_len'])
        h5fid.create_dataset(name='strains', data=codeUTF8(options.strains))
        for c_idx in range(0, s_idx.shape[0], chunksize):
            cc_idx = min(s_idx.shape[0], c_idx + chunksize)
            if options.verbose:
                print('collecting chunk %i-%i (%i)' %
                      (c_idx, cc_idx, s_idx.shape[0]))
            fn = re.sub(r'\.hdf5$', '',
                        fname_out) + '.chunk_%i_%i.pickle' % (c_idx, cc_idx)
            if not os.path.exists(fn):
                print(
                    'ERROR: Not all chunks in counting graph coverage completed!',
                    file=sys.stderr)
                sys.exit(1)
            else:
                counts_tmp = pickle.load(open(fn, 'rb'))
                for c in range(counts_tmp.shape[1]):
                    if 'segments' in h5fid:
                        appendToHDF5(
                            h5fid,
                            sp.hstack([
                                sp.atleast_2d(x.segments).T
                                for x in counts_tmp[:, c]
                            ]), 'segments')
                        appendToHDF5(
                            h5fid,
                            sp.hstack([
                                sp.atleast_2d(x.seg_pos).T
                                for x in counts_tmp[:, c]
                            ]), 'seg_pos')
                        appendToHDF5(
                            h5fid,
                            sp.ones((sp.atleast_2d(
                                counts_tmp[0, c].seg_pos).shape[1], 1),
                                    dtype='int') * (s_idx[c_idx + c]),
                            'gene_ids_segs')
                    else:
                        h5fid.create_dataset(name='segments',
                                             data=sp.hstack([
                                                 sp.atleast_2d(x.segments).T
                                                 for x in counts_tmp[:, c]
                                             ]),
                                             chunks=True,
                                             compression='gzip',
                                             maxshape=(None,
                                                       len(options.strains)))
                        h5fid.create_dataset(name='seg_pos',
                                             data=sp.hstack([
                                                 sp.atleast_2d(x.seg_pos).T
                                                 for x in counts_tmp[:, c]
                                             ]),
                                             chunks=True,
                                             compression='gzip',
                                             maxshape=(None,
                                                       len(options.strains)))
                        h5fid.create_dataset(
                            name='gene_ids_segs',
                            data=sp.ones((sp.atleast_2d(
                                counts_tmp[0, c].seg_pos).shape[1], 1),
                                         dtype='int') * (s_idx[c_idx + c]),
                            chunks=True,
                            compression='gzip',
                            maxshape=(None, 1))

                    tmp = [
                        sp.atleast_2d(x.edges) for x in counts_tmp[:, c]
                        if x.edges.shape[0] > 0
                    ]
                    if len(tmp) == 0:
                        continue
                    tmp = sp.hstack(tmp)
                    if tmp.shape[0] > 0:
                        if 'edges' in h5fid:
                            appendToHDF5(h5fid,
                                         tmp[:,
                                             sp.arange(1, tmp.shape[1], 2)],
                                         'edges')
                            appendToHDF5(h5fid, tmp[:, 0], 'edge_idx')
                            appendToHDF5(
                                h5fid,
                                sp.ones((tmp.shape[0], 1), dtype='int') *
                                (s_idx[c_idx + c]), 'gene_ids_edges')
                        else:
                            h5fid.create_dataset(
                                name='edges',
                                data=tmp[:, sp.arange(1, tmp.shape[1], 2)],
                                chunks=True,
                                compression='gzip',
                                maxshape=(None, tmp.shape[1] // 2))
                            h5fid.create_dataset(name='edge_idx',
                                                 data=tmp[:, 0],
                                                 chunks=True,
                                                 compression='gzip',
                                                 maxshape=(None, ))
                            h5fid.create_dataset(
                                name='gene_ids_edges',
                                data=sp.ones((tmp.shape[0], 1), dtype='int') *
                                (s_idx[c_idx + c]),
                                chunks=True,
                                compression='gzip',
                                maxshape=(None, 1))
                del tmp, counts_tmp
        h5fid.close()
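For reference, a hedged sketch of reading back the HDF5 file this function writes; the dataset names are the ones created above, while the file name used here is illustrative.

import h5py

with h5py.File('counts.hdf5', 'r') as h5:      # illustrative path; pass the real fname_out
    seg_counts = h5['segments'][:]             # rows are segments, columns are strains
    seg_genes = h5['gene_ids_segs'][:]         # gene index for each segment row
    strains = h5['strains'][:]                 # UTF-8 encoded strain names
    edges = h5['edges'][:] if 'edges' in h5 else None   # edge counts, if any were written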
Example #11
    # Arrays with different dtypes but equal values compare as equal

    assert sp.array_equal(
        sp.array([1, 2, 3], dtype=sp.int_),
        sp.array([1, 2, 3], dtype=sp.float_)
    )

    # Get type

    v = sp.array([1, 2], dtype=sp.int32)
    assert v.dtype == sp.int32

    # Subtype:

    sp.issubdtype(sp.int32, sp.int_)

    # Convert type

    v = sp.array([1, 2], dtype=sp.int32)
    vf = v.astype(sp.float_)
    assert vf.dtype == sp.float_

    ### type_ vs dtype

    # `type_` is the same as using the dtype arg.

    # That said, *always use the sp.array* form without a dtype for uniformity,

    # and if you need an explicit type, use the dtype arg.