Code example #1
    def _test_dna(self, chromo):
        pos = self.pos[self.chromo == chromo.encode()]
        dna = self.data['/inputs/dna'][self.chromo == chromo.encode()]
        dna_wlen = dna.shape[1]
        center = dna_wlen // 2

        dna_seq = read_chromo(os.path.join(self.data_path, '../dna_db'),
                              chromo)

        idxs = np.linspace(0, len(pos) - 1, 100).astype(np.int32)
        for idx in idxs:
            p = pos[idx] - 1
            assert dna_seq[p:(p + 2)] == 'CG'
            assert dna[idx, center] == 3
            assert dna[idx, center + 1] == 2
            assert dna[idx, center + 10] == CHAR_TO_INT[dna_seq[p + 10]]
            assert dna[idx, center - 10] == CHAR_TO_INT[dna_seq[p - 10]]
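
The assertions above pin down only two entries of the base encoding: the 'C' at the window center maps to 3 and the following 'G' to 2. A minimal sketch of a CHAR_TO_INT-style lookup and window encoder that is consistent with those checks (the table used by the project itself may differ) could look like this:

import numpy as np

# Hypothetical base-to-integer table; only 'C' -> 3 and 'G' -> 2 are
# confirmed by the assertions in the test above.
CHAR_TO_INT = {'A': 0, 'T': 1, 'G': 2, 'C': 3, 'N': 4}

def encode_window(seq, center, wlen):
    # Encode a DNA window of odd length wlen centered at `center` (0-based).
    delta = wlen // 2
    window = seq[center - delta:center + delta + 1]
    return np.array([CHAR_TO_INT[c] for c in window], dtype=np.int8)

# For a CpG site 'CG' starting at 0-based position p, encode_window(seq, p, wlen)
# yields encoded[delta] == 3 ('C') and encoded[delta + 1] == 2 ('G').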
Code example #2
File: dcpg_data.py  Project: JieYang031/deepcpg
    def main(self, name, opts):
        if opts.seed is not None:
            np.random.seed(opts.seed)

        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        log.debug(opts)

        # Check input arguments
        if not opts.cpg_profiles:
            if not (opts.pos_file or opts.dna_files):
                raise ValueError('Position table and DNA database expected!')

        if opts.dna_wlen and opts.dna_wlen % 2 == 0:
            raise ValueError('--dna_wlen must be odd!')
        if opts.cpg_wlen and opts.cpg_wlen % 2 != 0:
            raise ValueError('--cpg_wlen must be even!')

        # Parse functions for computing output statistics
        cpg_stats_meta = None
        win_stats_meta = None
        if opts.cpg_stats:
            cpg_stats_meta = get_stats_meta(opts.cpg_stats)
        if opts.win_stats:
            win_stats_meta = get_stats_meta(opts.win_stats)

        make_dir(opts.out_dir)
        outputs = OrderedDict()

        # Read single-cell profiles if provided
        if opts.cpg_profiles:
            log.info('Reading CpG profiles ...')
            outputs['cpg'] = read_cpg_profiles(
                opts.cpg_profiles,
                chromos=opts.chromos,
                nb_sample=opts.nb_sample,
                nb_sample_chromo=opts.nb_sample_chromo,
                log=log.info)

        # Create table with unique positions
        if opts.pos_file:  # the position file provides the CpG positions to be predicted
            # Read positions from file
            log.info('Reading position table ...')
            pos_table = pd.read_table(opts.pos_file,
                                      usecols=[0, 1],
                                      dtype={
                                          0: str,
                                          1: np.int32
                                      },
                                      header=None,
                                      comment='#')
            pos_table.columns = ['chromo', 'pos']
            pos_table['chromo'] = dat.format_chromo(pos_table['chromo'])
            pos_table = prepro_pos_table(pos_table)
        else:
            # Extract positions from the profiles if no position file is given:
            # predict every position observed in at least one cell.
            pos_tables = []
            for cpg_table in list(outputs['cpg'].values()):
                pos_tables.append(cpg_table[['chromo', 'pos']])
            pos_table = prepro_pos_table(pos_tables)

        if opts.chromos:
            pos_table = pos_table.loc[pos_table.chromo.isin(opts.chromos)]
        if opts.nb_sample_chromo:
            pos_table = dat.sample_from_chromo(pos_table,
                                               opts.nb_sample_chromo)
        if opts.nb_sample:
            pos_table = pos_table.iloc[:opts.nb_sample]

        log.info('%d samples' % len(pos_table))

        make_dir(opts.out_dir)

        # Iterate over chromosomes
        # ------------------------
        for chromo in pos_table.chromo.unique():
            log.info('-' * 80)
            log.info('Chromosome %s ...' % (chromo))
            idx = pos_table.chromo == chromo  # boolean mask: rows of this chromosome
            chromo_pos = pos_table.loc[idx].pos.values  # 1D numpy array of positions
            chromo_outputs = OrderedDict()

            if 'cpg' in outputs:
                # Concatenate CpG tables into single nb_site x nb_output matrix
                chromo_outputs['cpg'] = map_cpg_tables(outputs['cpg'], chromo,
                                                       chromo_pos)
                # chromo_outputs['cpg'] holds one array per sample, i.e. the CpG
                # states mapped onto the target positions, e.g.
                # OrderedDict([('BS27_1_SER', array([1, 1, 1, ..., 1, 1, 0], dtype=int8)),
                #              ('BS27_3_SER', array([-1, 1, 1, ..., 1, -1, -1], dtype=int8))])
                chromo_outputs['cpg_mat'] = np.vstack(
                    list(chromo_outputs['cpg'].values())).T
                # np.vstack stacks the per-sample arrays; the transpose yields a
                # nb_site x nb_sample matrix, e.g. shape (402166, 2) for chr1 with
                # the two samples BS27_1_SER and BS27_3_SER
                assert len(chromo_outputs['cpg_mat']) == len(chromo_pos)

            if 'cpg_mat' in chromo_outputs and opts.cpg_cov:
                cov = np.sum(chromo_outputs['cpg_mat'] != dat.CPG_NAN, axis=1)
                assert np.all(cov >= 1)
                idx = cov >= opts.cpg_cov
                tmp = '%s sites matched minimum coverage filter'
                tmp %= format_out_of(idx.sum(), len(idx))
                log.info(tmp)
                if idx.sum() == 0:
                    continue

                chromo_pos = chromo_pos[idx]
                chromo_outputs = select_dict(chromo_outputs, idx)

            # Read DNA of chromosome
            chromo_dna = None
            if opts.dna_files:  # read only the sequence of the current chromosome
                chromo_dna = fasta.read_chromo(opts.dna_files, chromo)
                # chromo_dna is a plain string, e.g. len == 195471971 for chr1

            annos = None
            if opts.anno_files:
                log.info('Annotating CpG sites ...')
                annos = dict()
                for anno_file in opts.anno_files:
                    name = split_ext(anno_file)
                    annos[name] = annotate(anno_file, chromo, chromo_pos)

            # Iterate over chunks
            # -------------------
            nb_chunk = int(np.ceil(len(chromo_pos) / opts.chunk_size))
            for chunk in range(nb_chunk):
                log.info('Chunk \t%d / %d' % (chunk + 1, nb_chunk))
                chunk_start = chunk * opts.chunk_size
                chunk_end = min(len(chromo_pos), chunk_start + opts.chunk_size)
                chunk_idx = slice(chunk_start, chunk_end)
                chunk_pos = chromo_pos[chunk_idx]

                # OrderedDict with every array sliced to the current chunk
                chunk_outputs = select_dict(chromo_outputs, chunk_idx)

                filename = 'c%s_%06d-%06d.h5' % (chromo, chunk_start,
                                                 chunk_end)
                filename = os.path.join(opts.out_dir, filename)
                chunk_file = h5.File(filename, 'w')

                # Write positions
                # h5py dataset of fixed-length byte strings, shape (32768,) by default
                chunk_file.create_dataset('chromo', shape=(len(chunk_pos),),
                                          dtype='S2')
                # fill every entry with the encoded chromosome name
                chunk_file['chromo'][:] = chromo.encode()
                chunk_file.create_dataset('pos',
                                          data=chunk_pos,
                                          dtype=np.int32)
                # chunk_file['pos'].shape == (32768,), i.e. the default chunk_size

                if len(chunk_outputs):  # e.g. 2 entries: 'cpg' and 'cpg_mat'
                    out_group = chunk_file.create_group('outputs')
                    # out_group is an h5py Group that is still empty at this point

                # Write cpg profiles
                if 'cpg' in chunk_outputs:
                    for name, value in six.iteritems(chunk_outputs['cpg']):
                        # name is the sample name, e.g. 'BS27_1_SER' or 'BS27_3_SER';
                        # value is a numpy array of shape (32768,)
                        assert len(value) == len(chunk_pos)
                        # Round continuous values
                        out_group.create_dataset('cpg/%s' % name,
                                                 data=value.round(),
                                                 dtype=np.int8,
                                                 compression='gzip')
                        # out_group now contains a 'cpg' subgroup with one dataset
                        # per sample, e.g. 'BS27_1_SER' and 'BS27_3_SER'

                    # Compute and write statistics
                    if cpg_stats_meta is not None:
                        log.info('Computing per CpG statistics ...')
                        cpg_mat = np.ma.masked_values(chunk_outputs['cpg_mat'],
                                                      dat.CPG_NAN)
                        #cpg_mat.shape=(32768, 2)
                        mask = np.sum(~cpg_mat.mask, axis=1)
                        mask = mask < opts.cpg_stats_cov
                        for name, fun in six.iteritems(cpg_stats_meta):
                            stat = fun[0](cpg_mat).data.astype(fun[1])
                            stat[mask] = dat.CPG_NAN
                            assert len(stat) == len(chunk_pos)
                            out_group.create_dataset('cpg_stats/%s' % name,
                                                     data=stat,
                                                     dtype=fun[1],
                                                     compression='gzip')

                # HDF5 layout at this point, as listed by chunk_file.visit(print):
                #   chromo
                #   outputs
                #   outputs/cpg
                #   outputs/cpg/BS27_1_SER
                #   outputs/cpg/BS27_3_SER
                #   pos

                # Write input features
                in_group = chunk_file.create_group('inputs')

                # DNA windows
                if chromo_dna:
                    log.info('Extracting DNA sequence windows ...')
                    dna_wins = extract_seq_windows(chromo_dna,
                                                   pos=chunk_pos,
                                                   wlen=opts.dna_wlen)
                    # given the chromosome sequence (chromo_dna), the target positions
                    # (chunk_pos) and wlen=1001, this returns a numpy array of shape
                    # (32768, 1001) with bases encoded as integers rather than characters
                    assert len(dna_wins) == len(chunk_pos)
                    in_group.create_dataset('dna',
                                            data=dna_wins,
                                            dtype=np.int8,
                                            compression='gzip')
                    # in_group now contains a single dataset: 'dna'

                # CpG neighbors
                if opts.cpg_wlen:
                    log.info('Extracting CpG neighbors ...')
                    cpg_ext = fext.KnnCpgFeatureExtractor(opts.cpg_wlen // 2)
                    context_group = in_group.create_group('cpg')
                    # outputs['cpg'], since neighboring CpG sites might lie
                    # outside chunk borders and un-mapped values are needed
                    for name, cpg_table in six.iteritems(outputs['cpg']):
                        #name="BS27_1_SER" and "BS27_3_SER"
                        #cpg_table = numpy array, with three columns information for each input sample.
                        cpg_table = cpg_table.loc[cpg_table.chromo == chromo]
                        state, dist = cpg_ext.extract(
                            chunk_pos, cpg_table.pos.values,
                            cpg_table.value.values
                        )  #extract the cpg distance and state with wlen
                        nan = np.isnan(state)
                        state[
                            nan] = dat.CPG_NAN  #set nan value as -1, which means unknown
                        dist[nan] = dat.CPG_NAN
                        # States can be binary (np.int8) or continuous
                        # (np.float32).
                        state = state.astype(cpg_table.value.dtype,
                                             copy=False)  #set data type
                        dist = dist.astype(np.float32, copy=False)

                        assert len(state) == len(chunk_pos)
                        assert len(dist) == len(chunk_pos)
                        assert np.all((dist > 0) | (dist == dat.CPG_NAN))

                        group = context_group.create_group(name)
                        group.create_dataset('state',
                                             data=state,
                                             compression='gzip')
                        group.create_dataset('dist',
                                             data=dist,
                                             compression='gzip')
                        #list(group) = ['state','dist']

                if win_stats_meta is not None and opts.cpg_wlen:
                    log.info('Computing window-based statistics ...')
                    states = []
                    dists = []
                    cpg_states = []
                    cpg_group = out_group['cpg']
                    context_group = in_group['cpg']
                    for output_name in six.iterkeys(cpg_group):
                        state = context_group[output_name]['state'].value
                        states.append(np.expand_dims(state, 2))
                        dist = context_group[output_name]['dist'].value
                        dists.append(np.expand_dims(dist, 2))
                        cpg_states.append(cpg_group[output_name].value)
                    # samples x outputs x cpg_wlen
                    states = np.swapaxes(np.concatenate(states, axis=2), 1, 2)
                    dists = np.swapaxes(np.concatenate(dists, axis=2), 1, 2)
                    cpg_states = np.expand_dims(np.vstack(cpg_states).T, 2)
                    cpg_dists = np.zeros_like(cpg_states)
                    states = np.concatenate([states, cpg_states], axis=2)
                    dists = np.concatenate([dists, cpg_dists], axis=2)

                    for wlen in opts.win_stats_wlen:
                        idx = (states == dat.CPG_NAN) | (dists > wlen // 2)
                        states_wlen = np.ma.masked_array(states, idx)
                        group = out_group.create_group('win_stats/%d' % wlen)
                        for name, fun in six.iteritems(win_stats_meta):
                            stat = fun[0](states_wlen)
                            if hasattr(stat, 'mask'):
                                idx = stat.mask
                                stat = stat.data
                                if np.sum(idx):
                                    stat[idx] = dat.CPG_NAN
                            group.create_dataset(name,
                                                 data=stat,
                                                 dtype=fun[1],
                                                 compression='gzip')

                if annos:
                    log.info('Adding annotations ...')
                    group = in_group.create_group('annos')
                    for name, anno in six.iteritems(annos):
                        group.create_dataset(name,
                                             data=anno[chunk_idx],
                                             dtype='int8',
                                             compression='gzip')

                chunk_file.close()

        log.info('Done!')
        return 0
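
The HDF5 layout sketched in the comments above can be verified by reading a chunk file back with h5py. A minimal sketch, assuming a file written by the loop above (the path and sample name are placeholders; the dataset names follow directly from the create_dataset calls):

import h5py as h5

with h5.File('./data/c1_000000-032768.h5', 'r') as chunk_file:
    chunk_file.visit(print)                  # chromo, inputs, inputs/dna, outputs, outputs/cpg/..., pos
    pos = chunk_file['pos'][:]               # (chunk_size,) int32 positions
    dna = chunk_file['/inputs/dna'][:]       # (chunk_size, dna_wlen) int8 DNA windows
    cpg = chunk_file['/outputs/cpg/BS27_1_SER'][:]  # per-sample CpG states
    print(pos.shape, dna.shape, cpg.shape)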
Code example #3
    def main(self, name, opts):
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        log.debug(opts)

        # Check input arguments
        if not (opts.cpg_profiles or opts.bulk_profiles):
            if not (opts.pos_file or opts.dna_files):
                raise ValueError('Position table and DNA database expected!')

        if opts.dna_wlen and opts.dna_wlen % 2 == 0:
            raise ValueError('--dna_wlen must be odd!')
        if opts.cpg_wlen and opts.cpg_wlen % 2 != 0:
            raise ValueError('--cpg_wlen must be even!')

        # Parse functions for computing output statistics
        cpg_stats_meta = None
        win_stats_meta = None
        if opts.stats:
            cpg_stats_meta = get_stats_meta(opts.stats)
        if opts.win_stats:
            win_stats_meta = get_stats_meta(opts.win_stats)

        make_dir(opts.out_dir)
        outputs = OrderedDict()

        # Read single-cell profiles if provided
        if opts.cpg_profiles:
            log.info('Reading single-cell profiles ...')
            outputs['cpg'] = read_cpg_profiles(opts.cpg_profiles,
                                               chromos=opts.chromos,
                                               nb_sample=opts.nb_sample)

        if opts.bulk_profiles:
            log.info('Reading bulk profiles ...')
            outputs['bulk'] = read_cpg_profiles(opts.bulk_profiles,
                                                chromos=opts.chromos,
                                                nb_sample=opts.nb_sample,
                                                round=False)

        # Create table with unique positions
        if opts.pos_file:
            # Read positions from file
            log.info('Reading position table ...')
            pos_table = pd.read_table(opts.pos_file,
                                      usecols=[0, 1],
                                      dtype={
                                          0: str,
                                          1: np.int32
                                      },
                                      header=None,
                                      comment='#')
            pos_table.columns = ['chromo', 'pos']
            pos_table['chromo'] = dat.format_chromo(pos_table['chromo'])
            pos_table = prepro_pos_table(pos_table)
        else:
            # Extract positions from profiles
            pos_tables = []
            for cpg_table in list(outputs['cpg'].values()):
                pos_tables.append(cpg_table[['chromo', 'pos']])
            pos_table = prepro_pos_table(pos_tables)

        if opts.chromos:
            pos_table = pos_table.loc[pos_table.chromo.isin(opts.chromos)]
        if opts.nb_sample:
            pos_table = pos_table.iloc[:opts.nb_sample]

        log.info('%d samples' % len(pos_table))

        make_dir(opts.out_dir)

        # Iterate over chromosomes
        # ------------------------
        for chromo in pos_table.chromo.unique():
            log.info('-' * 80)
            log.info('Chromosome %s ...' % (chromo))
            idx = pos_table.chromo == chromo
            chromo_pos = pos_table.loc[idx].pos.values
            chromo_outputs = OrderedDict()

            if 'cpg' in outputs:
                # Concatenate CpG tables into single nb_site x nb_output matrix
                chromo_outputs['cpg'] = map_cpg_tables(outputs['cpg'], chromo,
                                                       chromo_pos)
                chromo_outputs['cpg_mat'] = np.vstack(
                    list(chromo_outputs['cpg'].values())).T
                assert len(chromo_outputs['cpg_mat']) == len(chromo_pos)

            if 'bulk' in outputs:
                # Concatenate CpG tables into single nb_site x nb_output matrix
                chromo_outputs['bulk'] = map_cpg_tables(
                    outputs['bulk'], chromo, chromo_pos)

            if 'cpg_mat' in chromo_outputs and opts.cpg_cov:
                cov = np.sum(chromo_outputs['cpg_mat'] != dat.CPG_NAN, axis=1)
                assert np.all(cov >= 1)
                idx = cov >= opts.cpg_cov
                tmp = '%s sites matched minimum coverage filter'
                tmp %= format_out_of(idx.sum(), len(idx))
                log.info(tmp)
                if idx.sum() == 0:
                    continue

                chromo_pos = chromo_pos[idx]
                chromo_outputs = select_dict(chromo_outputs, idx)

            # Read DNA of chromosome
            chromo_dna = None
            if opts.dna_files:
                chromo_dna = fasta.read_chromo(opts.dna_files, chromo)

            annos = None
            if opts.anno_files:
                log.info('Annotating CpG sites ...')
                annos = dict()
                for anno_file in opts.anno_files:
                    name = split_ext(anno_file)
                    annos[name] = annotate(anno_file, chromo, chromo_pos)

            # Iterate over chunks
            # -------------------
            nb_chunk = int(np.ceil(len(chromo_pos) / opts.chunk_size))
            for chunk in range(nb_chunk):
                log.info('Chunk \t%d / %d' % (chunk + 1, nb_chunk))
                chunk_start = chunk * opts.chunk_size
                chunk_end = min(len(chromo_pos), chunk_start + opts.chunk_size)
                chunk_idx = slice(chunk_start, chunk_end)
                chunk_pos = chromo_pos[chunk_idx]

                chunk_outputs = select_dict(chromo_outputs, chunk_idx)

                filename = 'c%s_%06d-%06d.h5' % (chromo, chunk_start,
                                                 chunk_end)
                filename = os.path.join(opts.out_dir, filename)
                chunk_file = h5.File(filename, 'w')

                # Write positions
                chunk_file.create_dataset('chromo',
                                          shape=(len(chunk_pos), ),
                                          dtype='S2')
                chunk_file['chromo'][:] = chromo.encode()
                chunk_file.create_dataset('pos',
                                          data=chunk_pos,
                                          dtype=np.int32)

                if len(chunk_outputs):
                    out_group = chunk_file.create_group('outputs')

                # Write cpg profiles
                if 'cpg' in chunk_outputs:
                    for name, value in chunk_outputs['cpg'].items():
                        assert len(value) == len(chunk_pos)
                        out_group.create_dataset('cpg/%s' % name,
                                                 data=value,
                                                 dtype=np.int8,
                                                 compression='gzip')
                    # Compute and write statistics
                    if cpg_stats_meta is not None:
                        log.info('Computing per CpG statistics ...')
                        cpg_mat = np.ma.masked_values(chunk_outputs['cpg_mat'],
                                                      dat.CPG_NAN)
                        mask = np.sum(~cpg_mat.mask, axis=1)
                        mask = mask < opts.stats_cov
                        for name, fun in cpg_stats_meta.items():
                            stat = fun[0](cpg_mat).data.astype(fun[1])
                            stat[mask] = dat.CPG_NAN
                            assert len(stat) == len(chunk_pos)
                            out_group.create_dataset('stats/%s' % name,
                                                     data=stat,
                                                     dtype=fun[1],
                                                     compression='gzip')

                # Write bulk profiles
                if 'bulk' in chunk_outputs:
                    for name, value in chunk_outputs['bulk'].items():
                        assert len(value) == len(chunk_pos)
                        out_group.create_dataset('bulk/%s' % name,
                                                 data=value,
                                                 dtype=np.float32,
                                                 compression='gzip')

                # Write input features
                in_group = chunk_file.create_group('inputs')

                # DNA windows
                if chromo_dna:
                    log.info('Extracting DNA sequence windows ...')
                    dna_wins = extract_seq_windows(chromo_dna,
                                                   pos=chunk_pos,
                                                   wlen=opts.dna_wlen)
                    assert len(dna_wins) == len(chunk_pos)
                    in_group.create_dataset('dna',
                                            data=dna_wins,
                                            dtype=np.int8,
                                            compression='gzip')

                # CpG neighbors
                if opts.cpg_wlen:
                    log.info('Extracting CpG neighbors ...')
                    cpg_ext = fext.KnnCpgFeatureExtractor(opts.cpg_wlen // 2)
                    context_group = in_group.create_group('cpg')
                    # outputs['cpg'], since neighboring CpG sites might lie
                    # outside chunk borders and un-mapped values are needed
                    for name, cpg_table in outputs['cpg'].items():
                        cpg_table = cpg_table.loc[cpg_table.chromo == chromo]
                        state, dist = cpg_ext.extract(chunk_pos,
                                                      cpg_table.pos.values,
                                                      cpg_table.value.values)
                        nan = np.isnan(state)
                        state[nan] = dat.CPG_NAN
                        dist[nan] = dat.CPG_NAN
                        state = state.astype(np.int8, copy=False)
                        dist = dist.astype(np.float32, copy=False)

                        assert len(state) == len(chunk_pos)
                        assert np.all((state == 0) | (state == 1)
                                      | (state == dat.CPG_NAN))
                        assert len(dist) == len(chunk_pos)
                        assert np.all((dist > 0) | (dist == dat.CPG_NAN))

                        group = context_group.create_group(name)
                        group.create_dataset('state',
                                             data=state,
                                             compression='gzip')
                        group.create_dataset('dist',
                                             data=dist,
                                             compression='gzip')

                if win_stats_meta is not None and opts.cpg_wlen:
                    log.info('Computing window-based statistics ...')
                    states = []
                    dists = []
                    cpg_states = []
                    cpg_group = out_group['cpg']
                    context_group = in_group['cpg']
                    for output_name in cpg_group.keys():
                        state = context_group[output_name]['state'].value
                        states.append(np.expand_dims(state, 2))
                        dist = context_group[output_name]['dist'].value
                        dists.append(np.expand_dims(dist, 2))
                        cpg_states.append(cpg_group[output_name].value)
                    # samples x outputs x cpg_wlen
                    states = np.swapaxes(np.concatenate(states, axis=2), 1, 2)
                    dists = np.swapaxes(np.concatenate(dists, axis=2), 1, 2)
                    cpg_states = np.expand_dims(np.vstack(cpg_states).T, 2)
                    cpg_dists = np.zeros_like(cpg_states)
                    states = np.concatenate([states, cpg_states], axis=2)
                    dists = np.concatenate([dists, cpg_dists], axis=2)

                    for wlen in opts.win_stats_wlen:
                        idx = (states == dat.CPG_NAN) | (dists > wlen // 2)
                        states_wlen = np.ma.masked_array(states, idx)
                        group = out_group.create_group('win_stats/%d' % wlen)
                        for name, fun in win_stats_meta.items():
                            stat = fun[0](states_wlen)
                            if hasattr(stat, 'mask'):
                                idx = stat.mask
                                stat = stat.data
                                if np.sum(idx):
                                    stat[idx] = dat.CPG_NAN
                            group.create_dataset(name,
                                                 data=stat,
                                                 dtype=fun[1],
                                                 compression='gzip')

                if annos:
                    log.info('Adding annotations ...')
                    group = in_group.create_group('annos')
                    for name, anno in annos.items():
                        group.create_dataset(name,
                                             data=anno[chunk_idx],
                                             dtype='int8',
                                             compression='gzip')

                chunk_file.close()

        log.info('Done!')
        return 0
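
This variant additionally stores bulk profiles as float32 without rounding, but the coverage filter is the same as before: a site is kept only if it is observed in at least opts.cpg_cov cells. A toy illustration of that filter, assuming dat.CPG_NAN == -1 (consistent with the comments in the previous example):

import numpy as np

CPG_NAN = -1  # assumed value of dat.CPG_NAN (unobserved / unknown)
cpg_mat = np.array([[1, CPG_NAN],   # observed in one cell
                    [0, 1],         # observed in both cells
                    [CPG_NAN, 1]])  # observed in one cell
cov = np.sum(cpg_mat != CPG_NAN, axis=1)
print(cov)        # [1 2 1]
print(cov >= 2)   # [False  True False] -> with cpg_cov=2 only the second site is kept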
Code example #4
File: dcpg_data.py  Project: cangermueller/deepcpg
    def main(self, name, opts):
        if opts.seed is not None:
            np.random.seed(opts.seed)

        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        log.debug(opts)

        # Check input arguments
        if not opts.cpg_profiles:
            if not (opts.pos_file or opts.dna_files):
                raise ValueError('Position table and DNA database expected!')

        if opts.dna_wlen and opts.dna_wlen % 2 == 0:
            raise ValueError('--dna_wlen must be odd!')
        if opts.cpg_wlen and opts.cpg_wlen % 2 != 0:
            raise ValueError('--cpg_wlen must be even!')

        # Parse functions for computing output statistics
        cpg_stats_meta = None
        win_stats_meta = None
        if opts.cpg_stats:
            cpg_stats_meta = get_stats_meta(opts.cpg_stats)
        if opts.win_stats:
            win_stats_meta = get_stats_meta(opts.win_stats)

        make_dir(opts.out_dir)
        outputs = OrderedDict()

        # Read single-cell profiles if provided
        if opts.cpg_profiles:
            log.info('Reading CpG profiles ...')
            outputs['cpg'] = read_cpg_profiles(
                opts.cpg_profiles,
                chromos=opts.chromos,
                nb_sample=opts.nb_sample,
                nb_sample_chromo=opts.nb_sample_chromo,
                log=log.info)

        # Create table with unique positions
        if opts.pos_file:
            # Read positions from file
            log.info('Reading position table ...')
            pos_table = pd.read_table(opts.pos_file, usecols=[0, 1],
                                      dtype={0: str, 1: np.int32},
                                      header=None, comment='#')
            pos_table.columns = ['chromo', 'pos']
            pos_table['chromo'] = dat.format_chromo(pos_table['chromo'])
            pos_table = prepro_pos_table(pos_table)
        else:
            # Extract positions from profiles
            pos_tables = []
            for cpg_table in list(outputs['cpg'].values()):
                pos_tables.append(cpg_table[['chromo', 'pos']])
            pos_table = prepro_pos_table(pos_tables)

        if opts.chromos:
            pos_table = pos_table.loc[pos_table.chromo.isin(opts.chromos)]
        if opts.nb_sample_chromo:
            pos_table = dat.sample_from_chromo(pos_table, opts.nb_sample_chromo)
        if opts.nb_sample:
            pos_table = pos_table.iloc[:opts.nb_sample]

        log.info('%d samples' % len(pos_table))

        make_dir(opts.out_dir)

        # Iterate over chromosomes
        # ------------------------
        for chromo in pos_table.chromo.unique():
            log.info('-' * 80)
            log.info('Chromosome %s ...' % (chromo))
            idx = pos_table.chromo == chromo
            chromo_pos = pos_table.loc[idx].pos.values
            chromo_outputs = OrderedDict()

            if 'cpg' in outputs:
                # Concatenate CpG tables into single nb_site x nb_output matrix
                chromo_outputs['cpg'] = map_cpg_tables(outputs['cpg'],
                                                       chromo, chromo_pos)
                chromo_outputs['cpg_mat'] = np.vstack(
                    list(chromo_outputs['cpg'].values())).T
                assert len(chromo_outputs['cpg_mat']) == len(chromo_pos)

            if 'cpg_mat' in chromo_outputs and opts.cpg_cov:
                cov = np.sum(chromo_outputs['cpg_mat'] != dat.CPG_NAN, axis=1)
                assert np.all(cov >= 1)
                idx = cov >= opts.cpg_cov
                tmp = '%s sites matched minimum coverage filter'
                tmp %= format_out_of(idx.sum(), len(idx))
                log.info(tmp)
                if idx.sum() == 0:
                    continue

                chromo_pos = chromo_pos[idx]
                chromo_outputs = select_dict(chromo_outputs, idx)

            # Read DNA of chromosome
            chromo_dna = None
            if opts.dna_files:
                chromo_dna = fasta.read_chromo(opts.dna_files, chromo)

            annos = None
            if opts.anno_files:
                log.info('Annotating CpG sites ...')
                annos = dict()
                for anno_file in opts.anno_files:
                    name = split_ext(anno_file)
                    annos[name] = annotate(anno_file, chromo, chromo_pos)

            # Iterate over chunks
            # -------------------
            nb_chunk = int(np.ceil(len(chromo_pos) / opts.chunk_size))
            for chunk in range(nb_chunk):
                log.info('Chunk \t%d / %d' % (chunk + 1, nb_chunk))
                chunk_start = chunk * opts.chunk_size
                chunk_end = min(len(chromo_pos), chunk_start + opts.chunk_size)
                chunk_idx = slice(chunk_start, chunk_end)
                chunk_pos = chromo_pos[chunk_idx]

                chunk_outputs = select_dict(chromo_outputs, chunk_idx)

                filename = 'c%s_%06d-%06d.h5' % (chromo, chunk_start, chunk_end)
                filename = os.path.join(opts.out_dir, filename)
                chunk_file = h5.File(filename, 'w')

                # Write positions
                chunk_file.create_dataset('chromo', shape=(len(chunk_pos),),
                                          dtype='S2')
                chunk_file['chromo'][:] = chromo.encode()
                chunk_file.create_dataset('pos', data=chunk_pos, dtype=np.int32)

                if len(chunk_outputs):
                    out_group = chunk_file.create_group('outputs')

                # Write cpg profiles
                if 'cpg' in chunk_outputs:
                    for name, value in six.iteritems(chunk_outputs['cpg']):
                        assert len(value) == len(chunk_pos)
                        # Round continuous values
                        out_group.create_dataset('cpg/%s' % name,
                                                 data=value.round(),
                                                 dtype=np.int8,
                                                 compression='gzip')
                    # Compute and write statistics
                    if cpg_stats_meta is not None:
                        log.info('Computing per CpG statistics ...')
                        cpg_mat = np.ma.masked_values(chunk_outputs['cpg_mat'],
                                                      dat.CPG_NAN)
                        mask = np.sum(~cpg_mat.mask, axis=1)
                        mask = mask < opts.cpg_stats_cov
                        for name, fun in six.iteritems(cpg_stats_meta):
                            stat = fun[0](cpg_mat).data.astype(fun[1])
                            stat[mask] = dat.CPG_NAN
                            assert len(stat) == len(chunk_pos)
                            out_group.create_dataset('cpg_stats/%s' % name,
                                                     data=stat,
                                                     dtype=fun[1],
                                                     compression='gzip')

                # Write input features
                in_group = chunk_file.create_group('inputs')

                # DNA windows
                if chromo_dna:
                    log.info('Extracting DNA sequence windows ...')
                    dna_wins = extract_seq_windows(chromo_dna, pos=chunk_pos,
                                                   wlen=opts.dna_wlen)
                    assert len(dna_wins) == len(chunk_pos)
                    in_group.create_dataset('dna', data=dna_wins, dtype=np.int8,
                                            compression='gzip')

                # CpG neighbors
                if opts.cpg_wlen:
                    log.info('Extracting CpG neighbors ...')
                    cpg_ext = fext.KnnCpgFeatureExtractor(opts.cpg_wlen // 2)
                    context_group = in_group.create_group('cpg')
                    # outputs['cpg'], since neighboring CpG sites might lie
                    # outside chunk borders and un-mapped values are needed
                    for name, cpg_table in six.iteritems(outputs['cpg']):
                        cpg_table = cpg_table.loc[cpg_table.chromo == chromo]
                        state, dist = cpg_ext.extract(chunk_pos,
                                                      cpg_table.pos.values,
                                                      cpg_table.value.values)
                        nan = np.isnan(state)
                        state[nan] = dat.CPG_NAN
                        dist[nan] = dat.CPG_NAN
                        # States can be binary (np.int8) or continuous
                        # (np.float32).
                        state = state.astype(cpg_table.value.dtype, copy=False)
                        dist = dist.astype(np.float32, copy=False)

                        assert len(state) == len(chunk_pos)
                        assert len(dist) == len(chunk_pos)
                        assert np.all((dist > 0) | (dist == dat.CPG_NAN))

                        group = context_group.create_group(name)
                        group.create_dataset('state', data=state,
                                             compression='gzip')
                        group.create_dataset('dist', data=dist,
                                             compression='gzip')

                if win_stats_meta is not None and opts.cpg_wlen:
                    log.info('Computing window-based statistics ...')
                    states = []
                    dists = []
                    cpg_states = []
                    cpg_group = out_group['cpg']
                    context_group = in_group['cpg']
                    for output_name in six.iterkeys(cpg_group):
                        state = context_group[output_name]['state'].value
                        states.append(np.expand_dims(state, 2))
                        dist = context_group[output_name]['dist'].value
                        dists.append(np.expand_dims(dist, 2))
                        cpg_states.append(cpg_group[output_name].value)
                    # samples x outputs x cpg_wlen
                    states = np.swapaxes(np.concatenate(states, axis=2), 1, 2)
                    dists = np.swapaxes(np.concatenate(dists, axis=2), 1, 2)
                    cpg_states = np.expand_dims(np.vstack(cpg_states).T, 2)
                    cpg_dists = np.zeros_like(cpg_states)
                    states = np.concatenate([states, cpg_states], axis=2)
                    dists = np.concatenate([dists, cpg_dists], axis=2)

                    for wlen in opts.win_stats_wlen:
                        idx = (states == dat.CPG_NAN) | (dists > wlen // 2)
                        states_wlen = np.ma.masked_array(states, idx)
                        group = out_group.create_group('win_stats/%d' % wlen)
                        for name, fun in six.iteritems(win_stats_meta):
                            stat = fun[0](states_wlen)
                            if hasattr(stat, 'mask'):
                                idx = stat.mask
                                stat = stat.data
                                if np.sum(idx):
                                    stat[idx] = dat.CPG_NAN
                            group.create_dataset(name, data=stat, dtype=fun[1],
                                                 compression='gzip')

                if annos:
                    log.info('Adding annotations ...')
                    group = in_group.create_group('annos')
                    for name, anno in six.iteritems(annos):
                        group.create_dataset(name, data=anno[chunk_idx],
                                             dtype='int8',
                                             compression='gzip')

                chunk_file.close()

        log.info('Done!')
        return 0
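
The window-statistics block above rearranges the per-output state and distance arrays into a samples x outputs x cpg_wlen tensor and then appends each site's own CpG state with distance zero. A toy illustration of that reshaping with made-up dimensions:

import numpy as np

nb_sample, nb_output, cpg_wlen = 3, 2, 4
# one (nb_sample, cpg_wlen) state array per output, expanded to 3D
states = [np.expand_dims(np.zeros((nb_sample, cpg_wlen)), 2)
          for _ in range(nb_output)]
states = np.swapaxes(np.concatenate(states, axis=2), 1, 2)
print(states.shape)  # (3, 2, 4): samples x outputs x cpg_wlen

# append the site's own state (one value per sample and output), distance 0
cpg_states = np.expand_dims(np.vstack([np.zeros(nb_sample)] * nb_output).T, 2)
states = np.concatenate([states, cpg_states], axis=2)
print(states.shape)  # (3, 2, 5)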
Code example #5
def run_dcpg_data(pos_file=None,
                  cpg_profiles=None,
                  dna_files=None,
                  cpg_wlen=None,
                  cpg_cov=1,
                  dna_wlen=1001,
                  anno_files=None,
                  chromos=None,
                  nb_sample=None,
                  nb_sample_chromo=None,
                  chunk_size=32768,
                  seed=0,
                  verbose=False):
    if seed is not None:
        np.random.seed(seed)


    # FIXME
    name = "dcpg_data"
    logging.basicConfig(format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)

    # Check input arguments
    if not cpg_profiles:
        if not (pos_file or dna_files):
            raise ValueError('Position table and DNA database expected!')

    if dna_wlen and dna_wlen % 2 == 0:
        raise ValueError('dna_wlen must be odd!')
    if cpg_wlen and cpg_wlen % 2 != 0:
        raise ValueError('cpg_wlen must be even!')

    """
    # Parse functions for computing output statistics
    cpg_stats_meta = None
    win_stats_meta = None
    if cpg_stats:
        cpg_stats_meta = get_stats_meta(cpg_stats)
    if win_stats:
        win_stats_meta = get_stats_meta(win_stats)
    """

    outputs = OrderedDict()

    # Read single-cell profiles if provided
    if cpg_profiles:
        log.info('Reading CpG profiles ...')
        outputs['cpg'] = read_cpg_profiles(
            cpg_profiles,
            chromos=chromos,
            nb_sample=nb_sample,
            nb_sample_chromo=nb_sample_chromo,
            log=log.info)

    # Create table with unique positions
    if pos_file:
        # Read positions from file
        log.info('Reading position table ...')
        pos_table = pd.read_table(pos_file, usecols=[0, 1],
                                  dtype={0: str, 1: np.int32},
                                  header=None, comment='#')
        pos_table.columns = ['chromo', 'pos']
        pos_table['chromo'] = dat.format_chromo(pos_table['chromo'])
        pos_table = prepro_pos_table(pos_table)
    else:
        # Extract positions from profiles
        pos_tables = []
        for cpg_table in list(outputs['cpg'].values()):
            pos_tables.append(cpg_table[['chromo', 'pos']])
        pos_table = prepro_pos_table(pos_tables)

    if chromos:
        pos_table = pos_table.loc[pos_table.chromo.isin(chromos)]
    if nb_sample_chromo:
        pos_table = dat.sample_from_chromo(pos_table, nb_sample_chromo)
    if nb_sample:
        pos_table = pos_table.iloc[:nb_sample]

    log.info('%d samples' % len(pos_table))


    # Iterate over chromosomes
    # ------------------------
    for chromo in pos_table.chromo.unique():
        log.info('-' * 80)
        log.info('Chromosome %s ...' % (chromo))
        idx = pos_table.chromo == chromo
        chromo_pos = pos_table.loc[idx].pos.values
        chromo_outputs = OrderedDict()

        if 'cpg' in outputs:
            # Concatenate CpG tables into single nb_site x nb_output matrix
            chromo_outputs['cpg'] = map_cpg_tables(outputs['cpg'],
                                                   chromo, chromo_pos)
            chromo_outputs['cpg_mat'] = np.vstack(
                list(chromo_outputs['cpg'].values())).T
            assert len(chromo_outputs['cpg_mat']) == len(chromo_pos)

        if 'cpg_mat' in chromo_outputs and cpg_cov:
            cov = np.sum(chromo_outputs['cpg_mat'] != dat.CPG_NAN, axis=1)
            assert np.all(cov >= 1)
            idx = cov >= cpg_cov
            tmp = '%s sites matched minimum coverage filter'
            tmp %= format_out_of(idx.sum(), len(idx))
            log.info(tmp)
            if idx.sum() == 0:
                continue

            chromo_pos = chromo_pos[idx]
            chromo_outputs = select_dict(chromo_outputs, idx)

        # Read DNA of chromosome
        chromo_dna = None
        if dna_files:
            chromo_dna = fasta.read_chromo(dna_files, chromo)

        annos = None
        if anno_files:
            log.info('Annotating CpG sites ...')
            annos = dict()
            for anno_file in anno_files:
                name = split_ext(anno_file)
                annos[name] = annotate(anno_file, chromo, chromo_pos)

        # Iterate over chunks
        # -------------------
        nb_chunk = int(np.ceil(len(chromo_pos) / chunk_size))
        for chunk in range(nb_chunk):
            log.info('Chunk \t%d / %d' % (chunk + 1, nb_chunk))
            chunk_start = chunk * chunk_size
            chunk_end = min(len(chromo_pos), chunk_start + chunk_size)
            chunk_idx = slice(chunk_start, chunk_end)
            chunk_pos = chromo_pos[chunk_idx]

            chunk_outputs = select_dict(chromo_outputs, chunk_idx)

            #filename = 'c%s_%06d-%06d.h5' % (chromo, chunk_start, chunk_end)
            #filename = os.path.join(out_dir, filename)
            #chunk_file = h5.File(filename, 'w')

            # Write positions
            #chunk_file.create_dataset('chromo', shape=(len(chunk_pos),),
            #                          dtype='S2')
            #chunk_file['chromo'][:] = chromo.encode()
            #chunk_file.create_dataset('pos', data=chunk_pos, dtype=np.int32)

            yield_dict = {}

            yield_dict["chromo"] = np.array([chromo.encode()]*len(chunk_pos), dtype='S2')
            yield_dict["pos"] = np.array(chunk_pos, dtype=np.int32)


            if len(chunk_outputs):
                #out_group = chunk_file.create_group('outputs')
                yield_dict["outputs"] = {}
                out_group = yield_dict["outputs"]


            # Write cpg profiles
            if 'cpg' in chunk_outputs:
                yield_dict["outputs"]['cpg']={}
                for name, value in six.iteritems(chunk_outputs['cpg']):
                    assert len(value) == len(chunk_pos)
                    # Round continuous values
                    #out_group.create_dataset('cpg/%s' % name,
                    #                         data=value.round(),
                    #                         dtype=np.int8,
                    #                         compression='gzip')
                    out_group['cpg'][name] = np.array(value.round(), np.int8)
                """
                # Compute and write statistics
                if cpg_stats_meta is not None:
                    log.info('Computing per CpG statistics ...')
                    cpg_mat = np.ma.masked_values(chunk_outputs['cpg_mat'],
                                                  dat.CPG_NAN)
                    mask = np.sum(~cpg_mat.mask, axis=1)
                    mask = mask < cpg_stats_cov
                    for name, fun in six.iteritems(cpg_stats_meta):
                        stat = fun[0](cpg_mat).data.astype(fun[1])
                        stat[mask] = dat.CPG_NAN
                        assert len(stat) == len(chunk_pos)
                        out_group.create_dataset('cpg_stats/%s' % name,
                                                 data=stat,
                                                 dtype=fun[1],
                                                 compression='gzip')
                """

            # Write input features
            #in_group = chunk_file.create_group('inputs')
            yield_dict["inputs"] = {}
            in_group = yield_dict["inputs"]

            # DNA windows
            if chromo_dna:
                log.info('Extracting DNA sequence windows ...')
                dna_wins = extract_seq_windows(chromo_dna, pos=chunk_pos,
                                               wlen=dna_wlen)
                assert len(dna_wins) == len(chunk_pos)
                #in_group.create_dataset('dna', data=dna_wins, dtype=np.int8,
                #                        compression='gzip')
                in_group['dna'] = np.array(dna_wins, dtype=np.int8)

            # CpG neighbors
            if cpg_wlen:
                log.info('Extracting CpG neighbors ...')
                cpg_ext = fext.KnnCpgFeatureExtractor(cpg_wlen // 2)
                #context_group = in_group.create_group('cpg')
                in_group['cpg'] = {}
                context_group = in_group['cpg']
                # outputs['cpg'], since neighboring CpG sites might lie
                # outside chunk borders and un-mapped values are needed
                for name, cpg_table in six.iteritems(outputs['cpg']):
                    cpg_table = cpg_table.loc[cpg_table.chromo == chromo]
                    state, dist = cpg_ext.extract(chunk_pos,
                                                  cpg_table.pos.values,
                                                  cpg_table.value.values)
                    nan = np.isnan(state)
                    state[nan] = dat.CPG_NAN
                    dist[nan] = dat.CPG_NAN
                    # States can be binary (np.int8) or continuous
                    # (np.float32).
                    state = state.astype(cpg_table.value.dtype, copy=False)
                    dist = dist.astype(np.float32, copy=False)

                    assert len(state) == len(chunk_pos)
                    assert len(dist) == len(chunk_pos)
                    assert np.all((dist > 0) | (dist == dat.CPG_NAN))

                    #group = context_group.create_group(name)
                    #group.create_dataset('state', data=state,
                    #                     compression='gzip')
                    #group.create_dataset('dist', data=dist,
                    #                     compression='gzip')
                    context_group[name] = {'state': state, 'dist':dist}

            """
            if win_stats_meta is not None and cpg_wlen:
                log.info('Computing window-based statistics ...')
                states = []
                dists = []
                cpg_states = []
                cpg_group = out_group['cpg']
                context_group = in_group['cpg']
                for output_name in six.iterkeys(cpg_group):
                    state = context_group[output_name]['state']#.value
                    states.append(np.expand_dims(state, 2))
                    dist = context_group[output_name]['dist']#.value
                    dists.append(np.expand_dims(dist, 2))
                    #cpg_states.append(cpg_group[output_name].value)
                    cpg_states.append(cpg_group[output_name])
                # samples x outputs x cpg_wlen
                states = np.swapaxes(np.concatenate(states, axis=2), 1, 2)
                dists = np.swapaxes(np.concatenate(dists, axis=2), 1, 2)
                cpg_states = np.expand_dims(np.vstack(cpg_states).T, 2)
                cpg_dists = np.zeros_like(cpg_states)
                states = np.concatenate([states, cpg_states], axis=2)
                dists = np.concatenate([dists, cpg_dists], axis=2)

                for wlen in win_stats_wlen:
                    idx = (states == dat.CPG_NAN) | (dists > wlen // 2)
                    states_wlen = np.ma.masked_array(states, idx)
                    group = out_group.create_group('win_stats/%d' % wlen)
                    for name, fun in six.iteritems(win_stats_meta):
                        stat = fun[0](states_wlen)
                        if hasattr(stat, 'mask'):
                            idx = stat.mask
                            stat = stat.data
                            if np.sum(idx):
                                stat[idx] = dat.CPG_NAN
                        group.create_dataset(name, data=stat, dtype=fun[1],
                                             compression='gzip')

            if annos:
                log.info('Adding annotations ...')
                group = in_group.create_group('annos')
                for name, anno in six.iteritems(annos):
                    group.create_dataset(name, data=anno[chunk_idx],
                                         dtype='int8',
                                         compression='gzip')
            """

            #chunk_file.close()

            flat_dict = {}
            flatten_dict(yield_dict, flat_dict, no_prefix=True)
            yield flat_dict

    log.info('Done preprocessing!')
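
A hypothetical way to consume the generator variant above; the file paths are placeholders, dna_files is passed in whatever form fasta.read_chromo expects, and the exact keys of each yielded dict depend on flatten_dict(), which is defined elsewhere:

data_iter = run_dcpg_data(cpg_profiles=['./cpg/BS27_1_SER.tsv',
                                        './cpg/BS27_3_SER.tsv'],
                          dna_files='./dna_db',
                          dna_wlen=1001,
                          cpg_wlen=50)
for chunk in data_iter:
    print(sorted(chunk.keys()))  # e.g. chromo, pos, dna, ... (depends on flatten_dict)
    break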