Example #1
    def main(self, name, opts):
        if opts.seed is not None:
            np.random.seed(opts.seed)

        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        log.debug(opts)

        # Check input arguments
        if not opts.cpg_profiles:
            if not (opts.pos_file or opts.dna_files):
                raise ValueError('Position table and DNA database expected!')

        if opts.dna_wlen and opts.dna_wlen % 2 == 0:
            raise ValueError('--dna_wlen must be odd!')
        if opts.cpg_wlen and opts.cpg_wlen % 2 != 0:
            raise ValueError('--cpg_wlen must be even!')

        # Parse functions for computing output statistics
        cpg_stats_meta = None
        win_stats_meta = None
        if opts.cpg_stats:
            cpg_stats_meta = get_stats_meta(opts.cpg_stats)
        if opts.win_stats:
            win_stats_meta = get_stats_meta(opts.win_stats)

        make_dir(opts.out_dir)
        outputs = OrderedDict()

        # Read single-cell profiles if provided
        if opts.cpg_profiles:
            log.info('Reading CpG profiles ...')
            outputs['cpg'] = read_cpg_profiles(
                opts.cpg_profiles,
                chromos=opts.chromos,
                nb_sample=opts.nb_sample,
                nb_sample_chromo=opts.nb_sample_chromo,
                log=log.info)

        # Create table with unique positions
        if opts.pos_file:
            # Read positions from file
            log.info('Reading position table ...')
            pos_table = pd.read_table(opts.pos_file,
                                      usecols=[0, 1],
                                      dtype={
                                          0: str,
                                          1: np.int32
                                      },
                                      header=None,
                                      comment='#')
            pos_table.columns = ['chromo', 'pos']
            pos_table['chromo'] = dat.format_chromo(pos_table['chromo'])
            pos_table = prepro_pos_table(pos_table)
        else:
            # Extract positions from profiles
            pos_tables = []
            for cpg_table in list(outputs['cpg'].values()):
                pos_tables.append(cpg_table[['chromo', 'pos']])
            pos_table = prepro_pos_table(pos_tables)

        if opts.chromos:
            pos_table = pos_table.loc[pos_table.chromo.isin(opts.chromos)]
        if opts.nb_sample_chromo:
            pos_table = dat.sample_from_chromo(pos_table,
                                               opts.nb_sample_chromo)
        if opts.nb_sample:
            pos_table = pos_table.iloc[:opts.nb_sample]

        log.info('%d samples' % len(pos_table))

        make_dir(opts.out_dir)

        # Iterate over chromosomes
        # ------------------------
        for chromo in pos_table.chromo.unique():
            log.info('-' * 80)
            log.info('Chromosome %s ...' % (chromo))
            idx = pos_table.chromo == chromo
            chromo_pos = pos_table.loc[idx].pos.values
            chromo_outputs = OrderedDict()

            if 'cpg' in outputs:
                # Concatenate CpG tables into single nb_site x nb_output matrix
                chromo_outputs['cpg'] = map_cpg_tables(outputs['cpg'], chromo,
                                                       chromo_pos)
                chromo_outputs['cpg_mat'] = np.vstack(
                    list(chromo_outputs['cpg'].values())).T
                assert len(chromo_outputs['cpg_mat']) == len(chromo_pos)

            if 'cpg_mat' in chromo_outputs and opts.cpg_cov:
                cov = np.sum(chromo_outputs['cpg_mat'] != dat.CPG_NAN, axis=1)
                idx = cov >= opts.cpg_cov
                tmp = '%s sites matched minimum coverage filter'
                tmp %= format_out_of(idx.sum(), len(idx))
                log.info(tmp)
                if idx.sum() == 0:
                    continue

                chromo_pos = chromo_pos[idx]
                chromo_outputs = select_dict(chromo_outputs, idx)

            # Read DNA of chromosome
            chromo_dna = None
            if opts.dna_files:
                chromo_dna = fasta.read_chromo(opts.dna_files, chromo)

            annos = None
            if opts.anno_files:
                log.info('Annotating CpG sites ...')
                annos = dict()
                for anno_file in opts.anno_files:
                    name = split_ext(anno_file)
                    annos[name] = annotate(anno_file, chromo, chromo_pos)

            # Iterate over chunks
            # -------------------
            nb_chunk = int(np.ceil(len(chromo_pos) / opts.chunk_size))
            for chunk in range(nb_chunk):
                log.info('Chunk \t%d / %d' % (chunk + 1, nb_chunk))
                chunk_start = chunk * opts.chunk_size
                chunk_end = min(len(chromo_pos), chunk_start + opts.chunk_size)
                chunk_idx = slice(chunk_start, chunk_end)
                chunk_pos = chromo_pos[chunk_idx]

                chunk_outputs = select_dict(chromo_outputs, chunk_idx)

                filename = 'c%s_%06d-%06d.h5' % (chromo, chunk_start,
                                                 chunk_end)
                filename = os.path.join(opts.out_dir, filename)
                chunk_file = h5.File(filename, 'w')

                # Write positions
                chunk_file.create_dataset('chromo',
                                          shape=(len(chunk_pos), ),
                                          dtype='S2')
                chunk_file['chromo'][:] = chromo.encode()
                chunk_file.create_dataset('pos',
                                          data=chunk_pos,
                                          dtype=np.int32)

                if len(chunk_outputs):
                    out_group = chunk_file.create_group('outputs')

                # Write cpg profiles
                if 'cpg' in chunk_outputs:
                    for name, value in six.iteritems(chunk_outputs['cpg']):
                        assert len(value) == len(chunk_pos)
                        # Round continuous values
                        out_group.create_dataset('cpg/%s' % name,
                                                 data=value.round(),
                                                 dtype=np.int8,
                                                 compression='gzip')
                    # Compute and write statistics
                    if cpg_stats_meta is not None:
                        log.info('Computing per CpG statistics ...')
                        cpg_mat = np.ma.masked_values(chunk_outputs['cpg_mat'],
                                                      dat.CPG_NAN)
                        mask = np.sum(~cpg_mat.mask, axis=1)
                        mask = mask < opts.cpg_stats_cov
                        for name, fun in six.iteritems(cpg_stats_meta):
                            stat = fun[0](cpg_mat).data.astype(fun[1])
                            stat[mask] = dat.CPG_NAN
                            assert len(stat) == len(chunk_pos)
                            out_group.create_dataset('cpg_stats/%s' % name,
                                                     data=stat,
                                                     dtype=fun[1],
                                                     compression='gzip')

                # Write input features
                in_group = chunk_file.create_group('inputs')

                # DNA windows
                if chromo_dna:
                    log.info('Extracting DNA sequence windows ...')
                    dna_wins = extract_seq_windows(chromo_dna,
                                                   pos=chunk_pos,
                                                   wlen=opts.dna_wlen)
                    assert len(dna_wins) == len(chunk_pos)
                    in_group.create_dataset('dna',
                                            data=dna_wins,
                                            dtype=np.int8,
                                            compression='gzip')

                # CpG neighbors
                if opts.cpg_wlen:
                    log.info('Extracting CpG neighbors ...')
                    cpg_ext = fext.KnnCpgFeatureExtractor(opts.cpg_wlen // 2)
                    context_group = in_group.create_group('cpg')
                    # outputs['cpg'], since neighboring CpG sites might lie
                    # outside chunk borders and un-mapped values are needed
                    for name, cpg_table in six.iteritems(outputs['cpg']):
                        cpg_table = cpg_table.loc[cpg_table.chromo == chromo]
                        state, dist = cpg_ext.extract(chunk_pos,
                                                      cpg_table.pos.values,
                                                      cpg_table.value.values)
                        nan = np.isnan(state)
                        state[nan] = dat.CPG_NAN
                        dist[nan] = dat.CPG_NAN
                        # States can be binary (np.int8) or continuous
                        # (np.float32).
                        state = state.astype(cpg_table.value.dtype, copy=False)
                        dist = dist.astype(np.float32, copy=False)

                        assert len(state) == len(chunk_pos)
                        assert len(dist) == len(chunk_pos)
                        assert np.all((dist > 0) | (dist == dat.CPG_NAN))

                        group = context_group.create_group(name)
                        group.create_dataset('state',
                                             data=state,
                                             compression='gzip')
                        group.create_dataset('dist',
                                             data=dist,
                                             compression='gzip')

                if win_stats_meta is not None and opts.cpg_wlen:
                    log.info('Computing window-based statistics ...')
                    states = []
                    dists = []
                    cpg_states = []
                    cpg_group = out_group['cpg']
                    context_group = in_group['cpg']
                    for output_name in six.iterkeys(cpg_group):
                        state = context_group[output_name]['state'][()]
                        states.append(np.expand_dims(state, 2))
                        dist = context_group[output_name]['dist'][()]
                        dists.append(np.expand_dims(dist, 2))
                        cpg_states.append(cpg_group[output_name][()])
                    # samples x outputs x cpg_wlen
                    states = np.swapaxes(np.concatenate(states, axis=2), 1, 2)
                    dists = np.swapaxes(np.concatenate(dists, axis=2), 1, 2)
                    cpg_states = np.expand_dims(np.vstack(cpg_states).T, 2)
                    cpg_dists = np.zeros_like(cpg_states)
                    states = np.concatenate([states, cpg_states], axis=2)
                    dists = np.concatenate([dists, cpg_dists], axis=2)

                    for wlen in opts.win_stats_wlen:
                        idx = (states == dat.CPG_NAN) | (dists > wlen // 2)
                        states_wlen = np.ma.masked_array(states, idx)
                        group = out_group.create_group('win_stats/%d' % wlen)
                        for name, fun in six.iteritems(win_stats_meta):
                            stat = fun[0](states_wlen)
                            if hasattr(stat, 'mask'):
                                idx = stat.mask
                                stat = stat.data
                                if np.sum(idx):
                                    stat[idx] = dat.CPG_NAN
                            group.create_dataset(name,
                                                 data=stat,
                                                 dtype=fun[1],
                                                 compression='gzip')

                if annos:
                    log.info('Adding annotations ...')
                    group = in_group.create_group('annos')
                    for name, anno in six.iteritems(annos):
                        group.create_dataset(name,
                                             data=anno[chunk_idx],
                                             dtype='int8',
                                             compression='gzip')

                chunk_file.close()

        log.info('Done!')
        return 0
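
The function above writes one HDF5 file per chunk, containing 'chromo' and 'pos' datasets plus 'inputs' and 'outputs' groups. As a hedged illustration, the sketch below shows how such a chunk file could be inspected with h5py; the filename is hypothetical and only mirrors the 'c%s_%06d-%06d.h5' pattern used above, and which datasets exist depends on the options that were enabled.

import h5py as h5

# Hypothetical chunk file name following the 'c%s_%06d-%06d.h5' pattern above.
filename = './out_dir/c1_000000-032768.h5'

with h5.File(filename, 'r') as chunk_file:
    # List every group and dataset, e.g. 'pos', 'inputs/dna', 'outputs/cpg/<sample>'.
    chunk_file.visit(print)

    pos = chunk_file['pos'][()]        # int32 genomic positions of this chunk
    chromo = chunk_file['chromo'][0]   # byte string, identical for every site
    print('%d sites on chromosome %s' % (len(pos), chromo.decode()))

    # DNA sequence windows are only present if --dna_files was given.
    if 'inputs' in chunk_file and 'dna' in chunk_file['inputs']:
        dna = chunk_file['inputs/dna'][()]  # shape (nb_site, dna_wlen), int8-encoded bases
        print('DNA windows:', dna.shape)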
Example #2
    def main(self, name, opts):
        if opts.seed is not None:
            np.random.seed(opts.seed)

        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        log.debug(opts)

        # Check input arguments
        if not opts.cpg_profiles:
            if not (opts.pos_file or opts.dna_files):
                raise ValueError('Position table and DNA database expected!')

        if opts.dna_wlen and opts.dna_wlen % 2 == 0:
            raise ValueError('--dna_wlen must be odd!')
        if opts.cpg_wlen and opts.cpg_wlen % 2 != 0:
            raise ValueError('--cpg_wlen must be even!')

        # Parse functions for computing output statistics
        cpg_stats_meta = None
        win_stats_meta = None
        if opts.cpg_stats:
            cpg_stats_meta = get_stats_meta(opts.cpg_stats)
        if opts.win_stats:
            win_stats_meta = get_stats_meta(opts.win_stats)

        make_dir(opts.out_dir)
        outputs = OrderedDict()

        # Read single-cell profiles if provided
        if opts.cpg_profiles:
            log.info('Reading CpG profiles ...')
            outputs['cpg'] = read_cpg_profiles(
                opts.cpg_profiles,
                chromos=opts.chromos,
                nb_sample=opts.nb_sample,
                nb_sample_chromo=opts.nb_sample_chromo,
                log=log.info)

        # Create table with unique positions
        if opts.pos_file:  # the pos_file provides the CpG positions to be predicted
            # Read positions from file
            log.info('Reading position table ...')
            pos_table = pd.read_table(opts.pos_file,
                                      usecols=[0, 1],
                                      dtype={
                                          0: str,
                                          1: np.int32
                                      },
                                      header=None,
                                      comment='#')
            pos_table.columns = ['chromo', 'pos']
            pos_table['chromo'] = dat.format_chromo(pos_table['chromo'])
            pos_table = prepro_pos_table(pos_table)
        else:
            # Extract positions from the profiles if no position file is given;
            # predict positions that are observed in at least one cell.
            pos_tables = []
            for cpg_table in list(outputs['cpg'].values()):
                pos_tables.append(cpg_table[['chromo', 'pos']])
            pos_table = prepro_pos_table(pos_tables)

        if opts.chromos:
            pos_table = pos_table.loc[pos_table.chromo.isin(opts.chromos)]
        if opts.nb_sample_chromo:
            pos_table = dat.sample_from_chromo(pos_table,
                                               opts.nb_sample_chromo)
        if opts.nb_sample:
            pos_table = pos_table.iloc[:opts.nb_sample]

        log.info('%d samples' % len(pos_table))

        make_dir(opts.out_dir)

        # Iterate over chromosomes
        # ------------------------
        for chromo in pos_table.chromo.unique():
            log.info('-' * 80)
            log.info('Chromosome %s ...' % (chromo))
            idx = pos_table.chromo == chromo  # boolean mask: rows belonging to this chromosome
            chromo_pos = pos_table.loc[idx].pos.values  # 1D numpy array of positions
            chromo_outputs = OrderedDict()

            if 'cpg' in outputs:
                # Concatenate CpG tables into single nb_site x nb_output matrix
                chromo_outputs['cpg'] = map_cpg_tables(outputs['cpg'], chromo,
                                                       chromo_pos)
                # chromo_outputs['cpg'] maps each sample name to its values at the
                # target positions, e.g.
                # OrderedDict([('BS27_1_SER', array([1, 1, 1, ..., 1, 1, 0], dtype=int8)),
                #              ('BS27_3_SER', array([-1,  1,  1, ...,  1, -1, -1], dtype=int8))])
                chromo_outputs['cpg_mat'] = np.vstack(
                    list(chromo_outputs['cpg'].values())).T
                # np.vstack stacks the per-sample arrays row-wise; transposing yields a
                # (nb_site, nb_sample) matrix, e.g. shape (402166, 2) for chr1 with the
                # two samples BS27_1_SER and BS27_3_SER.
                assert len(chromo_outputs['cpg_mat']) == len(chromo_pos)

            if 'cpg_mat' in chromo_outputs and opts.cpg_cov:
                cov = np.sum(chromo_outputs['cpg_mat'] != dat.CPG_NAN, axis=1)
                assert np.all(cov >= 1)
                idx = cov >= opts.cpg_cov
                tmp = '%s sites matched minimum coverage filter'
                tmp %= format_out_of(idx.sum(), len(idx))
                log.info(tmp)
                if idx.sum() == 0:
                    continue

                chromo_pos = chromo_pos[idx]
                chromo_outputs = select_dict(chromo_outputs, idx)

            # Read DNA of chromosome
            chromo_dna = None
            if opts.dna_files:  # only the sequence of the current chromosome is read
                chromo_dna = fasta.read_chromo(opts.dna_files, chromo)
                # chromo_dna is a plain string, e.g. len=195471971 for chr1

            annos = None
            if opts.anno_files:
                log.info('Annotating CpG sites ...')
                annos = dict()
                for anno_file in opts.anno_files:
                    name = split_ext(anno_file)
                    annos[name] = annotate(anno_file, chromo, chromo_pos)

            # Iterate over chunks
            # -------------------
            nb_chunk = int(np.ceil(len(chromo_pos) / opts.chunk_size))
            for chunk in range(nb_chunk):
                log.info('Chunk \t%d / %d' % (chunk + 1, nb_chunk))
                chunk_start = chunk * opts.chunk_size
                chunk_end = min(len(chromo_pos), chunk_start + opts.chunk_size)
                chunk_idx = slice(chunk_start, chunk_end)
                chunk_pos = chromo_pos[chunk_idx]

                chunk_outputs = select_dict(chromo_outputs, chunk_idx)
                # chunk_outputs is an OrderedDict with the same keys as
                # chromo_outputs, each value sliced to the current chunk

                filename = 'c%s_%06d-%06d.h5' % (chromo, chunk_start,
                                                 chunk_end)
                filename = os.path.join(opts.out_dir, filename)
                chunk_file = h5.File(filename, 'w')

                # Write positions
                chunk_file.create_dataset('chromo',
                                          shape=(len(chunk_pos), ),
                                          dtype='S2')  # fixed-length byte strings
                chunk_file['chromo'][:] = chromo.encode()
                # every entry of chunk_file['chromo'] holds the chromosome name;
                # chunk_file['chromo'].shape = (32768,) for a full chunk
                chunk_file.create_dataset('pos',
                                          data=chunk_pos,
                                          dtype=np.int32)
                # chunk_file['pos'].shape = (32768,), i.e. the default chunk_size

                if len(chunk_outputs):  # e.g. 2 entries: 'cpg' and 'cpg_mat'
                    out_group = chunk_file.create_group('outputs')
                    # out_group is an h5py Group; list(out_group) == [] until
                    # datasets are written below

                # Write cpg profiles
                if 'cpg' in chunk_outputs:
                    for name, value in six.iteritems(chunk_outputs['cpg']):
                        #name = ["BS27_1_SER", 'BS27_3_SER'] # the sample name
                        #value= 2 numpy array, both with shape=(32768,)
                        assert len(value) == len(chunk_pos)
                        # Round continuous values
                        out_group.create_dataset('cpg/%s' % name,
                                                 data=value.round(),
                                                 dtype=np.int8,
                                                 compression='gzip')
                        # after writing: list(out_group) == ['cpg'] and
                        # list(out_group['cpg']) == ['BS27_1_SER', 'BS27_3_SER']

                    # Compute and write statistics
                    if cpg_stats_meta is not None:
                        log.info('Computing per CpG statistics ...')
                        cpg_mat = np.ma.masked_values(chunk_outputs['cpg_mat'],
                                                      dat.CPG_NAN)
                        #cpg_mat.shape=(32768, 2)
                        mask = np.sum(~cpg_mat.mask, axis=1)
                        mask = mask < opts.cpg_stats_cov
                        for name, fun in six.iteritems(cpg_stats_meta):
                            stat = fun[0](cpg_mat).data.astype(fun[1])
                            stat[mask] = dat.CPG_NAN
                            assert len(stat) == len(chunk_pos)
                            out_group.create_dataset('cpg_stats/%s' % name,
                                                     data=stat,
                                                     dtype=fun[1],
                                                     compression='gzip')

                # At this point the chunk file contains:
                # >>> chunk_file.visit(printname)
                # chromo
                # outputs
                # outputs/cpg
                # outputs/cpg/BS27_1_SER
                # outputs/cpg/BS27_3_SER
                # pos

                # Write input features
                in_group = chunk_file.create_group('inputs')

                # DNA windows
                if chromo_dna:
                    log.info('Extracting DNA sequence windows ...')
                    dna_wins = extract_seq_windows(chromo_dna,
                                                   pos=chunk_pos,
                                                   wlen=opts.dna_wlen)
                    # Given the chromosome sequence ('chromo_dna'), the target
                    # positions ('chunk_pos'), and e.g. wlen=1001, this returns a
                    # numpy array of shape (32768, 1001) with bases encoded as
                    # integers rather than characters.
                    assert len(dna_wins) == len(chunk_pos)
                    in_group.create_dataset('dna',
                                            data=dna_wins,
                                            dtype=np.int8,
                                            compression='gzip')
                    # >>> in_group.visit(printname) now lists 'dna'

                # CpG neighbors
                if opts.cpg_wlen:
                    log.info('Extracting CpG neighbors ...')
                    cpg_ext = fext.KnnCpgFeatureExtractor(opts.cpg_wlen // 2)
                    context_group = in_group.create_group('cpg')
                    # outputs['cpg'], since neighboring CpG sites might lie
                    # outside chunk borders and un-mapped values are needed
                    for name, cpg_table in six.iteritems(outputs['cpg']):
                        #name="BS27_1_SER" and "BS27_3_SER"
                        #cpg_table = numpy array, with three columns information for each input sample.
                        cpg_table = cpg_table.loc[cpg_table.chromo == chromo]
                        state, dist = cpg_ext.extract(
                            chunk_pos, cpg_table.pos.values,
                            cpg_table.value.values
                        )  #extract the cpg distance and state with wlen
                        nan = np.isnan(state)
                        state[
                            nan] = dat.CPG_NAN  #set nan value as -1, which means unknown
                        dist[nan] = dat.CPG_NAN
                        # States can be binary (np.int8) or continuous
                        # (np.float32).
                        state = state.astype(cpg_table.value.dtype,
                                             copy=False)  #set data type
                        dist = dist.astype(np.float32, copy=False)

                        assert len(state) == len(chunk_pos)
                        assert len(dist) == len(chunk_pos)
                        assert np.all((dist > 0) | (dist == dat.CPG_NAN))

                        group = context_group.create_group(name)
                        group.create_dataset('state',
                                             data=state,
                                             compression='gzip')
                        group.create_dataset('dist',
                                             data=dist,
                                             compression='gzip')
                        #list(group) = ['state','dist']

                if win_stats_meta is not None and opts.cpg_wlen:
                    log.info('Computing window-based statistics ...')
                    states = []
                    dists = []
                    cpg_states = []
                    cpg_group = out_group['cpg']
                    context_group = in_group['cpg']
                    for output_name in six.iterkeys(cpg_group):
                        state = context_group[output_name]['state'][()]
                        states.append(np.expand_dims(state, 2))
                        dist = context_group[output_name]['dist'][()]
                        dists.append(np.expand_dims(dist, 2))
                        cpg_states.append(cpg_group[output_name][()])
                    # samples x outputs x cpg_wlen
                    states = np.swapaxes(np.concatenate(states, axis=2), 1, 2)
                    dists = np.swapaxes(np.concatenate(dists, axis=2), 1, 2)
                    cpg_states = np.expand_dims(np.vstack(cpg_states).T, 2)
                    cpg_dists = np.zeros_like(cpg_states)
                    states = np.concatenate([states, cpg_states], axis=2)
                    dists = np.concatenate([dists, cpg_dists], axis=2)

                    for wlen in opts.win_stats_wlen:
                        idx = (states == dat.CPG_NAN) | (dists > wlen // 2)
                        states_wlen = np.ma.masked_array(states, idx)
                        group = out_group.create_group('win_stats/%d' % wlen)
                        for name, fun in six.iteritems(win_stats_meta):
                            stat = fun[0](states_wlen)
                            if hasattr(stat, 'mask'):
                                idx = stat.mask
                                stat = stat.data
                                if np.sum(idx):
                                    stat[idx] = dat.CPG_NAN
                            group.create_dataset(name,
                                                 data=stat,
                                                 dtype=fun[1],
                                                 compression='gzip')

                if annos:
                    log.info('Adding annotations ...')
                    group = in_group.create_group('annos')
                    for name, anno in six.iteritems(annos):
                        group.create_dataset(name,
                                             data=anno[chunk_idx],
                                             dtype='int8',
                                             compression='gzip')

                chunk_file.close()

        log.info('Done!')
        return 0
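
The per-CpG statistics block above masks CPG_NAN entries (-1, as noted in the comments) with np.ma.masked_values and resets sites observed in fewer than --cpg_stats_cov cells back to CPG_NAN. Below is a minimal, self-contained sketch of that masking pattern; the matrix, the coverage threshold, and the use of a plain mean instead of the project's stats functions are assumptions for illustration.

import numpy as np

CPG_NAN = -1       # missing-value marker, as in the comments above
cpg_stats_cov = 2  # hypothetical minimum number of observed cells per site

# Hypothetical (nb_site x nb_cell) matrix like chunk_outputs['cpg_mat']:
# rows are CpG sites, columns are cells, -1 marks unobserved entries.
cpg_mat = np.array([[1, 0],
                    [CPG_NAN, 1],
                    [1, 1]], dtype=np.int8)

# Mask missing entries so statistics ignore them.
masked = np.ma.masked_values(cpg_mat, CPG_NAN)

# Sites observed in fewer than cpg_stats_cov cells get CPG_NAN as their statistic.
low_cov = np.sum(~masked.mask, axis=1) < cpg_stats_cov

stat = masked.mean(axis=1).data.astype(np.float32)  # e.g. mean methylation per site
stat[low_cov] = CPG_NAN
print(stat)  # approximately [0.5, -1.0, 1.0]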
Example #3
    def main(self, name, opts):
        if opts.seed is not None:
            np.random.seed(opts.seed)

        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        log.debug(opts)

        # Check input arguments
        if not opts.cpg_profiles:
            if not (opts.pos_file or opts.dna_files):
                raise ValueError('Position table and DNA database expected!')

        if opts.dna_wlen and opts.dna_wlen % 2 == 0:
            raise ValueError('--dna_wlen must be odd!')
        if opts.cpg_wlen and opts.cpg_wlen % 2 != 0:
            raise ValueError('--cpg_wlen must be even!')

        # Parse functions for computing output statistics
        cpg_stats_meta = None
        win_stats_meta = None
        if opts.cpg_stats:
            cpg_stats_meta = get_stats_meta(opts.cpg_stats)
        if opts.win_stats:
            win_stats_meta = get_stats_meta(opts.win_stats)

        make_dir(opts.out_dir)
        outputs = OrderedDict()

        # Read single-cell profiles if provided
        if opts.cpg_profiles:
            log.info('Reading CpG profiles ...')
            outputs['cpg'] = read_cpg_profiles(
                opts.cpg_profiles,
                chromos=opts.chromos,
                nb_sample=opts.nb_sample,
                nb_sample_chromo=opts.nb_sample_chromo,
                log=log.info)

        # Create table with unique positions
        if opts.pos_file:
            # Read positions from file
            log.info('Reading position table ...')
            pos_table = pd.read_table(opts.pos_file, usecols=[0, 1],
                                      dtype={0: str, 1: np.int32},
                                      header=None, comment='#')
            pos_table.columns = ['chromo', 'pos']
            pos_table['chromo'] = dat.format_chromo(pos_table['chromo'])
            pos_table = prepro_pos_table(pos_table)
        else:
            # Extract positions from profiles
            pos_tables = []
            for cpg_table in list(outputs['cpg'].values()):
                pos_tables.append(cpg_table[['chromo', 'pos']])
            pos_table = prepro_pos_table(pos_tables)

        if opts.chromos:
            pos_table = pos_table.loc[pos_table.chromo.isin(opts.chromos)]
        if opts.nb_sample_chromo:
            pos_table = dat.sample_from_chromo(pos_table, opts.nb_sample_chromo)
        if opts.nb_sample:
            pos_table = pos_table.iloc[:opts.nb_sample]

        log.info('%d samples' % len(pos_table))

        make_dir(opts.out_dir)

        # Iterate over chromosomes
        # ------------------------
        for chromo in pos_table.chromo.unique():
            log.info('-' * 80)
            log.info('Chromosome %s ...' % (chromo))
            idx = pos_table.chromo == chromo
            chromo_pos = pos_table.loc[idx].pos.values
            chromo_outputs = OrderedDict()

            if 'cpg' in outputs:
                # Concatenate CpG tables into single nb_site x nb_output matrix
                chromo_outputs['cpg'] = map_cpg_tables(outputs['cpg'],
                                                       chromo, chromo_pos)
                chromo_outputs['cpg_mat'] = np.vstack(
                    list(chromo_outputs['cpg'].values())).T
                assert len(chromo_outputs['cpg_mat']) == len(chromo_pos)

            if 'cpg_mat' in chromo_outputs and opts.cpg_cov:
                cov = np.sum(chromo_outputs['cpg_mat'] != dat.CPG_NAN, axis=1)
                assert np.all(cov >= 1)
                idx = cov >= opts.cpg_cov
                tmp = '%s sites matched minimum coverage filter'
                tmp %= format_out_of(idx.sum(), len(idx))
                log.info(tmp)
                if idx.sum() == 0:
                    continue

                chromo_pos = chromo_pos[idx]
                chromo_outputs = select_dict(chromo_outputs, idx)

            # Read DNA of chromosome
            chromo_dna = None
            if opts.dna_files:
                chromo_dna = fasta.read_chromo(opts.dna_files, chromo)

            annos = None
            if opts.anno_files:
                log.info('Annotating CpG sites ...')
                annos = dict()
                for anno_file in opts.anno_files:
                    name = split_ext(anno_file)
                    annos[name] = annotate(anno_file, chromo, chromo_pos)

            # Iterate over chunks
            # -------------------
            nb_chunk = int(np.ceil(len(chromo_pos) / opts.chunk_size))
            for chunk in range(nb_chunk):
                log.info('Chunk \t%d / %d' % (chunk + 1, nb_chunk))
                chunk_start = chunk * opts.chunk_size
                chunk_end = min(len(chromo_pos), chunk_start + opts.chunk_size)
                chunk_idx = slice(chunk_start, chunk_end)
                chunk_pos = chromo_pos[chunk_idx]

                chunk_outputs = select_dict(chromo_outputs, chunk_idx)

                filename = 'c%s_%06d-%06d.h5' % (chromo, chunk_start, chunk_end)
                filename = os.path.join(opts.out_dir, filename)
                chunk_file = h5.File(filename, 'w')

                # Write positions
                chunk_file.create_dataset('chromo', shape=(len(chunk_pos),),
                                          dtype='S2')
                chunk_file['chromo'][:] = chromo.encode()
                chunk_file.create_dataset('pos', data=chunk_pos, dtype=np.int32)

                if len(chunk_outputs):
                    out_group = chunk_file.create_group('outputs')

                # Write cpg profiles
                if 'cpg' in chunk_outputs:
                    for name, value in six.iteritems(chunk_outputs['cpg']):
                        assert len(value) == len(chunk_pos)
                        # Round continuous values
                        out_group.create_dataset('cpg/%s' % name,
                                                 data=value.round(),
                                                 dtype=np.int8,
                                                 compression='gzip')
                    # Compute and write statistics
                    if cpg_stats_meta is not None:
                        log.info('Computing per CpG statistics ...')
                        cpg_mat = np.ma.masked_values(chunk_outputs['cpg_mat'],
                                                      dat.CPG_NAN)
                        mask = np.sum(~cpg_mat.mask, axis=1)
                        mask = mask < opts.cpg_stats_cov
                        for name, fun in six.iteritems(cpg_stats_meta):
                            stat = fun[0](cpg_mat).data.astype(fun[1])
                            stat[mask] = dat.CPG_NAN
                            assert len(stat) == len(chunk_pos)
                            out_group.create_dataset('cpg_stats/%s' % name,
                                                     data=stat,
                                                     dtype=fun[1],
                                                     compression='gzip')

                # Write input features
                in_group = chunk_file.create_group('inputs')

                # DNA windows
                if chromo_dna:
                    log.info('Extracting DNA sequence windows ...')
                    dna_wins = extract_seq_windows(chromo_dna, pos=chunk_pos,
                                                   wlen=opts.dna_wlen)
                    assert len(dna_wins) == len(chunk_pos)
                    in_group.create_dataset('dna', data=dna_wins, dtype=np.int8,
                                            compression='gzip')

                # CpG neighbors
                if opts.cpg_wlen:
                    log.info('Extracting CpG neighbors ...')
                    cpg_ext = fext.KnnCpgFeatureExtractor(opts.cpg_wlen // 2)
                    context_group = in_group.create_group('cpg')
                    # outputs['cpg'], since neighboring CpG sites might lie
                    # outside chunk borders and un-mapped values are needed
                    for name, cpg_table in six.iteritems(outputs['cpg']):
                        cpg_table = cpg_table.loc[cpg_table.chromo == chromo]
                        state, dist = cpg_ext.extract(chunk_pos,
                                                      cpg_table.pos.values,
                                                      cpg_table.value.values)
                        nan = np.isnan(state)
                        state[nan] = dat.CPG_NAN
                        dist[nan] = dat.CPG_NAN
                        # States can be binary (np.int8) or continuous
                        # (np.float32).
                        state = state.astype(cpg_table.value.dtype, copy=False)
                        dist = dist.astype(np.float32, copy=False)

                        assert len(state) == len(chunk_pos)
                        assert len(dist) == len(chunk_pos)
                        assert np.all((dist > 0) | (dist == dat.CPG_NAN))

                        group = context_group.create_group(name)
                        group.create_dataset('state', data=state,
                                             compression='gzip')
                        group.create_dataset('dist', data=dist,
                                             compression='gzip')

                if win_stats_meta is not None and opts.cpg_wlen:
                    log.info('Computing window-based statistics ...')
                    states = []
                    dists = []
                    cpg_states = []
                    cpg_group = out_group['cpg']
                    context_group = in_group['cpg']
                    for output_name in six.iterkeys(cpg_group):
                        state = context_group[output_name]['state'][()]
                        states.append(np.expand_dims(state, 2))
                        dist = context_group[output_name]['dist'][()]
                        dists.append(np.expand_dims(dist, 2))
                        cpg_states.append(cpg_group[output_name][()])
                    # samples x outputs x cpg_wlen
                    states = np.swapaxes(np.concatenate(states, axis=2), 1, 2)
                    dists = np.swapaxes(np.concatenate(dists, axis=2), 1, 2)
                    cpg_states = np.expand_dims(np.vstack(cpg_states).T, 2)
                    cpg_dists = np.zeros_like(cpg_states)
                    states = np.concatenate([states, cpg_states], axis=2)
                    dists = np.concatenate([dists, cpg_dists], axis=2)

                    for wlen in opts.win_stats_wlen:
                        idx = (states == dat.CPG_NAN) | (dists > wlen // 2)
                        states_wlen = np.ma.masked_array(states, idx)
                        group = out_group.create_group('win_stats/%d' % wlen)
                        for name, fun in six.iteritems(win_stats_meta):
                            stat = fun[0](states_wlen)
                            if hasattr(stat, 'mask'):
                                idx = stat.mask
                                stat = stat.data
                                if np.sum(idx):
                                    stat[idx] = dat.CPG_NAN
                            group.create_dataset(name, data=stat, dtype=fun[1],
                                                 compression='gzip')

                if annos:
                    log.info('Adding annotations ...')
                    group = in_group.create_group('annos')
                    for name, anno in six.iteritems(annos):
                        group.create_dataset(name, data=anno[chunk_idx],
                                             dtype='int8',
                                             compression='gzip')

                chunk_file.close()

        log.info('Done!')
        return 0
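
Each chromosome's positions are split into chunks of at most --chunk_size sites, and every chunk is written to its own HDF5 file. The sketch below reproduces just that index arithmetic with made-up numbers; the chunk size of 32768 is the default mentioned in the comments of Example #2, and the position array is hypothetical.

import numpy as np

chunk_size = 32768                # default chunk size noted in the comments above
chromo_pos = np.arange(1, 70001)  # hypothetical positions of one chromosome (70000 sites)

# Same arithmetic as the chunk loop above: ceil division gives the number of
# chunks, and each chunk is a contiguous slice of at most chunk_size positions.
nb_chunk = int(np.ceil(len(chromo_pos) / chunk_size))
for chunk in range(nb_chunk):
    chunk_start = chunk * chunk_size
    chunk_end = min(len(chromo_pos), chunk_start + chunk_size)
    chunk_pos = chromo_pos[chunk_start:chunk_end]
    print('c1_%06d-%06d.h5: %d sites' % (chunk_start, chunk_end, len(chunk_pos)))
# -> c1_000000-032768.h5: 32768 sites
#    c1_032768-065536.h5: 32768 sites
#    c1_065536-070000.h5: 4464 sites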