Example #1
import pandas as pd
from pybedtools import BedTool


def _region_bed_sorted(bed_path, g, bed_sorted):
    # parse_chrom_size (project helper, sketched below) maps chrom name -> length
    chrom_sizes = parse_chrom_size(g)

    bed_df = pd.read_csv(bed_path, sep="\t", index_col=None, header=None)
    # select chroms that exist in g
    bed_df = bed_df.loc[bed_df.iloc[:, 0].isin(chrom_sizes.keys())]
    bed = BedTool.from_dataframe(bed_df)

    if bed_sorted:
        return bed
    else:
        return bed.sort(g=g)
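
The helper shared by every example on this page is parse_chrom_size, which reads a UCSC-style two-column chrom.sizes file (tab-separated chromosome name and length) into a mapping of chromosome name to length. Below is a minimal sketch of such a helper under that file-format assumption; the project's real implementation may accept extra options:

from collections import OrderedDict


def parse_chrom_size(path, remove_chr_list=None):
    # read "<chrom>\t<size>" lines into an OrderedDict {chrom: int(size)},
    # skipping any chromosome listed in remove_chr_list
    remove_chr_list = remove_chr_list or []
    chrom_sizes = OrderedDict()
    with open(path) as f:
        for line in f:
            chrom, size = line.strip().split('\t')[:2]
            if chrom not in remove_chr_list:
                chrom_sizes[chrom] = int(size)
    return chrom_sizes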
Example #2
    def get_gene_promoter(self,
                          gene,
                          chrom_sizes_path,
                          slop=500,
                          transcript_featuretype='transcript'):
        tss_list = self.get_gene_tss(
            gene, transcript_featuretype=transcript_featuretype)

        chrom_sizes = parse_chrom_size(chrom_sizes_path)

        # GTF coordinates are 1-based
        promoter_list = []
        for tss in tss_list:
            promoter = copy(tss)

            # clamp the promoter window to the chromosome bounds
            promoter.start = max(promoter.start - slop, 0)
            promoter.end = min(promoter.end + slop,
                               chrom_sizes[promoter.chrom])

            promoter.featuretype = 'promoter'
            promoter_list.append(promoter)
        return promoter_list
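
A hedged usage sketch: anno stands in for an instance of whatever annotation class this method belongs to (a hypothetical name here), and the gene name and chrom-sizes path are placeholders:

# hypothetical instance, gene, and path, for illustration only
promoters = anno.get_gene_promoter('Gad1',
                                   chrom_sizes_path='mm10.chrom.sizes',
                                   slop=500)
for p in promoters:
    print(p.chrom, p.start, p.end, p.featuretype)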
Example #3
import pandas as pd
import pysam


def dataframe_to_allc(table,
                      add_chr=False,
                      chrom=0,
                      pos=1,
                      strand=None,
                      context=None,
                      fasta_path=None,
                      chrom_sizes=None,
                      mc=None,
                      uc=None,
                      cov=None,
                      mc_frac=None,
                      pseudo_count=1,
                      num_upstream_bases=0,
                      num_downstream_bases=2):
    # relabel columns with positional integers so they can be selected by index
    table.columns = list(range(table.shape[1]))

    # check genome coords
    if (chrom is None) or (pos is None):
        raise ValueError('Must provide chrom and pos')
    if (strand is None) or (context is None):
        if fasta_path is None:
            raise ValueError('Must provide fasta_path when strand or '
                             'context is None; the genome FASTA is needed '
                             'to parse strand and context.')
    if chrom_sizes is not None:
        chroms = set(parse_chrom_size(chrom_sizes).keys())
    elif fasta_path is not None:
        with pysam.FastaFile(fasta_path) as fasta:
            chroms = set(fasta.references)
    else:
        chroms = None

    # check allc values
    if (mc is not None) and (cov is not None):
        value_conversion_func = mode_mc_cov
        mode = 'mc+cov'
    elif (mc is not None) and (uc is not None):
        value_conversion_func = mode_mc_uc
        mode = 'mc+uc'
    elif (uc is not None) and (cov is not None):
        value_conversion_func = mode_uc_cov
        mode = 'uc+cov'
    elif (mc_frac is not None) and (cov is not None):
        value_conversion_func = mode_mc_frac_cov
        mode = 'mc_frac+cov'
    elif (mc_frac is not None) and (mc is not None):
        value_conversion_func = mode_mc_frac_mc
        mode = 'mc_frac+mc'
    elif (mc_frac is not None) and (uc is not None):
        value_conversion_func = mode_mc_frac_uc
        mode = 'mc_frac+uc'
    elif (mc_frac is not None) and (pseudo_count is not None):
        value_conversion_func = mode_mc_frac_pseudo_count
        mode = 'mc_frac+pseudo_count'
    else:
        modes = [
            'mc+cov', 'mc+uc', 'uc+cov', 'mc_frac+cov', 'mc_frac+mc',
            'mc_frac+uc', 'mc_frac+pseudo_count'
        ]
        raise ValueError(
            f'Must provide at least one of these column combinations: {modes}')
    # print(f'Using mode {mode} to get cytosine base counts.')

    # add chr to chrom names or not, user specify
    if add_chr:
        table[chrom] = 'chr' + table[chrom].astype(str)

    # select chroms
    if chroms is not None:
        table = table[table[chrom].isin(chroms)].copy()

    # get chrom, pos, strand, context, if needed, parse from fasta
    genome_coords = get_strand_and_context(
        table=table,
        chrom=chrom,
        pos=pos,
        strand=strand,
        context=context,
        fasta_path=fasta_path,
        num_upstream_bases=num_upstream_bases,
        num_downstream_bases=num_downstream_bases)

    # calculate mc, cov, frac (last col)
    if mode == 'mc+cov':
        values = value_conversion_func(table, mc, cov)
    elif mode == 'mc+uc':
        values = value_conversion_func(table, mc, uc)
    elif mode == 'uc+cov':
        values = value_conversion_func(table, uc, cov)
    elif mode == 'mc_frac+cov':
        values = value_conversion_func(table, mc_frac, cov)
    elif mode == 'mc_frac+mc':
        values = value_conversion_func(table, mc_frac, mc)
    elif mode == 'mc_frac+uc':
        values = value_conversion_func(table, mc_frac, uc)
    elif mode == 'mc_frac+pseudo_count':
        values = value_conversion_func(table, mc_frac, pseudo_count)
    else:
        # unreachable: invalid combinations were rejected above
        raise ValueError(f'Unhandled mode {mode}')

    # final allc table
    allc = pd.concat([genome_coords, values], axis=1)
    allc.columns = ['chrom', 'pos', 'strand', 'context', 'mc', 'cov', 'p']
    return allc
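
A hedged usage sketch, assuming the input table already carries strand and context (so no FASTA is needed) and using integer column positions; the chrom-sizes path is a placeholder:

# illustrative input columns: chrom, pos, strand, context, mc, cov
df = pd.DataFrame([
    ['chr1', 3000827, '+', 'CGG', 3, 5],
    ['chr1', 3001007, '+', 'CAG', 0, 4],
])
allc = dataframe_to_allc(df,
                         chrom=0, pos=1, strand=2, context=3,
                         mc=4, cov=5,
                         chrom_sizes='mm10.chrom.sizes')  # hypothetical path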
Example #4
import subprocess

import pandas as pd
import pyBigWig


def fragments_to_bigwig(output_prefix,
                        cluster,
                        chrom_size_path,
                        bw_bin_size=10,
                        scale=None):
    # concat bed
    subprocess.run(
        f'cat {output_prefix}_*_{cluster}.bed > {output_prefix}_{cluster}.bed && '
        f'rm -f {output_prefix}_*_{cluster}.bed',
        shell=True,
        check=True)
    # sort bed
    subprocess.run(
        f'sort -k 1,1 -k2,2n {output_prefix}_{cluster}.bed '
        f'> {output_prefix}_{cluster}.sorted.bed && '
        f'rm -f {output_prefix}_{cluster}.bed',
        shell=True,
        check=True)
    # bed to bedgraph
    subprocess.run(
        f'bedtools genomecov -i {output_prefix}_{cluster}.sorted.bed '
        f'-g {chrom_size_path} -bg > {output_prefix}_{cluster}.bedgraph',
        shell=True,
        check=True)
    bg_path = f'{output_prefix}_{cluster}.bedgraph'
    bw_path = f'{output_prefix}_{cluster}.bw'

    bg_iter = pd.read_csv(bg_path,
                          sep='\t',
                          header=None,
                          names=['chrom', 'start', 'end', 'count'],
                          chunksize=100000)
    total_wigs = []
    for bg in bg_iter:
        # assign each interval start to a fixed-size bin
        bg['start'] = bg['start'] // bw_bin_size
        wig_values = bg.groupby(['chrom',
                                 'start'])['count'].mean().reset_index()
        total_wigs.append(wig_values)
    # merge per-chunk results; bins split across chunk boundaries are averaged again
    total_wigs = pd.concat(total_wigs)
    total_wigs = total_wigs.groupby(['chrom',
                                     'start'])['count'].mean().reset_index()

    chrom_sizes = parse_chrom_size(chrom_size_path)
    chrom_sizes_list = list(chrom_sizes.items())

    with pyBigWig.open(bw_path, 'w') as bw_out:
        bw_out.addHeader(chrom_sizes_list)
        for chrom in chrom_sizes.keys():
            chrom_df = total_wigs[total_wigs['chrom'] == chrom].sort_values(
                'start')
            if chrom_df.shape[0] == 0:
                continue
            if scale is None:
                values = chrom_df['count'].astype(float)
            else:
                values = chrom_df['count'].astype(float) / scale
            bw_out.addEntries(chrom,
                              (chrom_df['start'] * bw_bin_size).tolist(),
                              values=values.tolist(),
                              span=bw_bin_size)

    # remove bed graph
    subprocess.run(f'rm -f {bg_path}', shell=True, check=True)
    # gzip bed
    subprocess.run(f'gzip {output_prefix}_{cluster}.sorted.bed',
                   shell=True,
                   check=True)
    return
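
A minimal usage sketch with placeholder names; the function expects per-batch fragment BED files named {output_prefix}_*_{cluster}.bed to already exist on disk:

# hypothetical prefix, cluster name, and chrom sizes path
fragments_to_bigwig(output_prefix='atac/out',
                    cluster='cluster_0',
                    chrom_size_path='mm10.chrom.sizes',
                    bw_bin_size=10)
# writes atac/out_cluster_0.bw and atac/out_cluster_0.sorted.bed.gz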
Example #5
import pathlib
import shlex
import subprocess

import pandas as pd
from pybedtools import BedTool, cleanup


# open_gz (a gzip-aware text opener) and parse_chrom_size are project helpers
def get_fasta(bed_file_paths,
              fasta_path,
              output_path,
              slop_b=None,
              chrom_size_path=None,
              use_region_name=False,
              cpu=1,
              sort_mem_gbs=1,
              standard_length=None,
              merge=False,
              sample_region=None,
              seed=1):
    """
    Extract genome sequence fasta using bed files

    Parameters
    ----------
    bed_file_paths
    fasta_path
    output_path
    slop_b
    chrom_size_path
    use_region_name
        If region names provided in the fourth column of bed file:
            if True: use region name as seq name
            else: use chr:start-end as seq name
    cpu
    sort_mem_gbs
    standard_length
    merge
    sample_region
    seed

    Returns
    -------

    """
    chrom_dict = None
    if chrom_size_path is not None:
        chrom_dict = parse_chrom_size(chrom_size_path)

    if isinstance(bed_file_paths, str):
        bed_file_paths = [bed_file_paths]
    output_path = str(pathlib.Path(output_path).resolve())

    temp_bed = output_path + '.tmp_input.bed'
    with open(temp_bed, 'w') as temp_f:
        for bed_file_path in bed_file_paths:
            if str(bed_file_path).endswith('gz'):
                opener = open_gz
            else:
                opener = open
            with opener(bed_file_path) as f:
                if standard_length is None:
                    temp_f.write(f.read())
                else:
                    if chrom_dict is None:
                        raise ValueError(
                            'chrom_size_path cannot be None when standard_length is not None'
                        )
                    half = int(standard_length / 2)
                    for line in f:
                        ll = line.strip().split('\t')
                        center = (int(ll[1]) + int(ll[2])) / 2
                        ll[1] = str(int(max(center - half, 0)))
                        ll[2] = str(int(min(center + half, chrom_dict[ll[0]])))
                        temp_f.write('\t'.join(ll) + '\n')

    sorted_temp = output_path + '.tmp_sorted.bed'
    try:
        subprocess.run(shlex.split(
            f'sort -k1,1 -k2,2n --parallel={cpu} -S {sort_mem_gbs}G {temp_bed} -o {sorted_temp}'
        ),
                       stderr=subprocess.PIPE,
                       stdout=subprocess.PIPE,
                       encoding='utf8',
                       check=True)
    except subprocess.CalledProcessError:
        # older sort versions do not support the --parallel option
        print('running sort without --parallel')
        try:
            subprocess.run(shlex.split(
                f'sort -k1,1 -k2,2n -S {sort_mem_gbs}G {temp_bed} -o {sorted_temp}'
            ),
                           stderr=subprocess.PIPE,
                           stdout=subprocess.PIPE,
                           encoding='utf8',
                           check=True)
        except subprocess.CalledProcessError as e:
            print(e.stderr)
            raise e

    sorted_bed = BedTool(sorted_temp)
    if slop_b:
        if chrom_size_path is None:
            raise ValueError(
                'chrom_size_path cannot be None when slop_b is not None')
        sorted_bed = sorted_bed.slop(b=slop_b, g=chrom_size_path)

    merged_temp = output_path + '.tmp_merge.bed'
    if merge:
        if use_region_name:
            print('cannot use region names when merge=True, '
                  'falling back to coordinate names')
            use_region_name = False
        sorted_bed.merge().moveto(merged_temp)
    else:
        sorted_bed.moveto(merged_temp)

    if sample_region is not None:
        bed_df = pd.read_csv(merged_temp, header=None, sep='\t')
        if sample_region <= bed_df.shape[0]:
            bed_df = bed_df.sample(sample_region, random_state=seed)
        bed_df.to_csv(merged_temp, sep='\t', index=False, header=False)

    name_option = '-name' if use_region_name else ''
    subprocess.run(shlex.split(
        f'bedtools getfasta -fi {fasta_path} -bed {merged_temp} -fo {output_path} {name_option}'
    ),
                   stderr=subprocess.PIPE,
                   stdout=subprocess.PIPE,
                   encoding='utf8',
                   check=True)

    subprocess.run(
        shlex.split(f'rm -f {temp_bed} {sorted_temp} {merged_temp}'))
    cleanup()
    return output_path
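
A minimal usage sketch with placeholder paths, resizing every region to 500 bp around its center (which requires the chrom sizes file) and merging overlaps before extraction:

# hypothetical input and output paths, for illustration only
get_fasta(bed_file_paths=['peaks.bed.gz'],
          fasta_path='mm10.fa',
          output_path='peaks.fa',
          chrom_size_path='mm10.chrom.sizes',
          standard_length=500,
          merge=True)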
Example #6
import pathlib

import pandas as pd
import pybedtools


# Quant and ALLOW_QUANT_TYPES are defined elsewhere in the host project
def determine_datasets(regions, quantifiers, chrom_size_path, tmp_dir):
    tmp_dir = pathlib.Path(tmp_dir).absolute()
    tmp_dir.mkdir(exist_ok=True, parents=True)

    chrom_sizes = parse_chrom_size(chrom_size_path)
    datasets = {}
    for pair in regions:
        if len(pair) != 2:
            raise ValueError(
                f'Cannot parse {pair} in regions parameter: {regions}')
        name, region_path = pair
        # prepare regions
        if isinstance(region_path, (str, pathlib.Path)) \
                and pathlib.Path(region_path).exists():
            region_bed_df = pd.read_csv(region_path, sep='\t', header=None)
            # remove additional chroms that do not occur in chrom_sizes
            region_bed_df = region_bed_df[region_bed_df.iloc[:, 0].isin(
                chrom_sizes)]
            # sort chroms
            region_bed_df = pybedtools.BedTool.from_dataframe(
                region_bed_df).sort(g=chrom_size_path).to_dataframe()

            if region_bed_df.shape[1] == 3:
                # add an ID column
                print(
                    region_path,
                    'does not have IDs in its fourth column; adding them automatically. '
                    'If this is not desired, add a fourth column containing UNIQUE IDs to the BED file.'
                )
                region_bed_df[name] = [
                    f'{name}_{i}' for i in range(region_bed_df.shape[0])
                ]
            # check that region IDs are unique
            if region_bed_df.iloc[:, 3].duplicated().sum() > 0:
                raise ValueError(
                    f'Region IDs in {region_path} (fourth column) are not unique.'
                )
            # finally, set ID as index and only take the first three columns
            region_bed_df = region_bed_df.iloc[:, [0, 1, 2, 3]].set_index(
                region_bed_df.columns[3])
        else:
            try:
                # if region is int, generate chrom bins with bedtools
                region_size = int(region_path)
                region_bed_df = pybedtools.BedTool().makewindows(
                    g=chrom_size_path, w=region_size,
                    s=region_size).to_dataframe()

                # set region index
                _dfs = []
                for chrom, chrom_df in region_bed_df.groupby('chrom'):
                    chrom_df = chrom_df.reset_index(drop=True)
                    chrom_df.index = chrom_df.index.map(
                        lambda i: f'{chrom}_{i}')
                    _dfs.append(chrom_df)
                region_bed_df = pd.concat(_dfs)

            except ValueError:
                raise ValueError(
                    f'Can not understand region specification {region_path}')
        region_path = f'{tmp_dir}/{name}.regions.csv'
        region_bed_df.to_csv(region_path)
        datasets[name] = {'regions': region_path, 'quant': []}

    for quantifier in quantifiers:
        if len(quantifier) < 3:
            raise ValueError(
                f'Quantifier must have at least three parts: '
                f'["NAME", "QUANT_TYPE", "MC_TYPE", "OTHER_KWARGS"], '
                f'where "OTHER_KWARGS" is optional. '
                f'Got {quantifier}')
        name, quant_type, mc_types, *other_kwargs = quantifier
        if name not in datasets:
            raise KeyError(
                f'Name {name} occurs in quantifiers but is not found in regions.')
        kwargs = {}
        for kv in other_kwargs:
            k, v = kv.split('=')
            try:
                kwargs[k] = float(v)
            except ValueError:
                kwargs[k] = v

        # prepare mc_types
        mc_types = [i.strip() for i in mc_types.split(',')]
        # validate quant_type
        if quant_type not in ALLOW_QUANT_TYPES:
            raise ValueError(
                f'QUANT_TYPE must be in {ALLOW_QUANT_TYPES}, got {quant_type} in {quantifier}.'
            )
        datasets[name]['quant'].append(
            Quant(mc_types=mc_types, quant_type=quant_type, kwargs=kwargs))
    return datasets
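
A minimal usage sketch: regions may mix integer bin sizes and BED file paths, and every quantifier refers back to a region set by name. The paths and the 'count' quant type below are placeholders; valid quant types come from the project's ALLOW_QUANT_TYPES:

# hypothetical inputs, for illustration only
datasets = determine_datasets(
    regions=[('chrom100k', 100000), ('peaks', 'peaks.bed')],
    quantifiers=[('chrom100k', 'count', 'CGN,CHN'),
                 ('peaks', 'count', 'CGN', 'alpha=0.05')],
    chrom_size_path='mm10.chrom.sizes',
    tmp_dir='quant_tmp')
# each entry maps a name to its region table path and a list of Quant specs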