def _region_bed_sorted(bed_path, g, bed_sorted):
    """Load a BED file as a BedTool restricted to chroms present in ``g``.

    Parameters
    ----------
    bed_path : path to the input BED file.
    g : chrom-size file understood by ``parse_chrom_size`` / bedtools.
    bed_sorted : if True, trust that the input is already sorted and skip
        the bedtools sort step.

    Returns
    -------
    BedTool with records on known chromosomes only.
    """
    chrom_sizes = parse_chrom_size(g)
    bed_df = pd.read_csv(bed_path, sep="\t", index_col=None, header=None)
    # drop records whose chromosome is absent from the genome file
    keep_rows = bed_df.iloc[:, 0].isin(chrom_sizes.keys())
    bed = BedTool.from_dataframe(bed_df.loc[keep_rows])
    # honor the caller's claim that the input is already coordinate-sorted
    return bed if bed_sorted else bed.sort(g=g)
def get_gene_promoter(self, gene, chrom_sizes_path, slop=500,
                      transcript_featuretype='transcript'):
    """Return promoter features for a gene: each TSS extended by ``slop``
    bp on both sides, clamped to the chromosome boundaries.

    Parameters
    ----------
    gene : gene identifier passed through to ``get_gene_tss``.
    chrom_sizes_path : chrom-size file used to clamp the promoter end.
    slop : bp to extend on each side of the TSS (default 500).
    transcript_featuretype : GTF featuretype used to locate transcripts.

    Returns
    -------
    list of features with ``featuretype`` set to 'promoter'.
    """
    tss_list = self.get_gene_tss(
        gene, transcript_featuretype=transcript_featuretype)
    chrom_sizes = parse_chrom_size(chrom_sizes_path)
    # gtf is 1 based
    promoter_list = []
    for tss in tss_list:
        promoter = copy(tss)
        promoter.start = max(promoter.start - slop, 0)
        # BUG FIX: clamp the end to the chromosome length with min();
        # the original used max(), which always pushed the end to the
        # chromosome edge instead of capping it there.
        promoter.end = min(promoter.end + slop, chrom_sizes[promoter.chrom])
        promoter.featuretype = 'promoter'
        promoter_list.append(promoter)
    return promoter_list
def dataframe_to_allc(table, add_chr=False, chrom=0, pos=1, strand=None,
                      context=None, fasta_path=None, chrom_sizes=None,
                      mc=None, uc=None, cov=None, mc_frac=None,
                      pseudo_count=1, num_upstream_bases=0,
                      num_downstream_bases=2):
    """Convert a generic methylation table into an ALLC-format DataFrame.

    Parameters
    ----------
    table : DataFrame whose columns are addressed positionally.
    add_chr : prepend 'chr' to chromosome names before filtering.
    chrom, pos : column positions of chromosome and position (required).
    strand, context : column positions; if either is None they are parsed
        from the genome FASTA, so ``fasta_path`` becomes required.
    fasta_path : genome FASTA path.
    chrom_sizes : chrom-size file; its chroms (or, failing that, the FASTA
        references) form a whitelist used to drop rows.
    mc, uc, cov, mc_frac : column positions of methylated counts,
        unmethylated counts, coverage and methylation fraction. Exactly one
        supported combination must be provided (see error message below).
    pseudo_count : assumed coverage when only ``mc_frac`` is given.
    num_upstream_bases, num_downstream_bases : context window around the C.

    Returns
    -------
    DataFrame with columns
    ['chrom', 'pos', 'strand', 'context', 'mc', 'cov', 'p'].
    """
    # make column labels positional integers so the user-supplied column
    # indices address them directly
    table.columns = list(range(table.shape[1]))

    # genome coordinates are mandatory
    if (chrom is None) or (pos is None):
        raise ValueError('Must provide chrom and pos')
    if (strand is None) or (context is None):
        if fasta_path is None:
            raise ValueError('Must provide fasta_path if strand or '
                             'context is None, need to use the genome '
                             'FASTA to parse strand or context.')

    # build the chromosome whitelist (None means keep every row)
    if chrom_sizes is not None:
        chroms = set(parse_chrom_size(chrom_sizes).keys())
    elif fasta_path is not None:
        with pysam.FastaFile(fasta_path) as fasta:
            chroms = set(fasta.references)
    else:
        chroms = None

    # Pick the value-conversion mode. The argument columns are recorded
    # alongside the function so a single call below replaces the duplicated
    # if/elif chain the original used to re-dispatch on the mode string.
    if (mc is not None) and (cov is not None):
        value_conversion_func, conversion_args = mode_mc_cov, (mc, cov)
        mode = 'mc+cov'
    elif (mc is not None) and (uc is not None):
        value_conversion_func, conversion_args = mode_mc_uc, (mc, uc)
        mode = 'mc+uc'
    elif (uc is not None) and (cov is not None):
        value_conversion_func, conversion_args = mode_uc_cov, (uc, cov)
        mode = 'uc+cov'
    elif (mc_frac is not None) and (cov is not None):
        value_conversion_func, conversion_args = mode_mc_frac_cov, (mc_frac, cov)
        mode = 'mc_frac+cov'
    elif (mc_frac is not None) and (mc is not None):
        value_conversion_func, conversion_args = mode_mc_frac_mc, (mc_frac, mc)
        mode = 'mc_frac+mc'
    elif (mc_frac is not None) and (uc is not None):
        value_conversion_func, conversion_args = mode_mc_frac_uc, (mc_frac, uc)
        mode = 'mc_frac+uc'
    elif (mc_frac is not None) and (pseudo_count is not None):
        value_conversion_func = mode_mc_frac_pseudo_count
        conversion_args = (mc_frac, pseudo_count)
        mode = 'mc_frac+pseudo_count'
    else:
        modes = [
            'mc+cov', 'mc+uc', 'uc+cov', 'mc_frac+cov', 'mc_frac+mc',
            'mc_frac+uc', 'mc_frac+pseudo_count'
        ]
        raise ValueError(
            f'Need to provide one of these combinations in minimum: {modes}')
    # print(f'Using mode {mode} to get cytosine base counts.')

    # add chr to chrom names or not, user specify
    if add_chr:
        table[chrom] = 'chr' + table[chrom].astype(str)
    # drop rows on chromosomes outside the whitelist
    if chroms is not None:
        table = table[table[chrom].isin(chroms)].copy()

    # get chrom, pos, strand, context; if needed, parse from fasta
    genome_coords = get_strand_and_context(
        table=table,
        chrom=chrom,
        pos=pos,
        strand=strand,
        context=context,
        fasta_path=fasta_path,
        num_upstream_bases=num_upstream_bases,
        num_downstream_bases=num_downstream_bases)

    # calculate mc, cov, frac (last col) with the mode chosen above
    values = value_conversion_func(table, *conversion_args)

    # final allc table
    allc = pd.concat([genome_coords, values], axis=1)
    allc.columns = ['chrom', 'pos', 'strand', 'context', 'mc', 'cov', 'p']
    return allc
def fragments_to_bigwig(output_prefix, cluster, chrom_size_path,
                        bw_bin_size=10, scale=None):
    """Merge per-part fragment BED files for a cluster into a binned bigwig.

    Pipeline: cat parts -> sort -> ``bedtools genomecov`` bedgraph ->
    per-bin mean coverage -> bigwig; the bedgraph is removed and the
    sorted BED is gzipped at the end.

    Parameters
    ----------
    output_prefix, cluster : used to locate ``{prefix}_*_{cluster}.bed``
        part files and to name all outputs.
    chrom_size_path : chrom-size file for genomecov and the bigwig header.
    bw_bin_size : bigwig bin width in bp (default 10).
    scale : if given, divide bin values by this factor.
    """
    # concat part beds into one file, then delete the parts
    subprocess.run(
        f'cat {output_prefix}_*_{cluster}.bed > {output_prefix}_{cluster}.bed && '
        f'rm -f {output_prefix}_*_{cluster}.bed',
        shell=True, check=True)
    # sort bed
    subprocess.run(
        f'sort -k 1,1 -k2,2n {output_prefix}_{cluster}.bed '
        f'> {output_prefix}_{cluster}.sorted.bed && '
        f'rm -f {output_prefix}_{cluster}.bed',
        shell=True, check=True)
    # bed to bedgraph
    subprocess.run(
        f'bedtools genomecov -i {output_prefix}_{cluster}.sorted.bed '
        f'-g {chrom_size_path} -bg > {output_prefix}_{cluster}.bedgraph',
        shell=True, check=True)

    bg_path = f'{output_prefix}_{cluster}.bedgraph'
    bw_path = f'{output_prefix}_{cluster}.bw'
    bg_iter = pd.read_csv(bg_path, sep='\t', header=None,
                          names=['chrom', 'start', 'end', 'count'],
                          chunksize=100000)
    # BUG FIX: the original computed per-chunk bin means and then averaged
    # those means again, which is wrong for bins whose bedgraph rows
    # straddle a chunk boundary (mean of means != mean). Accumulate the
    # per-bin sum and row count instead, then divide once at the end.
    partial_aggs = []
    for bg in bg_iter:
        bg['start'] = bg['start'] // bw_bin_size
        partial_aggs.append(
            bg.groupby(['chrom', 'start'])['count'].agg(['sum', 'size']))
    totals = pd.concat(partial_aggs).groupby(['chrom', 'start']).sum()
    total_wigs = (totals['sum'] / totals['size']).rename('count').reset_index()

    chrom_sizes = parse_chrom_size(chrom_size_path)
    chrom_sizes_list = [(k, v) for k, v in chrom_sizes.items()]
    with pyBigWig.open(bw_path, 'w') as bw_out:
        bw_out.addHeader(chrom_sizes_list)
        for chrom in chrom_sizes.keys():
            chrom_df = total_wigs[total_wigs['chrom'] == chrom].sort_values(
                'start')
            if chrom_df.shape[0] == 0:
                continue
            values = chrom_df['count'].astype(float)
            if scale is not None:
                values = values / scale
            # bin index back to genomic coordinate; one fixed span per entry
            bw_out.addEntries(chrom,
                              (chrom_df['start'] * bw_bin_size).tolist(),
                              values=values.tolist(),
                              span=bw_bin_size)
    # remove bed graph
    subprocess.run(f'rm -f {bg_path}', shell=True, check=True)
    # gzip bed
    subprocess.run(f'gzip {output_prefix}_{cluster}.sorted.bed',
                   shell=True, check=True)
    return
def get_fasta(bed_file_paths, fasta_path, output_path, slop_b=None,
              chrom_size_path=None, use_region_name=False, cpu=1,
              sort_mem_gbs=1, standard_length=None, merge=False,
              sample_region=None, seed=1):
    """
    Extract genome sequence fasta using bed files

    Parameters
    ----------
    bed_file_paths
        One BED path or a list of BED paths (plain or gzipped).
    fasta_path
        Genome FASTA to extract sequences from.
    output_path
        Output FASTA path.
    slop_b
        If given, extend each region by this many bp on both sides
        (requires chrom_size_path).
    chrom_size_path
        Chrom-size file; required for slop_b and standard_length.
    use_region_name
        If region names provided in the fourth column of bed file:
        if True: use region name as seq name
        else: use chr:start-end as seq name
    cpu
        Threads for GNU sort --parallel (falls back if unsupported).
    sort_mem_gbs
        Memory budget for sort, in GB.
    standard_length
        If given, recenter every region to this fixed length, clamped to
        chromosome boundaries (requires chrom_size_path).
    merge
        Merge overlapping regions with bedtools (disables region names).
    sample_region
        If given, randomly keep at most this many regions.
    seed
        Random seed for sampling.

    Returns
    -------
    output_path
    """
    chrom_dict = None
    if chrom_size_path is not None:
        chrom_dict = parse_chrom_size(chrom_size_path)
    if isinstance(bed_file_paths, str):
        bed_file_paths = [bed_file_paths]
    output_path = str(pathlib.Path(output_path).resolve())

    # concatenate all input beds, optionally recentered to a fixed length
    temp_bed = output_path + '.tmp_input.bed'
    with open(temp_bed, 'w') as temp_f:
        for bed_file_path in bed_file_paths:
            if str(bed_file_path).endswith('gz'):
                opener = open_gz
            else:
                opener = open
            with opener(bed_file_path) as f:
                if standard_length is None:
                    temp_f.write(f.read())
                else:
                    if chrom_dict is None:
                        raise ValueError(
                            'chrom_size_path can not be None when standard_length is not None'
                        )
                    half = int(standard_length / 2)
                    for line in f:
                        ll = line.strip().split('\t')
                        # recenter to [center - half, center + half],
                        # clamped to the chromosome
                        center = (int(ll[1]) + int(ll[2])) / 2
                        ll[1] = str(int(max(center - half, 0)))
                        ll[2] = str(int(min(center + half,
                                            chrom_dict[ll[0]])))
                        temp_f.write('\t'.join(ll) + '\n')

    sorted_temp = output_path + '.tmp_sorted.bed'

    def _run_sort(parallel):
        # one place for the sort command; `parallel` toggles --parallel
        option = f'--parallel={cpu} ' if parallel else ''
        subprocess.run(shlex.split(
            f'sort -k1,1 -k2,2n {option}-S {sort_mem_gbs}G {temp_bed} -o {sorted_temp}'
        ),
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            encoding='utf8',
            check=True)

    try:
        _run_sort(parallel=True)
    except subprocess.CalledProcessError:
        # old sort version don't have parallel option
        print('run sort without --parallel')
        try:
            _run_sort(parallel=False)
        except subprocess.CalledProcessError as e:
            print(e.stderr)
            raise e

    sorted_bed = BedTool(sorted_temp)
    if slop_b:
        if chrom_size_path is None:
            raise ValueError(
                'chrom_size_path can not be None when slop_b is not None')
        sorted_bed = sorted_bed.slop(b=slop_b, g=chrom_size_path)

    # FIX: use the same '.tmp_' naming convention as the other temp files
    # (the original concatenated 'tmp_merge.bed' without a separator)
    merged_temp = output_path + '.tmp_merge.bed'
    if merge:
        if use_region_name:
            print('can not use region name when merge is True')
            use_region_name = False
        sorted_bed.merge().moveto(merged_temp)
    else:
        sorted_bed.moveto(merged_temp)

    if sample_region is not None:
        bed_df = pd.read_csv(merged_temp, header=None, sep='\t')
        if sample_region <= bed_df.shape[0]:
            bed_df = bed_df.sample(sample_region, random_state=seed)
        # index/header=False (the original passed None, relying on falsiness)
        bed_df.to_csv(merged_temp, sep='\t', index=False, header=False)

    name_option = '-name' if use_region_name else ''
    subprocess.run(shlex.split(
        f'bedtools getfasta -fi {fasta_path} -bed {merged_temp} -fo {output_path} {name_option}'
    ),
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE,
        encoding='utf8',
        check=True)

    # clean up temp files and pybedtools session files
    subprocess.run(
        shlex.split(f'rm -f {temp_bed} {sorted_temp} {merged_temp}'))
    cleanup()
    return output_path
def determine_datasets(regions, quantifiers, chrom_size_path, tmp_dir):
    """Parse region and quantifier specifications into a dataset dict.

    Parameters
    ----------
    regions
        Iterable of (name, region_path) pairs; region_path is either an
        existing BED file or an integer bin size for genome-wide windows.
    quantifiers
        Iterable of [NAME, QUANT_TYPE, MC_TYPES, *OTHER_KWARGS] lists;
        OTHER_KWARGS are 'key=value' strings (values coerced to float
        when possible).
    chrom_size_path
        Chrom-size file used to filter/sort regions and make windows.
    tmp_dir
        Directory where per-name region CSVs are written.

    Returns
    -------
    dict mapping name -> {'regions': csv_path, 'quant': [Quant, ...]}
    """
    tmp_dir = pathlib.Path(tmp_dir).absolute()
    tmp_dir.mkdir(exist_ok=True, parents=True)
    chrom_sizes = parse_chrom_size(chrom_size_path)

    datasets = {}
    for pair in regions:
        if len(pair) != 2:
            raise ValueError(
                f'Can not understand {pair} in regions parameter: {regions}')
        name, region_path = pair
        # prepare regions
        if isinstance(region_path, (str, pathlib.Path)) \
                and pathlib.Path(region_path).exists():
            region_bed_df = pd.read_csv(region_path, sep='\t', header=None)
            # remove additional chroms that do not occur in chrom_sizes
            region_bed_df = region_bed_df[region_bed_df.iloc[:, 0].isin(
                chrom_sizes)]
            # sort chroms
            region_bed_df = pybedtools.BedTool.from_dataframe(
                region_bed_df).sort(g=chrom_size_path).to_dataframe()
            if region_bed_df.shape[1] == 3:
                # add index
                print(
                    region_path,
                    'do not have index in its fourth column, adding it automatically. '
                    'If this is not desired, add a fourth column containing UNIQUE IDs to the BED file.'
                )
                # BUG FIX: materialize the IDs as a list; the original
                # assigned a generator expression, which pandas does not
                # reliably expand into a column
                region_bed_df[name] = [
                    f'{name}_{i}' for i in range(region_bed_df.shape[0])
                ]
            # check if name is unique()
            if region_bed_df.iloc[:, 3].duplicated().sum() > 0:
                raise ValueError(
                    f'Region IDs in {region_path} (fourth column) are not unique.'
                )
            # finally, set ID as index and only take the first three columns
            region_bed_df = region_bed_df.iloc[:, [0, 1, 2, 3]].set_index(
                region_bed_df.columns[3])
        else:
            try:
                # if region is int, generate chrom bins with bedtools
                region_size = int(region_path)
                region_bed_df = pybedtools.BedTool().makewindows(
                    g=chrom_size_path, w=region_size,
                    s=region_size).to_dataframe()
                # set region index as '{chrom}_{bin_number}'
                _dfs = []
                for chrom, chrom_df in region_bed_df.groupby('chrom'):
                    chrom_df = chrom_df.reset_index(drop=True)
                    chrom_df.index = chrom_df.index.map(
                        lambda i: f'{chrom}_{i}')
                    _dfs.append(chrom_df)
                region_bed_df = pd.concat(_dfs)
            except ValueError:
                raise ValueError(
                    f'Can not understand region specification {region_path}')
        region_path = f'{tmp_dir}/{name}.regions.csv'
        region_bed_df.to_csv(region_path)
        datasets[name] = {'regions': region_path, 'quant': []}

    for quantifier in quantifiers:
        if len(quantifier) < 3:
            raise ValueError(
                f'Quantifier must have three parts, including '
                f'["NAME", "QUANT_TYPE", "MC_TYPE", "OTHER_KWARGS"], '
                f'where the "OTHER_KWARGS" are optional. '
                f'Got {quantifier}')
        name, quant_type, mc_types, *other_kwargs = quantifier
        if name not in datasets:
            raise KeyError(
                f'Name {name} occur in quantifiers, but not found in regions.')
        # parse 'key=value' extras, coercing numeric values to float
        kwargs = {}
        for kv in other_kwargs:
            k, v = kv.split('=')
            try:
                kwargs[k] = float(v)
            except ValueError:
                kwargs[k] = v
        # prepare mc_types
        mc_types = [i.strip() for i in mc_types.split(',')]
        # prepare quant_types
        if quant_type not in ALLOW_QUANT_TYPES:
            raise ValueError(
                f'QUANT_TYPE need to be in {ALLOW_QUANT_TYPES}, got {quant_type} in {quantifier}.'
            )
        datasets[name]['quant'].append(
            Quant(mc_types=mc_types, quant_type=quant_type, kwargs=kwargs))
    return datasets