def count_bam_mapped(bam_file, tmpdir):
    # Counts the number of mapped reads in a BAM file WITHOUT iterating over it:
    # per-chromosome totals are read from the BAM index, which is built first.
    if bam_file.startswith("s3"):
        local_bam_file = join(tmpdir, basename(bam_file))
        download_file(bam_file, tmpdir, overwrite_ok=True)
    else:
        local_bam_file = bam_file
    return script(
        f"""
        samtools index {local_bam_file}
        samtools idxstats {local_bam_file} | grep -v '*' | cut -f3 | paste -sd+ | bc
        """
    )
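# Usage sketch (hypothetical path and scratch dir; download_file and script are
# assumed from this module). For local debugging, an equivalent pure-Python
# count via pysam's samtools dispatch could look like:
#
#   import pysam
#   pysam.index("sample.bam")  # idxstats reads counts from the index
#   mapped = sum(
#       int(line.split("\t")[2])           # column 3 = mapped read segments
#       for line in pysam.idxstats("sample.bam").splitlines()
#       if not line.startswith("*")        # skip the unmapped-reads row
#   )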
def count_bigwig_total(bw_file, tmpdir):
    download_file(bw_file, tmpdir, overwrite_ok=True)
    bw_file = join(tmpdir, basename(bw_file))

    from pyBigWig import open as open_bigwig

    bw = open_bigwig(bw_file)
    # Total signal = sum over chromosomes of (chromosome length * mean signal).
    result = sum(l * bw.stats(ch, 0, l, "mean")[0] for ch, l in bw.chroms().items())
    # BigWig could have negative values (e.g. the negative-strand GroCAP bigwigs),
    # so assert that the total is nonzero rather than positive.
    assert abs(result) > 0
    return result
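# Note: bw.stats(...) yields [None] for a chromosome with no intervals, which
# would make the multiplication above raise a TypeError. A None-safe variant
# (a sketch, not the behavior of the function above) could read:
#
#   result = sum(
#       l * (bw.stats(ch, 0, l, type="mean")[0] or 0.0)
#       for ch, l in bw.chroms().items()
#   )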
def load_hic(
    tmpdir,
    hic_file,
    hic_norm_file,
    hic_is_vc,
    hic_type,
    hic_resolution,
    tss_hic_contribution,
    window,
    min_window,
    gamma,
    interpolate_nan=True,
    apply_diagonal_bin_correction=True,
):
    print("Loading HiC")

    # Pull down local copies of any s3-hosted inputs.
    if hic_file.startswith("s3"):
        download_file(hic_file, tmpdir, overwrite_ok=True)
        hic_file = join(tmpdir, basename(hic_file))
    if hic_norm_file.startswith("s3"):
        download_file(hic_norm_file, tmpdir, overwrite_ok=True)
        hic_norm_file = join(tmpdir, basename(hic_norm_file))

    if hic_type == "juicebox":
        HiC_sparse_mat = hic_to_sparse(hic_file, hic_norm_file, hic_resolution)
        HiC = process_hic(
            hic_mat=HiC_sparse_mat,
            hic_norm_file=hic_norm_file,
            hic_is_vc=hic_is_vc,
            resolution=hic_resolution,
            tss_hic_contribution=tss_hic_contribution,
            window=window,
            min_window=min_window,
            gamma=gamma,
            interpolate_nan=interpolate_nan,
            apply_diagonal_bin_correction=apply_diagonal_bin_correction,
        )
        # HiC = juicebox_to_bedpe(HiC, chromosome, args)
    elif hic_type == "bedpe":
        HiC = pd.read_csv(
            hic_file,
            sep="\t",
            names=["chr1", "x1", "x2", "chr2", "y1", "y2", "name", "hic_contact"],
        )
    else:
        raise ValueError(f"Unknown hic_type: {hic_type}")

    return HiC
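# Usage sketch (hypothetical paths and parameters; the resolution, window, and
# gamma values below are illustrative and must match the Hi-C data):
#
#   hic = load_hic(
#       tmpdir="/tmp/scratch",
#       hic_file="s3://bucket/hic/chr22.KRobserved.gz",
#       hic_norm_file="s3://bucket/hic/chr22.KRnorm.gz",
#       hic_is_vc=False,
#       hic_type="juicebox",
#       hic_resolution=5000,
#       tss_hic_contribution=100,
#       window=5000000,
#       min_window=0,
#       gamma=0.87,
#   )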
def load_enhancers(
    output_dir: str,
    tmpdir: str,
    candidate_enhancer_regions: str,
    chrom_sizes: str,
    features: dict,
    genes=None,
    force: bool = False,
    skip_rpkm_quantile: bool = False,
    celltype: str = None,
    tss_slop_for_class_assignment: int = 500,
    use_fast_count: bool = True,
    default_accessibility_feature: str = "",
    qnorm: str = None,
    class_override_file: str = None,
):
    makedirs(output_dir)
    makedirs(tmpdir)
    print("made dirs")

    local_candidate_enhancer_regions = join(tmpdir, basename(candidate_enhancer_regions))
    download_file(candidate_enhancer_regions, tmpdir, overwrite_ok=True)

    enhancers = read_bed(local_candidate_enhancer_regions)
    enhancers = count_features_for_bed(
        enhancers,
        candidate_enhancer_regions,
        chrom_sizes,
        features,
        tmpdir,
        output_dir,
        "Enhancers",
        skip_rpkm_quantile,
        force,
        use_fast_count,
    )
    return process_enhancer_list(
        enhancers,
        celltype,
        genes,
        tss_slop_for_class_assignment,
        qnorm,
        default_accessibility_feature,
        tmpdir,
        output_dir,
    )
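# Usage sketch (hypothetical s3 paths; assumes the features dict maps each
# feature name to the inputs consumed by count_features_for_bed):
#
#   enhancers = load_enhancers(
#       output_dir="s3://bucket/outputs/",
#       tmpdir="/tmp/scratch",
#       candidate_enhancer_regions="s3://bucket/candidate_regions.bed",
#       chrom_sizes="s3://bucket/hg38.chrom.sizes",
#       features={"DHS": "s3://bucket/dnase.bam", "H3K27ac": "s3://bucket/h3k27ac.bam"},
#       celltype="K562",
#       default_accessibility_feature="DHS",
#   )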
def annotate_genes_with_features(
    genes: dict,
    chrom_sizes: str,
    output_dir: str,
    tmpdir: str,
    skip_gene_counts: bool = False,
    features: dict = {},
    force: bool = False,
    use_fast_count: bool = True,
    default_accessibility_feature: str = "",
):
    # Pull down a local copy of chrom sizes, as it is used in several locations downstream.
    download_file(chrom_sizes, tmpdir, overwrite_ok=True)
    download_file(chrom_sizes + ".bed", tmpdir, overwrite_ok=True)
    chrom_sizes = join(tmpdir, basename(chrom_sizes))

    # Set up files for counting.
    bounds_bed = join(output_dir, "GeneList.bed")
    tss1kb = make_tss_region_file(genes, output_dir, tmpdir, chrom_sizes)

    # Count features over gene bodies and 1kb TSS (promoter) regions.
    genes = count_features_for_bed(
        genes,
        bounds_bed,
        chrom_sizes,
        features,
        tmpdir,
        output_dir,
        "Genes",
        force=force,
        use_fast_count=use_fast_count,
    )
    tsscounts = count_features_for_bed(
        tss1kb["df"],
        tss1kb["path"],
        chrom_sizes,
        features,
        tmpdir,
        output_dir,
        "Genes.TSS1kb",
        force=force,
        use_fast_count=use_fast_count,
    )
    return merge_genes_tss(genes, tsscounts, tmpdir, output_dir, default_accessibility_feature)
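# The chrom_sizes + ".bed" companion downloaded above is assumed to be a
# whole-chromosome BED derived from the sizes file. A sketch for generating it
# (an assumption about the expected format, not part of this module):
#
#   import pandas as pd
#   sizes = pd.read_csv("hg38.chrom.sizes", sep="\t", names=["chr", "size"])
#   sizes.assign(start=0)[["chr", "start", "size"]].to_csv(
#       "hg38.chrom.sizes.bed", sep="\t", index=False, header=False
#   )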
def process_gene_bed(
    bed,
    tmpdir: str,
    name_cols: str = "symbol",
    main_name: str = "symbol",
    chrom_sizes: str = None,
    fail_on_nonunique: bool = True,
):
    # Get local copies of files from s3.
    makedirs(tmpdir)
    local_bed = join(tmpdir, basename(bed))
    print(local_bed)
    download_file(bed, tmpdir, overwrite_ok=True)

    if chrom_sizes is not None:
        local_chrom_sizes = join(tmpdir, basename(chrom_sizes))
        print(local_chrom_sizes)
        download_file(chrom_sizes, tmpdir, overwrite_ok=True)
    else:
        local_chrom_sizes = None

    bed = read_bed(local_bed)
    return process_gene_bed_filter(
        bed, name_cols, main_name, local_chrom_sizes, fail_on_nonunique
    )
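# Usage sketch (hypothetical s3 paths):
#
#   genes = process_gene_bed(
#       "s3://bucket/annotations/genes.bed",
#       "/tmp/scratch",
#       name_cols="symbol",
#       main_name="symbol",
#       chrom_sizes="s3://bucket/hg38.chrom.sizes",
#   )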
def process_feature_outfile(
    feature_outfile,
    feature_bam,
    feature_name,
    tmpdir,
    df,
    orig_shape,
    skip_rpkm_quantile,
):
    # Unwrap list / File-object results down to a plain path.
    if isinstance(feature_outfile, list):
        feature_outfile = feature_outfile[0]
    if isinstance(feature_outfile, File):
        feature_outfile = feature_outfile.path
    if feature_outfile.startswith("s3"):
        download_file(feature_outfile, tmpdir, overwrite_ok=True)
        feature_outfile = join(tmpdir, basename(feature_outfile))

    domain_counts = pd.read_csv(feature_outfile, sep="\t", header=None).rename(
        columns={0: "chr", 1: "start", 2: "end"}
    )  # read_bed(feature_outfile)
    score_column = domain_counts.columns[-1]

    total_counts = decode(count_total(feature_bam, tmpdir))

    domain_counts = domain_counts[["chr", "start", "end", score_column]]
    featurecount = feature_name + ".readCount"
    domain_counts.rename(columns={score_column: featurecount}, inplace=True)
    domain_counts["chr"] = domain_counts["chr"].astype("str")

    df = df.merge(domain_counts.drop_duplicates())
    # df = smart_merge(df, domain_counts.drop_duplicates())
    assert df.shape[0] == orig_shape, "Dimension mismatch"

    return annotate_feature_quantiles(
        df, total_counts, feature_name, skip_rpkm_quantile, featurecount
    )
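# The feature_outfile is read as a headerless TSV whose first three columns are
# chr/start/end and whose last column holds the read count for that interval
# (bedtools coverage-style output is one plausible producer). Illustrative rows:
#
#   chr1    100    600    42
#   chr1    900    1400   17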
def process_genes(
    genes,
    tmpdir,
    expression_table_list,
    primary_gene_identifier,
    ue_file,
    celltype,
    output_dir,
    class_gene_file,
    gene_name_annotations="symbol",  # name columns passed through to process_gene_bed
    chrom_sizes=None,  # chrom sizes path, used only when class_gene_file is set
):
    genes[["chr", "start", "end", "name", "score", "strand"]].to_csv(
        join(tmpdir, "GeneList.bed"), sep="\t", index=False, header=False
    )
    # Upload genes to the output dir.
    upload_file(join(tmpdir, "GeneList.bed"), output_dir, overwrite_ok=True)

    if len(expression_table_list) > 0:
        # Add expression information.
        names_list = []
        print("Using gene expression from files: {}\n".format(expression_table_list))
        for expression_table in expression_table_list:
            try:
                download_file(expression_table, tmpdir, overwrite_ok=True)
                name = basename(expression_table)
                local_expression_table = join(tmpdir, basename(expression_table))
                expr = pd.read_table(
                    local_expression_table,
                    names=[primary_gene_identifier, name + ".Expression"],
                )
                expr[name + ".Expression"] = expr[name + ".Expression"].astype(float)
                expr = expr.groupby(primary_gene_identifier).max()

                genes = genes.merge(expr, how="left", right_index=True, left_on="symbol")
                names_list.append(name + ".Expression")
            except Exception as e:
                print(e)
                traceback.print_exc()
                print("Failed on {}".format(expression_table))

        genes["Expression"] = genes[names_list].mean(axis=1)
        genes["Expression.quantile"] = genes["Expression"].rank(
            method="average", na_option="top", ascending=True, pct=True
        )
    else:
        genes["Expression"] = np.nan

    # Ubiquitously expressed annotation.
    if ue_file is not None:
        download_file(ue_file, tmpdir, overwrite_ok=True)
        local_ue_file = join(tmpdir, basename(ue_file))
        ubiq = pd.read_csv(local_ue_file, sep="\t")
        genes["is_ue"] = genes["name"].isin(ubiq.iloc[:, 0].values.tolist())

    # Cell type.
    genes["celltype"] = celltype

    if class_gene_file is None:
        genes_for_class_assignment = genes
    else:
        # Build the class-assignment gene list from the override file.
        genes_for_class_assignment = process_gene_bed(
            class_gene_file,
            tmpdir,
            name_cols=gene_name_annotations,
            main_name=primary_gene_identifier,
            chrom_sizes=chrom_sizes,
            fail_on_nonunique=False,
        )

    return {"genes": genes, "genes_for_class_assignment": genes_for_class_assignment}
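# Each expression table is read as a headerless two-column TSV of primary gene
# identifier and expression value; duplicate identifiers are collapsed with
# groupby().max(). Illustrative rows (hypothetical values):
#
#   MYC      118.4
#   GATA1    55.0
#   GATA1    61.2    <- the two GATA1 rows collapse to 61.2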