Example #1
def count_bam_mapped(bam_file, tmpdir):
    # Count mapped reads in a BAM file WITHOUT iterating over reads: index the
    # BAM (it must be coordinate-sorted), then sum the per-contig counts.
    if bam_file.startswith("s3"):
        local_bam_file = join(tmpdir, basename(bam_file))
        download_file(bam_file, tmpdir, overwrite_ok=True)
    else:
        local_bam_file = bam_file
    return script(f"""
    samtools index {local_bam_file}
    # idxstats columns: contig, length, mapped, unmapped. Drop the '*'
    # (unmapped) line, then sum the mapped-read column.
    samtools idxstats {local_bam_file} | grep -v '*' | cut -f3 | paste -sd+ - | bc
    """)
Example #2
def count_bigwig_total(bw_file, tmpdir):
    from pyBigWig import open as open_bigwig

    download_file(bw_file, tmpdir, overwrite_ok=True)
    bw_file = join(tmpdir, basename(bw_file))

    bw = open_bigwig(bw_file)
    # Approximate the total signal as chromosome length times mean value,
    # summed over all chromosomes.
    result = sum(l * bw.stats(ch, 0, l, type="mean")[0]
                 for ch, l in bw.chroms().items())
    # BigWigs can be entirely negative (e.g. the negative-strand GroCAP
    # bigwigs), so check the magnitude rather than the sign.
    assert abs(result) > 0
    return result
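pyBigWig also records summary statistics in the file header; a minimal sketch of reading the precomputed sum over covered bases (bigwig_header_sum is a hypothetical helper, and this value can differ from the length-times-mean approximation above when chromosomes are only partially covered):

from pyBigWig import open as open_bigwig

def bigwig_header_sum(local_bw_file):
    # header()["sumData"] is the sum of values over all covered bases,
    # as stored in the bigWig header.
    bw = open_bigwig(local_bw_file)
    try:
        return bw.header()["sumData"]
    finally:
        bw.close()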
Example #3
def load_hic(
    tmpdir,
    hic_file,
    hic_norm_file,
    hic_is_vc,
    hic_type,
    hic_resolution,
    tss_hic_contribution,
    window,
    min_window,
    gamma,
    interpolate_nan=True,
    apply_diagonal_bin_correction=True,
):
    print("Loading HiC")
    if hic_file.startswith("s3"):
        download_file(hic_file, tmpdir, overwrite_ok=True)
        hic_file = join(tmpdir, basename(hic_file))
    if hic_norm_file.startswith("s3"):
        download_file(hic_norm_file, tmpdir, overwrite_ok=True)
        hic_norm_file = join(tmpdir, basename(hic_norm_file))
    if hic_type == "juicebox":
        HiC_sparse_mat = hic_to_sparse(hic_file, hic_norm_file, hic_resolution)
        HiC = process_hic(
            hic_mat=HiC_sparse_mat,
            hic_norm_file=hic_norm_file,
            hic_is_vc=hic_is_vc,
            resolution=hic_resolution,
            tss_hic_contribution=tss_hic_contribution,
            window=window,
            min_window=min_window,
            gamma=gamma,
            interpolate_nan=interpolate_nan,
            apply_diagonal_bin_correction=apply_diagonal_bin_correction,
        )
        # HiC = juicebox_to_bedpe(HiC, chromosome, args)
    elif hic_type == "bedpe":
        HiC = pd.read_csv(
            hic_file,
            sep="\t",
            names=[
                "chr1", "x1", "x2", "chr2", "y1", "y2", "name", "hic_contact"
            ],
        )
    else:
        # Fail loudly rather than returning an unbound HiC below.
        raise ValueError(f"Unknown hic_type: {hic_type!r}")

    return HiC
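A sketch of calling the bedpe branch; all paths and tuning values below are placeholders, and the juicebox-specific normalization and correction parameters are passed through unused when hic_type="bedpe":

hic_df = load_hic(
    tmpdir="/tmp/scratch",
    hic_file="s3://my-bucket/contacts.bedpe",
    hic_norm_file="",  # unused for bedpe input
    hic_is_vc=False,
    hic_type="bedpe",
    hic_resolution=5000,
    tss_hic_contribution=100,
    window=5000000,
    min_window=0,
    gamma=0.87,
)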
Example #4
def load_enhancers(
    output_dir: str,
    tmpdir: str,
    candidate_enhancer_regions: str,
    chrom_sizes: str,
    features: dict,
    genes=None,
    force: bool = False,
    skip_rpkm_quantile: bool = False,
    celltype: str = None,
    tss_slop_for_class_assignment: int = 500,
    use_fast_count: bool = True,
    default_accessibility_feature: str = "",
    qnorm: str = None,
    class_override_file: str = None,
):
    makedirs(output_dir)
    makedirs(tmpdir)
    print("made dirs")
    local_candidate_enhancer_regions = join(
        tmpdir, basename(candidate_enhancer_regions))
    download_file(candidate_enhancer_regions, tmpdir, overwrite_ok=True)
    enhancers = read_bed(local_candidate_enhancer_regions)
    enhancers = count_features_for_bed(
        enhancers,
        candidate_enhancer_regions,
        chrom_sizes,
        features,
        tmpdir,
        output_dir,
        "Enhancers",
        skip_rpkm_quantile,
        force,
        use_fast_count,
    )
    return process_enhancer_list(
        enhancers,
        celltype,
        genes,
        tss_slop_for_class_assignment,
        qnorm,
        default_accessibility_feature,
        tmpdir,
        output_dir,
    )
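A sketch of a minimal invocation, assuming features maps feature names to alignment files as the counting helpers suggest; every path below is a placeholder:

enhancers = load_enhancers(
    output_dir="s3://my-bucket/Neighborhoods",
    tmpdir="/tmp/scratch",
    candidate_enhancer_regions="s3://my-bucket/candidateRegions.bed",
    chrom_sizes="s3://my-bucket/hg38.chrom.sizes",
    features={"DHS": "s3://my-bucket/dnase.bam"},  # hypothetical mapping
    default_accessibility_feature="DHS",
)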
Example #5
def annotate_genes_with_features(
    genes: dict,
    chrom_sizes: str,
    output_dir: str,
    tmpdir: str,
    skip_gene_counts: bool = False,
    features: dict = {},
    force: bool = False,
    use_fast_count: bool = True,
    default_accessibility_feature: str = "",
):
    # pull down a local copy of chromsizes, as that will be used in several locations downstream
    download_file(chrom_sizes, tmpdir, overwrite_ok=True)
    download_file(chrom_sizes + ".bed", tmpdir, overwrite_ok=True)

    chrom_sizes = join(tmpdir, basename(chrom_sizes))
    # Set up files for counting
    bounds_bed = join(output_dir, "GeneList.bed")
    tss1kb = make_tss_region_file(genes, output_dir, tmpdir, chrom_sizes)

    # Count features over genes and promoters
    genes = count_features_for_bed(
        genes,
        bounds_bed,
        chrom_sizes,
        features,
        tmpdir,
        output_dir,
        "Genes",
        force=force,
        use_fast_count=use_fast_count,
    )
    tsscounts = count_features_for_bed(
        tss1kb["df"],
        tss1kb["path"],
        chrom_sizes,
        features,
        tmpdir,
        output_dir,
        "Genes.TSS1kb",
        force=force,
        use_fast_count=use_fast_count,
    )
    return merge_genes_tss(genes, tsscounts, tmpdir, output_dir,
                           default_accessibility_feature)
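A sketch of how this might be invoked downstream of gene loading; genes_df stands in for the per-gene table produced upstream, and the paths are placeholders:

annotated_genes = annotate_genes_with_features(
    genes=genes_df,
    chrom_sizes="s3://my-bucket/hg38.chrom.sizes",
    output_dir="s3://my-bucket/Neighborhoods",
    tmpdir="/tmp/scratch",
    features={"DHS": "s3://my-bucket/dnase.bam"},  # hypothetical mapping
    default_accessibility_feature="DHS",
)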
Example #6
def process_gene_bed(
    bed,
    tmpdir: str,
    name_cols: str = "symbol",
    main_name: str = "symbol",
    chrom_sizes: str = None,
    fail_on_nonunique: bool = True,
):
    # get local files from s3
    makedirs(tmpdir)
    local_bed = join(tmpdir, basename(bed))
    print(local_bed)
    download_file(bed, tmpdir, overwrite_ok=True)

    if chrom_sizes is not None:
        local_chrom_sizes = join(tmpdir, basename(chrom_sizes))
        print(local_chrom_sizes)
        download_file(chrom_sizes, tmpdir, overwrite_ok=True)
    else:
        local_chrom_sizes = None
    bed = read_bed(local_bed)
    return process_gene_bed_filter(bed, name_cols, main_name,
                                   local_chrom_sizes, fail_on_nonunique)
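A sketch of a typical call; the paths are placeholders and the keyword values mirror the defaults:

gene_bed = process_gene_bed(
    "s3://my-bucket/genes.bed",
    "/tmp/scratch",
    name_cols="symbol",
    main_name="symbol",
    chrom_sizes="s3://my-bucket/hg38.chrom.sizes",
)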
Example #7
def process_feature_outfile(
    feature_outfile,
    feature_bam,
    feature_name,
    tmpdir,
    df,
    orig_shape,
    skip_rpkm_quantile,
):
    # Unwrap workflow-engine return values: a one-element list or a File
    # object both reduce to a plain path string.
    if isinstance(feature_outfile, list):
        feature_outfile = feature_outfile[0]
    if isinstance(feature_outfile, File):
        feature_outfile = feature_outfile.path
    if feature_outfile.startswith("s3"):
        download_file(feature_outfile, tmpdir, overwrite_ok=True)
        feature_outfile = join(tmpdir, basename(feature_outfile))
    domain_counts = pd.read_csv(feature_outfile, sep="\t", header=None).rename(
        columns={0: "chr", 1: "start", 2: "end"}
    )  # read_bed(feature_outfile)
    score_column = domain_counts.columns[-1]

    total_counts = decode(count_total(feature_bam, tmpdir))
    domain_counts = domain_counts[["chr", "start", "end", score_column]]
    featurecount = feature_name + ".readCount"
    domain_counts.rename(columns={score_column: featurecount}, inplace=True)
    domain_counts["chr"] = domain_counts["chr"].astype("str")

    df = df.merge(domain_counts.drop_duplicates())
    # df = smart_merge(df, domain_counts.drop_duplicates())

    assert df.shape[0] == orig_shape, "Dimension mismatch"
    return annotate_feature_quantiles(df, total_counts, feature_name,
                                      skip_rpkm_quantile, featurecount)
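A sketch of an invocation, mainly to illustrate that orig_shape is the expected row count used to verify the merge; enhancers_df and all paths are placeholders:

annotated = process_feature_outfile(
    feature_outfile="s3://my-bucket/Enhancers.DHS.CountReads.bed",
    feature_bam="s3://my-bucket/dnase.bam",
    feature_name="DHS",
    tmpdir="/tmp/scratch",
    df=enhancers_df,
    orig_shape=enhancers_df.shape[0],
    skip_rpkm_quantile=False,
)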
Example #8
def process_genes(
    genes,
    tmpdir,
    expression_table_list,
    primary_gene_identifier,
    ue_file,
    celltype,
    output_dir,
    class_gene_file,
):
    genes[["chr", "start", "end", "name", "score",
           "strand"]].to_csv(join(tmpdir, "GeneList.bed"),
                             sep="\t",
                             index=False,
                             header=False)
    # upload genes to outputdir
    upload_file(join(tmpdir, "GeneList.bed"), output_dir, overwrite_ok=True)
    if len(expression_table_list) > 0:
        # Add expression information
        names_list = []
        print("Using gene expression from files: {} \n".format(
            expression_table_list))

        for expression_table in expression_table_list:
            try:
                download_file(expression_table, tmpdir, overwrite_ok=True)
                name = basename(expression_table)
                local_expression_table = join(tmpdir, name)
                expr = pd.read_table(
                    local_expression_table,
                    names=[primary_gene_identifier, name + ".Expression"],
                )
                expr[name + ".Expression"] = expr[name + ".Expression"].astype(float)
                # Collapse duplicate identifiers, keeping the maximum expression.
                expr = expr.groupby(primary_gene_identifier).max()

                genes = genes.merge(
                    expr, how="left", right_index=True, left_on="symbol"
                )
                names_list.append(name + ".Expression")
            except Exception as e:
                print(e)
                traceback.print_exc()
                print("Failed on {}".format(expression_table))
        genes["Expression"] = genes[names_list].mean(axis=1)
        genes["Expression.quantile"] = genes["Expression"].rank(
            method="average", na_option="top", ascending=True, pct=True)
    else:
        genes["Expression"] = np.nan

    # Ubiquitously expressed annotation
    if ue_file is not None:
        download_file(ue_file, tmpdir, overwrite_ok=True)
        local_ue_file = join(tmpdir, basename(ue_file))
        ubiq = pd.read_csv(local_ue_file, sep="\t")
        genes["is_ue"] = genes["name"].isin(ubiq.iloc[:, 0].values.tolist())

    # cell type
    genes["celltype"] = celltype

    if class_gene_file is None:
        genes_for_class_assignment = genes
    else:
        # Re-derive the class-assignment gene list from the override file.
        # NOTE: gene_name_annotations and chrom_sizes must be supplied by the
        # enclosing scope; they are not parameters of this function.
        genes_for_class_assignment = process_gene_bed(
            class_gene_file,
            tmpdir,
            gene_name_annotations,
            primary_gene_identifier,
            chrom_sizes,
            fail_on_nonunique=False,
        )

    return {
        "genes": genes,
        "genes_for_class_assignment": genes_for_class_assignment
    }
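A sketch of a minimal call with no expression tables, no ubiquitous-expression file, and no class override; genes_df and the paths are placeholders:

result = process_genes(
    genes=genes_df,
    tmpdir="/tmp/scratch",
    expression_table_list=[],
    primary_gene_identifier="symbol",
    ue_file=None,
    celltype="K562",
    output_dir="s3://my-bucket/Neighborhoods",
    class_gene_file=None,
)
genes_df = result["genes"]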