Example #1
def main():
    p = batch_utils.init_arg_parser(default_cpu=0.5, default_memory=1.75, gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    p.add_argument("tsv_path", help="Table with header: sample_id, cram_path, crai_path")
    p.add_argument("sample_id", nargs="*", help="(optional) 1 or more sample_ids to process. If not specified, all rows in the .tsv will be processed.")
    args = p.parse_args()

    df = pd.read_table(args.tsv_path)
    if {"sample_id", "cram_path", "crai_path"} - set(df.columns):
        p.error(f"{args.tsv_path} must contain a 'sample_id', 'cram_path', 'crai_path' columns")

    if not args.force:
        hl.init(log="/dev/null", quiet=True)

    # process samples
    with batch_utils.run_batch(args, batch_name=f"extract chrM") as batch:
        for _, row in df.iterrows():
            if args.sample_id and row.sample_id not in set(args.sample_id):
                continue

            input_filename = os.path.basename(row.cram_path)
            prefix = input_filename.replace(".bam", "").replace(".cram", "")

            output_cram_path = os.path.join(OUTPUT_DIR, f"{prefix}.chrM.cram")
            output_crai_path = os.path.join(OUTPUT_DIR, f"{prefix}.chrM.cram.crai")

            if not args.force and hl.hadoop_is_file(output_cram_path) and hl.hadoop_is_file(output_crai_path):
                logger.info(f"Output files exist (eg. {output_cram_path}). Skipping {input_filename}...")
                continue

            j = batch_utils.init_job(batch, f"chrM: {row.sample_id}", DOCKER_IMAGE if not args.raw else None, args.cpu, args.memory)
            batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

            # copy inputs
            REF_PATHS = batch_utils.HG38_REF_PATHS
            fasta_filename = os.path.basename(parse.urlparse(REF_PATHS.fasta).path)

            j.command(f"""set -ex
                env
                gsutil -m cp {REF_PATHS.fasta} {REF_PATHS.fai} {REF_PATHS.dict} .
                java -Xms2g -jar /gatk.jar PrintReads \
                    -R {fasta_filename} \
                    -I {row.cram_path} \
                    --read-index {row.crai_path} \
                    -L chrM \
                    --gcs-project-for-requester-pays broad-mpg-gnomad \
                    -O {prefix}.chrM.bam
                        
                samtools view -C -T {fasta_filename} {prefix}.chrM.bam > {prefix}.chrM.cram
                samtools index {prefix}.chrM.cram {prefix}.chrM.cram.crai
                
                gsutil -m cp {prefix}.chrM.cram.crai {output_crai_path}
                gsutil -m cp {prefix}.chrM.cram {output_cram_path}
            """)

            logger.info(f"Submitted {row.sample_id}: {output_cram_path}")
Example #2
def main():
    p = batch_utils.init_arg_parser(default_cpu=1, gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    args = p.parse_args()

    # process samples
    with batch_utils.run_batch(args, "test") as batch:
        for cpu in (0.25, 0.5, 1, 2):
            args.cpu = cpu
            j = batch.new_job(f"test - {args.cpu} cpu")
            j.image(DOCKER_IMAGE)
            j.cpu(args.cpu)
            j.memory(args.cpu*3.75)
            #batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)
            #j.command(f"yes > data.txt || true")
            j.command(f"ls -lh")
            j.command(f"df -kh")
            j.command(f"sleep 3600") # sleep for 0.5 hour
            j.command(f"free -h")
Example #3
def main():
    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    p.add_argument(
        "--metadata-tsv-path",
        default=ALL_METADATA_TSV,
        help="Table with columns: sample_id, bam_path, bai_path, batch")
    p.add_argument("--counts-tsv-path",
                   default=ALL_COUNTS_TSV_GZ,
                   help="Counts .tsv")

    g = p.add_mutually_exclusive_group()
    g.add_argument("--with-gtex",
                   help="Use GTEX controls.",
                   action="store_true")
    g.add_argument(
        "--only-gtex",
        help="Run on just the GTEX control samples to test FP rate.",
        action="store_true")

    p.add_argument("batch_name",
                   nargs="+",
                   choices=ANALYSIS_BATCHES.keys(),
                   help="Name of RNA-seq batch to process")
    args = p.parse_args()

    if not args.force:
        hl.init(log="/dev/null", quiet=True)

    # process samples
    batch_label = f"OUTRIDER"
    if args.with_gtex:
        batch_label += " (with GTEx)"
    batch_label += ": "
    batch_label += ','.join(args.batch_name)
    with batch_utils.run_batch(args, batch_label) as batch:

        for batch_name in args.batch_name:
            batch_dict = ANALYSIS_BATCHES[batch_name]
            batch_tissue = batch_dict['tissue']
            batch_sex = batch_dict['sex']

            c_vector_of_sample_names = 'c("' + '", "'.join(
                batch_dict['samples']) + '")'
            if args.with_gtex:
                batch_include_GTEX_samples = "TRUE"
                batch_name += "_with_GTEX"
            elif args.only_gtex:
                c_vector_of_sample_names = "c()"
                batch_include_GTEX_samples = "TRUE"
                batch_name += "_only_GTEX"
            else:
                batch_include_GTEX_samples = "FALSE"
                batch_name += "_without_GTEX"

            j = batch_utils.init_job(batch,
                                     batch_name,
                                     DOCKER_IMAGE if not args.raw else None,
                                     args.cpu,
                                     args.memory,
                                     disk_size=10)
            batch_utils.switch_gcloud_auth_to_user_account(
                j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT,
                GCLOUD_PROJECT)
            # copy inputs
            j.command(f"""gsutil -m cp {GENCODE_TXDB} .""")
            j.command(
                f"""gsutil -m cp {args.metadata_tsv_path} {args.counts_tsv_path} ."""
            )
            output_file = os.path.join(OUTPUT_BASE_DIR, f"{batch_name}.RDS")

            if not args.force and hl.hadoop_is_file(output_file):
                logger.info(
                    f"Output file exists: {output_file} . Skipping {batch_name}..."
                )
                continue

            j.command(f"""time xvfb-run Rscript -e '

# outrider 
library(OUTRIDER)
library(annotables)
library(data.table)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(purrr)
library(ggrepel)
library(plotly)
library(stringr)
library(RColorBrewer)
library(ggsci)
library(ggplot2)
library(gtable)
library(grid)
library(gridExtra)

possibleConfounders = c("tissue", "sex", "stranded", "read_length", "batch")    # "RIN"

# input tables generated by ~/project__rnaseq/code/rnaseq_methods/pipelines/gagneurlab/metadata/export_gagneur_metadata_table.py
# batches generated by ~/project__rnaseq/code/rnaseq_methods/pipelines/gagneurlab/metadata/metadata_notebook.py

sampleInfo = fread("{os.path.basename(args.metadata_tsv_path)}")
sampleInfo$read_length = as.character(sampleInfo$read_length)

GTEX_sampleIds = c()
if ({batch_include_GTEX_samples}) {{
    if (("{batch_sex}" == "M") || ("{batch_sex}" == "F")) {{
        GTEX_sampleIds = sampleInfo[(sampleInfo$sex == "{batch_sex}") & (sampleInfo$tissue == "{batch_tissue}") & grepl("GTEX", sampleInfo$sample_id)]$sample_id
    }} else {{
        GTEX_sampleIds = sampleInfo[(sampleInfo$tissue == "{batch_tissue}") & grepl("GTEX", sampleInfo$sample_id)]$sample_id    
    }}
}}


sampleLabel = "{batch_name}_"
sampleSubset = {c_vector_of_sample_names}
sampleSubset = c(sampleSubset, GTEX_sampleIds)
print("sampleSubset: ")
print(sampleSubset)

sampleInfo = sampleInfo[sampleInfo$sample_id %in% sampleSubset]
if (nrow(sampleInfo) != length(sampleSubset)) {{
    print(paste("ERROR: length(sampleInfo) != length(sampleSubset):", length(sampleInfo), length(sampleSubset)))
    quit("yes")
}}

geneReadCounts = fread("{os.path.basename(args.counts_tsv_path)}", select=c("gene_id", sampleSubset))
geneReadCounts = geneReadCounts[!grepl("ERCC", geneReadCounts$gene_id),]


geneIds = geneReadCounts$gene_id
colsMinusGeneId = colnames(geneReadCounts)[!colnames(geneReadCounts) %in% c("gene_id")]
geneReadCounts = geneReadCounts[,..colsMinusGeneId]
rownames(geneReadCounts) = geneIds

cnts = as.matrix(geneReadCounts)
rownames(cnts) = geneIds
ncol(cnts)
nrow(cnts)
if (ncol(cnts) != length(sampleSubset)) {{
    print(paste("ERROR: ncol(cnts) != length(sampleSubset):", ncol(cnts), length(sampleSubset)))
    quit("yes")
}}

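# OUTRIDER expects a sampleID column in the colData table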
sampleInfo[,sampleID:=sample_id]
ods <- OutriderDataSet(countData=cnts, colData=sampleInfo)

txdb <- loadDb("{os.path.basename(GENCODE_TXDB)}")
ods <- filterExpression(ods, gtfFile=txdb, filterGenes=FALSE)   #, fpkmCutoff=100)

g = plotFPKM(ods) + theme_bw() + theme(legend.position="bottom")
ggsave(file=paste(sampleLabel, "_plotFPKM.png", sep=""), g, device="png", type="cairo")

#plotExpressedGenes(ods)

ods <- estimateSizeFactors(ods)
sortedSizeFactors = sort(sizeFactors(ods))
g = ggplot(data=NULL, aes(y=sortedSizeFactors, x=1:ncol(ods))) + 
  geom_point(color="blue", size=1) + 
  labs(x="Sample rank", y="Size factors", title="Size factor distribution") + 
  geom_label_repel(aes(label=ifelse(sortedSizeFactors > 1.5, names(sortedSizeFactors), "")), 
                   nudge_x = -35, box.padding = 0.35, point.padding = 0.5, segment.color = "grey50") +
  geom_label_repel(aes(label=ifelse(sortedSizeFactors < 0.5, names(sortedSizeFactors), "")), 
                   nudge_x = 35, box.padding   = 0.35, point.padding = 0.5, segment.color = "grey50") +
  theme_bw()

ggsave(file=paste(sampleLabel, "_sizeFactors.png", sep=""), g, type="cairo")

print(sort(sizeFactors(ods))[1:5])

print(paste(length(ods), "genes before filtering"))
ods <- ods[mcols(ods)$passedFilter,]
print(paste(length(ods), "genes after filtering"))
plotCountCorHeatmap(ods, colGroups=possibleConfounders, normalized=FALSE, device="pdf", type="cairo", nRowCluster=1, nColCluster=1, filename=paste(sampleLabel, "_plotCountCorHeatmap_before_correction.pdf", sep=""))
plotCountGeneSampleHeatmap(ods, colGroups=possibleConfounders, normalized=FALSE, device="pdf", type="cairo", filename=paste(sampleLabel, "_plotCountGeneSampleHeatmap_before_correction.pdf", sep=""))

if (length(sampleSubset) > 5) {{
    ods = findEncodingDim(ods, BPPARAM=MulticoreParam(4, progressbar=TRUE))
    g = plotEncDimSearch(ods)
    ggsave(file=paste(sampleLabel, "_plotEncDimSearch", ".png", sep=""), g, type="cairo")
    optimal_q = metadata(ods)$opt
}} else {{
    optimal_q = length(sampleSubset)
}}

# increase / decrease by 25%

q = optimal_q
original_ods = ods

ods = OUTRIDER(original_ods, verbose=TRUE, iterations=15, q=q, BPPARAM=MulticoreParam(4, progressbar=TRUE))
saveRDS(ods, paste(sampleLabel, "_ods.RDS", sep=""))

plotCountCorHeatmap(ods, colGroups=possibleConfounders, normalized=TRUE, device="pdf", type="cairo", nRowCluster=1, nColCluster=1, main=paste("Count correlation heatmap q=", q, sep=""), filename=paste(sampleLabel, "_plotCountCorHeatmap_after_correction.pdf", sep=""))

plotCountGeneSampleHeatmap(ods, colGroups=possibleConfounders, normalized=TRUE, device="pdf", type="cairo", main=paste("Count Gene vs Sample Heatmap q=", q, sep=""), filename=paste(sampleLabel, "_plotCountGeneSampleHeatmap_after_correction.pdf", sep=""))

res = results(ods, padjCutoff=1)
res = res[,c("sampleID", "geneID", "pValue", "padjust", "zScore", "rawcounts")][order(padjust),]
res[, "q"] = q
write.table(res, file=paste(sampleLabel, "_ods__", "q", q, "_results.tsv", sep=""), quote=FALSE, sep="\\t", row.names=FALSE)
'""")

            j.command("gzip *.tsv")
            j.command(
                f"gsutil -m cp  *.tsv.gz *.pdf *.png *.RDS {OUTPUT_BASE_DIR}")

            logger.info(f"Output: {output_file}")
Example #4
def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()
    #gtex_rnaseq_sample_metadata_df = get_gtex_rnaseq_sample_metadata_df()

    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument(
        "-b",
        "--rnaseq-batch-name",
        nargs="*",
        help="RNA-seq batch names to process (eg. -b batch1 batch2)",
        choices=set(rnaseq_sample_metadata_df['star_pipeline_batch'])
        | set(["gtex_muscle", "gtex_fibroblasts", "gtex_blood"]))
    grp.add_argument(
        "-s",
        "--rnaseq-sample-id",
        nargs="*",
        help="RNA-seq sample IDs to process (eg. -s sample1 sample2)",
        choices=set(rnaseq_sample_metadata_df['sample_id']) | set([
            'GTEX-1LG7Z-0005-SM-DKPQ6', 'GTEX-PX3G-0006-SM-5SI7E',
            'GTEX-1KXAM-0005-SM-DIPEC'
        ]))
    args = p.parse_args()

    # Generate samples_df with these columns: sample_id, bam_path, bai_path, output_dir, batch_name, sex, RIN, ancestry, etc.
    samples_df = pd.DataFrame()
    if args.rnaseq_batch_name:
        for batch_name in args.rnaseq_batch_name:
            df = rnaseq_sample_metadata_df[
                rnaseq_sample_metadata_df['star_pipeline_batch'] == batch_name]
            samples_df = transfer_metadata_columns_from_df(samples_df, df)

    elif args.rnaseq_sample_id:
        df = rnaseq_sample_metadata_df[
            rnaseq_sample_metadata_df.sample_id.isin(set(
                args.rnaseq_sample_id))]
        samples_df = transfer_metadata_columns_from_df(samples_df, df)
    else:
        p.error("Must specify -b or -s")

    logger.info(
        f"Processing {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}"
    )

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    with batch_utils.run_batch(args) as batch:
        for sample_id in samples_df.sample_id:
            metadata_row = samples_df.loc[sample_id]

            # set job inputs & outputs
            input_bam, input_bai = metadata_row['bam_path'], metadata_row[
                'bai_path']
            output_dir = metadata_row['output_dir']

            print("Input bam: ", input_bam)
            output_filename = f"{sample_id}.bigWig"
            output_file_path = os.path.join(output_dir, output_filename)

            # check if output file already exists
            if hl.hadoop_is_file(output_file_path) and not args.force:
                logger.info(
                    f"{sample_id} output file already exists: {output_file_path}. Skipping..."
                )
                continue

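            # request roughly 2x the input BAM size (in GB) as disk for the job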
            file_stats = hl.hadoop_stat(metadata_row['bam_path'])
            bam_size = int(round(file_stats['size_bytes'] / 10.**9))
            disk_size = bam_size * 2

            j = batch_utils.init_job(batch,
                                     f"bam=>bigWig: {sample_id}",
                                     cpu=args.cpu,
                                     memory=args.memory,
                                     disk_size=disk_size,
                                     image=DOCKER_IMAGE)
            batch_utils.switch_gcloud_auth_to_user_account(
                j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT,
                GCLOUD_PROJECT)

            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bam} {sample_id}.bam"
            )
            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bai} {sample_id}.bam.bai"
            )
            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp gs://gtex-resources/references/GRCh38.chrsizes ."
            )
            j.command(f"touch {sample_id}.bam.bai")

            j.command(f"pwd && ls && date")

            j.command(
                f"python3 /src/bam2coverage.py {sample_id}.bam GRCh38.chrsizes {sample_id}"
            )
            j.command(f"cp {output_filename} {j.output_bigWig}")
            j.command(f"echo Done: {output_file_path}")
            j.command(f"date")

            # copy output
            batch.write_output(j.output_bigWig, output_file_path)

            print("Output file path: ", output_file_path)
Example #5
def main():
    p, args = parse_args()

    df = pd.read_table(args.cram_and_tsv_paths_table)
    if {"sample_id", "cram_path", "crai_path", "variants_tsv_bgz"} - set(
            df.columns):
        p.error(
            f"{args.cram_and_tsv_paths_table} must contain 'sample_id', 'cram_path', 'crai_path', and 'variants_tsv_bgz' columns")

    # check that all buckets are in "US-CENTRAL1" or are multi-regional to avoid egress charges to the Batch cluster
    batch_utils.set_gcloud_project(GCLOUD_PROJECT)
    if args.cluster:
        batch_utils.check_storage_bucket_region(df.cram_path)

    if not args.force:
        hl.init(log="/dev/null", quiet=True)

    # process samples
    with batch_utils.run_batch(args,
                               batch_name=f"HaplotypeCaller -bamout") as batch:
        counter = 0
        for _, row in tqdm.tqdm(df.iterrows(), unit=" rows", total=len(df)):
            if args.sample_to_process and row.sample_id not in set(
                    args.sample_to_process):
                continue

            input_filename = os.path.basename(row.cram_path)
            output_prefix = input_filename.replace(".bam",
                                                   "").replace(".cram", "")

            output_bam_path = os.path.join(args.output_dir,
                                           f"{output_prefix}.bamout.bam")
            output_bai_path = os.path.join(args.output_dir,
                                           f"{output_prefix}.bamout.bai")

            if not args.force and hl.hadoop_is_file(
                    output_bam_path) and hl.hadoop_is_file(output_bai_path):
                logger.info(
                    f"Output files exist (eg. {output_bam_path}). Skipping {input_filename}..."
                )
                continue

            counter += 1
            if args.num_samples_to_process and counter > args.num_samples_to_process:
                break

            j = batch_utils.init_job(batch, f"readviz: {row.sample_id}",
                                     DOCKER_IMAGE if not args.raw else None,
                                     args.cpu, args.memory)
            batch_utils.switch_gcloud_auth_to_user_account(
                j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)

            local_exclude_intervals = batch_utils.localize_file(
                j, EXCLUDE_INTERVALS)
            local_fasta = batch_utils.localize_file(
                j, batch_utils.HG38_REF_PATHS.fasta, use_gcsfuse=True)
            local_fasta_fai = batch_utils.localize_file(
                j, batch_utils.HG38_REF_PATHS.fai, use_gcsfuse=True)
            batch_utils.localize_file(j,
                                      batch_utils.HG38_REF_PATHS.dict,
                                      use_gcsfuse=True)
            local_tsv_bgz = batch_utils.localize_file(j, row.variants_tsv_bgz)
            local_cram_path = batch_utils.localize_file(j, row.cram_path)
            local_crai_path = batch_utils.localize_file(j, row.crai_path)

            j.command(f"""echo --------------

echo "Start - time: $(date)"
df -kh


# 1) Convert variants_tsv_bgz to sorted interval list

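# each variant position becomes a 1bp interval, which bedtools slop then pads on both sides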
gunzip -c "{local_tsv_bgz}" | awk '{{ OFS="\t" }} {{ print( "chr"$1, $2, $2 ) }}' | bedtools slop -b {PADDING_AROUND_VARIANT} -g {local_fasta_fai} > variant_windows.bed

# Sort the .bed file so that chromosomes are in the same order as in the input_cram file.
# Without this, if the input_cram has a different chromosome ordering (eg. chr1, chr10, .. vs. chr1, chr2, ..)
# than the interval list passed to GATK tools' -L arg, then GATK may silently skip some of regions in the -L intervals.
# The sort is done by first retrieving the input_cram header and passing it to GATK BedToIntervalList.

java -Xms2g -jar /gatk/gatk.jar PrintReadsHeader \
	--gcs-project-for-requester-pays {GCLOUD_PROJECT} \
	-R {local_fasta} \
	-I "{local_cram_path}" \
	-O header.bam

java -Xms2g -jar /gatk/gatk.jar BedToIntervalList \
	--SORT true \
	--SEQUENCE_DICTIONARY header.bam \
	--INPUT variant_windows.bed \
	--OUTPUT variant_windows.interval_list

# 2) Get reads from the input_cram for the intervals in variant_windows.interval_list

time java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+DisableAttachMechanism -XX:MaxHeapSize=2000m -Xmx30000m \
	-jar /gatk/GATK35.jar \
	-T HaplotypeCaller \
	-R {local_fasta} \
	-I "{local_cram_path}" \
	-L variant_windows.interval_list \
	-XL {local_exclude_intervals} \
	--disable_auto_index_creation_and_locking_when_reading_rods \
	-ERC GVCF \
	--max_alternate_alleles 3 \
	-variant_index_parameter 128000 \
	-variant_index_type LINEAR \
	--read_filter OverclippedRead \
	-bamout "{output_prefix}.bamout.bam" \
	-o "{output_prefix}.gvcf"  |& grep -v "^DEBUG"

bgzip "{output_prefix}.gvcf"
tabix "{output_prefix}.gvcf.gz"

gsutil -m cp "{output_prefix}.bamout.bam" {args.output_dir}
gsutil -m cp "{output_prefix}.bamout.bai" {args.output_dir}
gsutil -m cp "{output_prefix}.gvcf.gz" {args.output_dir}
gsutil -m cp "{output_prefix}.gvcf.gz.tbi" {args.output_dir}

ls -lh
echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo --------------

""")
Example #6
def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()
    #gtex_rnaseq_sample_metadata_df = get_gtex_rnaseq_sample_metadata_df()

    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument(
        "-b",
        "--rnaseq-batch-name",
        nargs="*",
        help="RNA-seq batch names to process (eg. -b batch1 batch2)",
        choices=set(rnaseq_sample_metadata_df['star_pipeline_batch'])
        | set(["gtex_muscle", "gtex_fibroblasts", "gtex_blood"]))
    grp.add_argument(
        "-s",
        "--rnaseq-sample-id",
        nargs="*",
        help="RNA-seq sample IDs to process (eg. -s sample1 sample2)",
        choices=set(rnaseq_sample_metadata_df['sample_id']) | set([
            'GTEX-1LG7Z-0005-SM-DKPQ6', 'GTEX-PX3G-0006-SM-5SI7E',
            'GTEX-1KXAM-0005-SM-DIPEC'
        ]))
    args = p.parse_args()

    # Generate samples_df with these columns: sample_id, star_SJ_out_tab, output_dir, batch_name
    samples_df = pd.DataFrame()
    if args.rnaseq_batch_name:
        for batch_name in args.rnaseq_batch_name:
            df = rnaseq_sample_metadata_df[
                rnaseq_sample_metadata_df['star_pipeline_batch'] == batch_name]
            samples_df = transfer_metadata_columns_from_df(samples_df, df)

    elif args.rnaseq_sample_id:
        df = rnaseq_sample_metadata_df[
            rnaseq_sample_metadata_df.sample_id.isin(set(
                args.rnaseq_sample_id))]
        samples_df = transfer_metadata_columns_from_df(samples_df, df)
    else:
        p.error("Must specify -b or -s")

    logger.info(
        f"Processing {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}"
    )

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    with batch_utils.run_batch(args) as batch:
        for sample_id in samples_df.sample_id:
            metadata_row = samples_df.loc[sample_id]

            # set job inputs & outputs
            output_dir = metadata_row['output_dir']

            print("Input file: ", metadata_row['star_SJ_out_tab'])
            output_filename = f"{sample_id}.junctions.bed.gz"
            output_bed_gz_file_path = os.path.join(output_dir, output_filename)

            # check if output file already exists
            if hl.hadoop_is_file(output_bed_gz_file_path) and not args.force:
                logger.info(
                    f"{sample_id} output file already exists: {output_bed_gz_file_path}. Skipping..."
                )
                continue

            j = batch_utils.init_job(batch,
                                     name=f"tab=>bed: {sample_id}",
                                     cpu=args.cpu,
                                     memory=args.memory,
                                     disk_size=5,
                                     image=DOCKER_IMAGE)
            batch_utils.switch_gcloud_auth_to_user_account(
                j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT,
                GCLOUD_PROJECT)

            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp {metadata_row['star_SJ_out_tab']} ."
            )
            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp gs://macarthurlab-rnaseq/ref/gencode.v26.annotation.gff3.gz ."
            )
            j.command(f"pwd && ls && date")

            j.command(
                f"python3 /convert_SJ_out_tab_to_junctions_bed.py -g gencode.v26.annotation.gff3.gz {os.path.basename(metadata_row['star_SJ_out_tab'])}"
            )
            j.command(f"cp {output_filename} {j.output_bed_gz}")
            j.command(f"cp {output_filename}.tbi {j.output_bed_gz_tbi}")
            j.command(f"echo Done: {output_bed_gz_file_path}")
            j.command(f"date")

            # copy output
            batch.write_output(j.output_bed_gz, output_bed_gz_file_path)
            batch.write_output(j.output_bed_gz_tbi,
                               f"{output_bed_gz_file_path}.tbi")

            print("Output file path: ", output_bed_gz_file_path)
Example #7
def main():
    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    p.add_argument("--with-gtex",
                   help="Use GTEX controls.",
                   action="store_true")
    p.add_argument("--skip-step1",
                   action="store_true",
                   help="Skip count-split-reads step")
    p.add_argument("--skip-step2",
                   action="store_true",
                   help="Skip compute-PSI step")
    p.add_argument("--skip-step3",
                   action="store_true",
                   help="Skip compute-best-Q step")
    p.add_argument("-m1",
                   "--memory-step1",
                   type=float,
                   help="Batch: (optional) memory in gigabytes (eg. 3.75)",
                   default=3.75)
    p.add_argument("-m2",
                   "--memory-step2",
                   type=float,
                   help="Batch: (optional) memory in gigabytes (eg. 3.75)",
                   default=3.75)
    p.add_argument(
        "--metadata-tsv-path",
        default=ALL_METADATA_TSV,
        help="Table with columns: sample_id, bam_path, bai_path, batch")
    p.add_argument("batch_name",
                   nargs="+",
                   choices=ANALYSIS_BATCHES.keys(),
                   help="Name of RNA-seq batch to process")
    args = p.parse_args()

    hl.init(log="/dev/null", quiet=True)

    with hl.hadoop_open(args.metadata_tsv_path) as f:
        samples_df_unmodified = pd.read_table(f).set_index("sample_id",
                                                           drop=False)

    batch_label = f"FRASER"
    if args.with_gtex:
        batch_label += " (with GTEx)"
    batch_label += ": "
    batch_label += ','.join(args.batch_name)
    with batch_utils.run_batch(args, batch_label) as batch:

        for batch_name in args.batch_name:
            samples_df = samples_df_unmodified
            batch_dict = ANALYSIS_BATCHES[batch_name]
            batch_tissue = batch_dict['tissue']
            batch_sex = batch_dict['sex']

            sample_ids = list(batch_dict['samples'])
            if args.with_gtex:
                batch_name += "_with_GTEX"
                samples_df_filter = (samples_df.tissue == batch_tissue)
                samples_df_filter &= samples_df.sample_id.str.startswith(
                    "GTEX")
                if batch_sex == "M" or batch_sex == "F":
                    samples_df_filter &= (samples_df.sex == batch_sex)
                sample_ids += list(samples_df[samples_df_filter].sample_id)
            else:
                batch_name += "_without_GTEX"

            samples_df = samples_df.loc[sample_ids]
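            # hash the sorted sample IDs so the sample_set_label uniquely identifies this exact set of samples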
            byte_string = ", ".join(sorted(samples_df.sample_id)).encode()
            h = hashlib.md5(byte_string).hexdigest().upper()
            sample_set_label = f"{batch_name}__{len(samples_df.sample_id)}_samples_{h[:10]}"

            logger.info(
                f"Processing {sample_set_label}: {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}"
            )

            split_reads_samples = []

            split_reads_output_files = []
            split_reads_jobs = {}

            non_split_reads_output_files = []
            non_split_reads_jobs = {}

            j_extract_splice_junctions = None
            j_calculate_psi_values = None
            j_calculate_best_q = None

            # based on docs @ https://bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf
            # step 1: count spliced reads
            # step 2: count non-spliced reads at acceptors & donors of splice junctions detected in step 1
            for step in 1, 2:
                for sample_id in samples_df.sample_id:
                    metadata_row = samples_df.loc[sample_id]

                    # set job inputs & outputs
                    input_bam, input_bai = metadata_row[
                        'bam_path'], metadata_row['bai_path']
                    if "GTEX" in sample_id:
                        output_dir_for_sample_specific_data = "gs://macarthurlab-rnaseq/gtex_v8/fraser_count_rna/"
                    else:
                        output_dir_for_sample_specific_data = f"gs://macarthurlab-rnaseq/{metadata_row['batch']}/fraser_count_rna/"

                    output_dir_for_batch_specific_data = f"gs://macarthurlab-rnaseq/gagneur/fraser/results/{sample_set_label}"

                    output_file_path_splice_junctions_RDS = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"spliceJunctions_{sample_set_label}.RDS")
                    output_file_path_calculated_psi_values_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"calculatedPSIValues_{sample_set_label}.tar.gz")
                    output_file_path_calculated_best_q_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"calculatedBestQ_{sample_set_label}.tar.gz")
                    output_file_path_fraser_analysis_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"fraserAnalysis_using_PCA_{sample_set_label}.tar.gz")
                    output_file_path_fraser_analysis_results_only_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"fraserAnalysis_using_PCA_{sample_set_label}_results_only.tar.gz"
                    )

                    print("Input bam: ", input_bam)
                    if step == 1:
                        output_file_path = os.path.join(
                            output_dir_for_sample_specific_data,
                            f"fraser_count_split_reads_{sample_id}.tar.gz")
                        memory = args.memory_step1
                    elif step == 2:
                        output_file_path = os.path.join(
                            output_dir_for_batch_specific_data,
                            f"fraser_count_non_split_reads_{sample_id}__{sample_set_label}.tar.gz"
                        )
                        memory = args.memory_step2

                    if step == 1:
                        split_reads_samples.append(sample_id)
                        split_reads_output_files.append(output_file_path)
                    elif step == 2:
                        non_split_reads_output_files.append(output_file_path)

                    if (step == 1
                            and args.skip_step1) or (step == 2
                                                     and args.skip_step2):
                        continue

                    # check if output file already exists
                    if not args.force and hl.hadoop_is_file(output_file_path):
                        logger.info(
                            f"{sample_id} output file already exists: {output_file_path}. Skipping..."
                        )
                        continue

                    if not args.local:
                        file_stats = hl.hadoop_stat(metadata_row['bam_path'])
                        bam_size = int(round(file_stats['size_bytes'] /
                                             10.**9))
                        disk_size = bam_size * 2
                    else:
                        disk_size = None

                    job_label = f"Count {'split' if step == 1 else 'non-split'} reads"
                    j = batch_utils.init_job(batch,
                                             f"{job_label}: {sample_id}",
                                             cpu=args.cpu,
                                             memory=memory,
                                             disk_size=disk_size,
                                             image=DOCKER_IMAGE)
                    batch_utils.switch_gcloud_auth_to_user_account(
                        j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT,
                        GCLOUD_PROJECT)

                    j.command(
                        f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bam} {sample_id}.bam"
                    )
                    j.command(
                        f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bai} {sample_id}.bam.bai"
                    )
                    j.command(f"touch {sample_id}.bam.bai")
                    bam_path = f"{sample_id}.bam"

                    j.command(f"pwd && ls -lh && date")

                    if step == 1:
                        # count split reads
                        j.command(f"""time xvfb-run Rscript -e '
library(FRASER)
library(data.table)

sampleTable = data.table(sampleID=c("{sample_id}"), bamFile=c("{bam_path}"))
print(sampleTable)
fds = FraserDataSet(colData=sampleTable, workingDir=".", bamParam=ScanBamParam(mapqFilter=0), strandSpecific=0L)

getSplitReadCountsForAllSamples(fds)  # saves results to cache/
'""")
                    elif step == 2:
                        if sample_id in split_reads_jobs:
                            j.depends_on(split_reads_jobs[sample_id])
                        if j_extract_splice_junctions:
                            j.depends_on(j_extract_splice_junctions)

                        j.command(
                            f"gsutil -m cp {output_file_path_splice_junctions_RDS} ."
                        )

                        # count non-split reads
                        j.command(f"""time xvfb-run Rscript -e '
library(FRASER)
library(data.table)

spliceJunctions = readRDS("{os.path.basename(output_file_path_splice_junctions_RDS)}")

sampleTable = data.table(sampleID=c("{sample_id}"), bamFile=c("{bam_path}"))
print(sampleTable)

fds = FraserDataSet(colData=sampleTable, workingDir=".", bamParam=ScanBamParam(mapqFilter=0), strandSpecific=0L)

getNonSplitReadCountsForAllSamples(fds, spliceJunctions)  # saves results to cache/
'""")
                    j.command(f"ls -lh .")
                    j.command(
                        f"tar czf {os.path.basename(output_file_path)} cache")
                    j.command(
                        f"gsutil -m cp {os.path.basename(output_file_path)} {output_file_path}"
                    )

                    j.command(f"echo Done: {output_file_path}")
                    j.command(f"date")

                    print("Output file path: ", output_file_path)

                    if step == 1:
                        split_reads_jobs[sample_id] = j
                    elif step == 2:
                        non_split_reads_jobs[sample_id] = j

                if len(split_reads_output_files) == 0:
                    break

                if step == 1 and not args.skip_step1:
                    if hl.hadoop_is_file(output_file_path_splice_junctions_RDS
                                         ) and not args.force:
                        logger.info(
                            f"{output_file_path_splice_junctions_RDS} file already exists. Skipping extractSpliceJunctions step..."
                        )
                        continue

                    j_extract_splice_junctions = batch_utils.init_job(
                        batch,
                        f"{sample_set_label}: Extract splice-junctions",
                        disk_size=30,
                        memory=60,
                        image=DOCKER_IMAGE)
                    for j in split_reads_jobs.values():
                        j_extract_splice_junctions.depends_on(j)

                    extract_splice_junctions(
                        j_extract_splice_junctions, split_reads_output_files,
                        args.cpu, output_file_path_splice_junctions_RDS)

                elif step == 2 and not args.skip_step2:
                    if hl.hadoop_is_file(
                            output_file_path_calculated_psi_values_tar_gz
                    ) and not args.force:
                        logger.info(
                            f"{output_file_path_calculated_psi_values_tar_gz} file already exists. Skipping calculatePSIValues step..."
                        )
                        continue

                    num_cpu = 4 if args.local else 16
                    memory = 60
                    j_calculate_psi_values = batch_utils.init_job(
                        batch,
                        f"{sample_set_label}: Calculate PSI values",
                        disk_size=50,
                        cpu=num_cpu,
                        memory=memory,
                        image=DOCKER_IMAGE)
                    if j_extract_splice_junctions:
                        j_calculate_psi_values.depends_on(
                            j_extract_splice_junctions)
                    for j in non_split_reads_jobs.values():
                        j_calculate_psi_values.depends_on(j)

                    calculate_psi_values(
                        j_calculate_psi_values, sample_set_label,
                        split_reads_output_files, non_split_reads_output_files,
                        output_file_path_splice_junctions_RDS,
                        args.metadata_tsv_path, num_cpu,
                        output_file_path_calculated_psi_values_tar_gz)

            # compute Best Q
            if args.skip_step3:
                logger.info(f"Skipping calculatedBestQ step...")
            elif hl.hadoop_is_file(output_file_path_calculated_best_q_tar_gz
                                   ) and not args.force:
                logger.info(
                    f"{output_file_path_calculated_best_q_tar_gz} file already exists. Skipping calculatedBestQ step..."
                )
            else:
                num_cpu = 4 if args.local else 16
                memory = 3.75 * num_cpu

                j_calculate_best_q = batch_utils.init_job(
                    batch,
                    f"{sample_set_label}: Calculate Best Q",
                    disk_size=50,
                    cpu=num_cpu,
                    memory=memory,
                    image=DOCKER_IMAGE)

                if j_calculate_psi_values:
                    j_calculate_best_q.depends_on(j_calculate_psi_values)

                calculate_best_q(
                    j_calculate_best_q, sample_set_label, 4,
                    output_file_path_calculated_psi_values_tar_gz,
                    output_file_path_calculated_best_q_tar_gz)

            # output_file_path_fraser_analysis_tar_gz
            if hl.hadoop_is_file(
                    output_file_path_fraser_analysis_results_only_tar_gz
            ) and not args.force:
                logger.info(
                    f"{output_file_path_fraser_analysis_results_only_tar_gz} file already exists. Skipping run_fraser_analysis step..."
                )
            else:
                num_cpu = 4 if args.local else 16
                memory = 3.75 * num_cpu

                j_fraser_analysis = batch_utils.init_job(
                    batch,
                    f"{sample_set_label}: Run Fraser Analysis",
                    disk_size=50,
                    cpu=num_cpu,
                    memory=memory,
                    image=DOCKER_IMAGE)
                if j_calculate_best_q:
                    j_fraser_analysis.depends_on(j_calculate_best_q)

                run_fraser_analysis(
                    j_fraser_analysis, sample_set_label, 4,
                    output_file_path_calculated_best_q_tar_gz,
                    output_file_path_fraser_analysis_tar_gz,
                    output_file_path_fraser_analysis_results_only_tar_gz)
Example #8
def main():

    p, args = parse_args()

    df = pd.read_table(args.cram_and_tsv_paths_table)
    if {"sample_id", "output_bamout_bam", "output_bamout_bai", "variants_tsv_bgz"} - set(df.columns):
        p.error(f"{args.tsv_path} must contain 'sample_id', 'output_bamout_bam', 'variants_tsv_bgz' columns")

    if args.num_samples_to_process:
        if args.random:
            df = df.sample(n=args.num_samples_to_process)
        else:
            df = df.iloc[:args.num_samples_to_process]

    if args.sample_to_process:
        df = df[df.sample_id.isin(set(args.sample_to_process))]

    logging.info(f"Processing {len(df)} samples")

    # check that all buckets are in "US-CENTRAL1" or are multi-regional to avoid egress charges to the Batch cluster
    batch_utils.set_gcloud_project(GCLOUD_PROJECT)
    with open("deidentify_bamout.py", "rt") as f:
        deidentify_bamouts_script = f.read()

    # process sample(s)
    if not args.sample_to_process and not args.num_samples_to_process:
        # if processing entire table, listing all files up front ends up being faster
        existing_deidentify_output_bams = subprocess.check_output(f"gsutil -m ls {args.output_dir}/*.deidentify_output.bam", shell=True, encoding="UTF-8").strip().split("\n")
        existing_deidentify_output_sorted_bams = subprocess.check_output(f"gsutil -m ls {args.output_dir}/*.deidentify_output.sorted.bam", shell=True, encoding="UTF-8").strip().split("\n")

    hl.init(log="/dev/null")
    with batch_utils.run_batch(args, batch_name=f"deidentify bamouts: {len(df)} samples") as batch:
        for _, row in tqdm.tqdm(df.iterrows(), unit=" samples"):
            output_bam_path = os.path.join(args.output_dir, f"{row.sample_id}.deidentify_output.bam")
            output_sorted_bam_path = os.path.join(args.output_dir, f"{row.sample_id}.deidentify_output.sorted.bam")

            if args.sample_to_process or args.num_samples_to_process:
                run_deidentify = args.force or not hl.hadoop_is_file(output_bam_path)
                run_sort = run_deidentify or not hl.hadoop_is_file(output_sorted_bam_path)
            else:
                run_deidentify = args.force or output_bam_path not in existing_deidentify_output_bams
                run_sort = run_deidentify or output_sorted_bam_path not in existing_deidentify_output_sorted_bams

            if run_deidentify or run_sort:
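                # scale the cpu request (and the disk size derived from it) with the size of the bamout file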
                bamout_stat = hl.hadoop_stat(row.output_bamout_bam)
                cpu = 0.25
                if bamout_stat['size_bytes'] > 0.25 * 20_000_000_000:
                    cpu = 0.5
                if bamout_stat['size_bytes'] > 0.5 * 20_000_000_000:
                    cpu = 1
                if bamout_stat['size_bytes'] > 1 * 20_000_000_000:
                    cpu = 2

            if run_deidentify:
                j = batch_utils.init_job(batch, f"{row.sample_id} - deidentify - cpu:{cpu}", DOCKER_IMAGE if not args.raw else None, cpu=cpu, disk_size=21*cpu)
                batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)

                local_tsv_path = batch_utils.localize_file(j, row.variants_tsv_bgz, use_gcsfuse=True)
                local_exclude_tsv_path = batch_utils.localize_file(j, row.exclude_variants_tsv_bgz, use_gcsfuse=True)
                local_bamout_path = batch_utils.localize_file(j, row.output_bamout_bam, use_gcsfuse=True)

                batch_utils.localize_file(j, row.output_bamout_bai, use_gcsfuse=True)

                j.command(f"""echo --------------

echo "Start - time: $(date)"
df -kh

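# materialize the deidentify script (read from the local filesystem above) inside the job via a heredoc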
cat <<EOF > deidentify_bamout.py
{deidentify_bamouts_script}
EOF

time python3 deidentify_bamout.py -x "{local_exclude_tsv_path}" "{row.sample_id}" "{local_bamout_path}" "{local_tsv_path}"

ls -lh

gsutil -m cp "{row.sample_id}.deidentify_output.bam" {args.output_dir}/
gsutil -m cp "{row.sample_id}.deidentify_output.db"  {args.output_dir}/

echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo --------------

""")
            else:
                logger.info(f"Skipping deidentify {row.sample_id}...")

            if run_sort:
                j2 = batch_utils.init_job(batch, f"{row.sample_id} - sort - cpu:{cpu}", DOCKER_IMAGE if not args.raw else None, cpu=cpu)
                batch_utils.switch_gcloud_auth_to_user_account(j2, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)

                if run_deidentify:
                    j2.depends_on(j)

                local_bamout_path = batch_utils.localize_file(j2, output_bam_path, use_gcsfuse=True)

                j2.command(f"""echo --------------

echo "Start - time: $(date)"
df -kh

samtools sort -o "{row.sample_id}.deidentify_output.sorted.bam" "{local_bamout_path}"
samtools index "{row.sample_id}.deidentify_output.sorted.bam"

ls -lh

gsutil -m cp "{row.sample_id}.deidentify_output.sorted.bam"      {args.output_dir}/
gsutil -m cp "{row.sample_id}.deidentify_output.sorted.bam.bai"  {args.output_dir}/

echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo --------------

""")
            else:
                logger.info(f"Sorted output files exist (eg. {output_sorted_bam_path}). Skipping sort for {row.sample_id}...")
Example #9
def main():
    p = batch_utils.init_arg_parser(
        default_cpu=NUM_CPU,
        default_memory=NUM_CPU*3.75,
        gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))

    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("--all", action="store_true", help="run all samples")
    grp.add_argument("-s", "--sample", help="process specific sample name(s)", action="append")
    grp.add_argument("-n", "--n-samples", type=int, help="run on the 1st n samples only. Useful for debugging")
    p.add_argument("--offset", type=int, default=0, help="apply this offset before applying -n. Useful for debugging")
    p.add_argument("--model", help="Which DeepTrio model to use", choices={"WES", "WGS", "PACBIO"}, required=True)

    p.add_argument("trios_tsv", help="Trios tsv", default="trios.tsv")
    args = p.parse_args()

    if not os.path.isfile(args.trios_tsv):
        p.error(f"File not found: {args.trios_tsv}")

    if args.trios_tsv.endswith(".xls") or args.trios_tsv.endswith(".xlsx"):
        df = pd.read_excel(args.trios_tsv)
    else:
        df = pd.read_table(args.trios_tsv)

    missing_columns = EXPECTED_COLUMNS - set(df.columns)
    if missing_columns:
        p.error(f"{args.trios_tsv} is missing columns: {missing_columns}")

    if args.n_samples:
        df = df[args.offset:args.offset+args.n_samples]

    if args.sample:
        df = df[df.sample_id.isin(set(args.sample))]
        if len(df) < len(set(filter(None, args.sample))):
            p.error(", ".join(set(args.sample) - set(df.sample_id)) + ": sample ids not found or don't have a bam file path")
        logger.info(f"Processing {len(df)} sample(s): " + ", ".join(list(df.sample_id[:10])))
    else:
        logger.info(f"Processing all {len(df)} samples")

    output_subdir = ".".join(os.path.basename(args.trios_tsv).split(".")[:-1])
    existing_output_files = batch_utils.generate_path_to_file_size_dict(
        os.path.join(OUTPUT_BASE_DIR, f"{output_subdir}/results_*.tar.gz"))

    # process samples
    with batch_utils.run_batch(args, batch_name=f"DeepTrio: " + (", ".join(df.individual_id) if len(df) < 5 else f"{len(df)} trio(s)")) as batch:
        for i, row in df.iterrows():
            name = re.sub(r"\.bam$|\.cram$", "", os.path.basename(row.reads))
            name_parent1 = re.sub(r"\.bam$|\.cram$", "", os.path.basename(row.parent1_reads))
            name_parent2 = re.sub(r"\.bam$|\.cram$", "", os.path.basename(row.parent2_reads))

            output_file = os.path.join(OUTPUT_BASE_DIR, f"{output_subdir}/results_{name}.tar.gz")
            if not args.force and output_file in existing_output_files:
                logger.info(f"Output file exists: {output_file} . Skipping {row.individual_id}...")
                continue

            # init Job
            j = batch_utils.init_job(batch, None, DEEP_TRIO_DOCKER_IMAGE_WITHOUT_GPU if not args.raw else None, cpu=NUM_CPU)
            batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

            # localize files
            local_ref_fasta_path = batch_utils.localize_file(j, row.ref_fasta, use_gcsfuse=True)
            local_reads_path = batch_utils.localize_via_temp_bucket(j, row.reads)
            local_parent1_reads_path = batch_utils.localize_via_temp_bucket(j, row.parent1_reads)
            local_parent2_reads_path = batch_utils.localize_via_temp_bucket(j, row.parent2_reads)

            batch_utils.localize_file(j, row.ref_fasta_fai, use_gcsfuse=True)
            batch_utils.localize_via_temp_bucket(j, row.reads_index)
            batch_utils.localize_via_temp_bucket(j, row.parent1_reads_index)
            batch_utils.localize_via_temp_bucket(j, row.parent2_reads_index)

            local_ref_cache_tar_gz_path = batch_utils.localize_file(j, "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.ref_cache.tar.gz", use_gcsfuse=True)

            # --regions chr22:38982347-38992804 \
            j.command(f"""mkdir ref_cache
            
            cd ref_cache
            tar xzf {local_ref_cache_tar_gz_path} 2>&1 | grep -v '^tar:' || true
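            # point htslib at the extracted ref_cache (with the EBI CRAM registry as a fallback) so reference sequences can be found when reading CRAMs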
            export REF_PATH="/ref_cache/ref/cache/%2s/%2s/%s:http://www.ebi.ac.uk/ena/cram/md5/%s"
            export REF_CACHE="/ref_cache/ref/cache/%2s/%2s/%s"
            
            mkdir "/results_{name}"
            cd "/results_{name}"

            /opt/deepvariant/bin/deeptrio/run_deeptrio \
                --model_type {args.model} \
                --ref {local_ref_fasta_path} \
                --reads_child "{local_reads_path}" \
                --reads_parent1 "{local_parent1_reads_path}" \
                --reads_parent2 "{local_parent2_reads_path}" \
                --output_gvcf_child "variants_{name}.gvcf.gz" \
                --output_gvcf_parent1 "variants_{name_parent1}.gvcf.gz" \
                --output_gvcf_parent2 "variants_{name_parent2}.gvcf.gz" \
                --output_vcf_child "variants_{name}.vcf.gz" \
                --output_vcf_parent1 "variants_{name_parent1}.vcf.gz" \
                --output_vcf_parent2 "variants_{name_parent2}.vcf.gz" \
                --sample_name_child "{name}" \
                --sample_name_parent1 "{name_parent1}" \
                --sample_name_parent2 "{name_parent2}" \
                --vcf_stats_report

            rm *.gvcf.gz*

            cd /
            tar czf "results_{name}.tar.gz" "/results_{name}"
            gsutil -m cp "results_{name}.tar.gz" {output_file}""")
Example #10
        "--save-individual-tables",
        action="store_true",
        help="Also export individual .bed files with additional columns")
    p.add_argument("batch_name",
                   nargs="+",
                   choices=analysis_batches | star_pipeline_batches,
                   help="Name of RNA-seq batch to process")
    args = p.parse_args()

    if not args.force:
        hl.init(log="/dev/null", quiet=True)

    # process batches
    batch_label = args.batch_name[0] if len(
        args.batch_name) == 1 else f"{len(args.batch_name)} batches"
    with batch_utils.run_batch(
            args, batch_name=f"combine junctions: {batch_label}") as batch:
        for batch_name in args.batch_name:
            if batch_name in star_pipeline_batches:
                output_dir = f"gs://macarthurlab-rnaseq/{batch_name}/combined_SJ_out_tables/"
                SJ_out_tab_paths = list(rnaseq_sample_metadata_df[
                    rnaseq_sample_metadata_df["star_pipeline_batch"] ==
                    batch_name].star_SJ_out_tab)
            elif batch_name in analysis_batches:
                output_dir = f"gs://macarthurlab-rnaseq/combined_SJ_out_tables/{batch_name}/"
                SJ_out_tab_paths = rnaseq_sample_metadata_df[
                    rnaseq_sample_metadata_df["sample_id"].isin(
                        ANALYSIS_BATCHES[batch_name]
                        ["samples"])].star_SJ_out_tab
            else:
                p.error(f"Unexpected batch name: {batch_name}")
def main():
    p, args = parse_args()

    df = pd.read_table(args.cram_and_tsv_paths_table)
    if {
            "sample_id", "output_bamout_bam", "output_bamout_bai",
            "variants_tsv_bgz"
    } - set(df.columns):
        p.error(
            f"{args.tsv_path} must contain 'sample_id', 'output_bamout_bam', 'variants_tsv_bgz' columns"
        )

    num_groups = int(math.ceil(len(df) / args.group_size))
    logging.info(
        f"Creating {num_groups} group(s) with {args.group_size} samples in each"
    )

    groups = []
    for i in range(num_groups):
        if args.num_groups_to_process and i >= args.num_groups_to_process:
            break
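        # stride through the table so rows are assigned to groups round-robin (group i gets rows i, i+num_groups, ...)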
        group = df.iloc[i::num_groups]
        groups.append(group)

        logging.info(f"--- group #{i}:")
        logging.info(group)

    # check that all buckets are in "US-CENTRAL1" or are multi-regional to avoid egress charges to the Batch cluster
    batch_utils.set_gcloud_project(GCLOUD_PROJECT)

    if not args.skip_step1:
        existing_combined_bamout_bams = batch_utils.generate_path_to_file_size_dict(
            f"{OUTPUT_BUCKET}/*.bam")
        input_bam_size_dict = batch_utils.generate_path_to_file_size_dict(
            f"{INPUT_BAM_BUCKET}/*.deidentify_output.sorted.bam")

    if not args.skip_step2:
        existing_combined_dbs = batch_utils.generate_path_to_file_size_dict(
            f"{OUTPUT_BUCKET}/*.chr*.db")
        input_db_size_dict = batch_utils.generate_path_to_file_size_dict(
            f"{INPUT_BAM_BUCKET}/*.deidentify_output.db")

    # process groups
    with batch_utils.run_batch(
            args,
            batch_name=
            f"combine readviz bams: {len(groups)} group(s) (gs{args.group_size}_gn{num_groups}__s{len(df)})"
    ) as batch:
        chrom_to_combine_db_jobs = collections.defaultdict(list)
        chrom_to_combined_db_paths = collections.defaultdict(list)
        errors = 0
        temp_dir = "./temp_sql_files__combine_group"
        for i, group in enumerate(tqdm.tqdm(groups, unit=" groups")):
            md5_hash = hashlib.md5(", ".join(sorted(list(
                group.sample_id))).encode('utf-8')).hexdigest()
            combined_bamout_id = f"s{len(df)}_gs{args.group_size}_gn{num_groups}_gi{i:04d}_h{md5_hash[-9:]}"

            for chrom in ALL_CHROMOSOMES:
                chrom_to_combined_db_paths[chrom].append(
                    f"{args.output_dir}/{combined_bamout_id}.chr{chrom}.db")

            if not args.skip_step1 and not args.db_names_to_process:
                errors += combine_bam_files_in_group(
                    args, batch, combined_bamout_id, group,
                    input_bam_size_dict, existing_combined_bamout_bams)

            if not args.skip_step2:
                errors += combine_db_files_in_group_for_chrom(
                    args,
                    batch,
                    combined_bamout_id,
                    group,
                    chrom_to_combine_db_jobs,
                    input_db_size_dict,
                    existing_combined_dbs,
                    temp_dir=temp_dir)

        if not args.skip_step2:
            os.system(f"gsutil -m cp -r {temp_dir} gs://gnomad-bw2/")

        temp_dir = "./temp_sql_files__combine_all_per_chrom"
        if not args.skip_step3 and not args.num_groups_to_process and not errors:
            # only do this after processing all groups
            combine_all_dbs_for_chrom(
                args,
                batch,
                f"s{len(df)}_gs{args.group_size}_gn{num_groups}",
                chrom_to_combined_db_paths,
                chrom_to_combine_db_jobs,
                temp_dir=temp_dir)
            os.system(f"gsutil -m cp -r {temp_dir} gs://gnomad-bw2/")