Example #1
    def test_hadoop_stat(self):
        path1 = resource('ls_test')
        stat1 = hl.hadoop_stat(path1)
        self.assertEqual(stat1['is_dir'], True)

        path2 = resource('ls_test/f_50')
        stat2 = hl.hadoop_stat(path2)
        self.assertEqual(stat2['size_bytes'], 50)
        self.assertEqual(stat2['is_dir'], False)
        self.assertTrue('path' in stat2)
Example #2
    def test_hadoop_stat(self):
        stat1 = hl.hadoop_stat(f'{BUCKET}/')
        self.assertEqual(stat1['is_dir'], True)

        stat2 = hl.hadoop_stat(f'{BUCKET}/test_out.copy.txt.gz')
        self.assertEqual(stat2['size_bytes'], 302)
        self.assertEqual(stat2['is_dir'], False)
        self.assertTrue('path' in stat2)
        self.assertTrue('owner' in stat2)
        self.assertTrue('modification_time' in stat2)
Example #3
    def test_hadoop_stat(self):
        path1 = resource('ls_test')
        stat1 = hl.hadoop_stat(path1)
        self.assertEqual(stat1['is_dir'], True)

        path2 = resource('ls_test/f_50')
        stat2 = hl.hadoop_stat(path2)
        self.assertEqual(stat2['size_bytes'], 50)
        self.assertEqual(stat2['is_dir'], False)
        self.assertTrue('path' in stat2)
        self.assertTrue('owner' in stat2)
        self.assertTrue('modification_time' in stat2)
Example #4
    def test_hadoop_stat(self, bucket=None):
        if bucket is None:
            bucket = self.remote_bucket

        stat1 = hl.hadoop_stat(f'{bucket}/')
        self.assertEqual(stat1['is_dir'], True)

        stat2 = hl.hadoop_stat(f'{bucket}/test_out.copy.txt.gz')
        self.assertEqual(stat2['size_bytes'], 302)
        self.assertEqual(stat2['is_dir'], False)
        self.assertTrue('path' in stat2)
        self.assertTrue('owner' in stat2)
        self.assertTrue('modification_time' in stat2)
Example #5
    def test_hadoop_stat(self, prefix: Optional[str] = None):
        if prefix is None:
            prefix = self.remote_tmpdir

        data = ['foo', 'bar', 'baz']
        data.extend(map(str, range(100)))
        with hadoop_open(f'{prefix}/test_hadoop_stat.txt.gz', 'w') as f:
            for d in data:
                f.write(d)
                f.write('\n')

        stat1 = hl.hadoop_stat(f'{prefix}')
        self.assertEqual(stat1['is_dir'], True)

        hadoop_copy(f'{prefix}/test_hadoop_stat.txt.gz',
                    f'{prefix}/test_hadoop_stat.copy.txt.gz')

        stat2 = hl.hadoop_stat(f'{prefix}/test_hadoop_stat.copy.txt.gz')
        # The gzip format permits metadata which makes the compressed file's size unpredictable. In
        # practice, Hadoop creates a 175 byte file and gzip.GzipFile creates a 202 byte file. The 27
        # extra bytes appear to include at least the filename (20 bytes) and a modification timestamp.
        assert stat2['size_bytes'] == 175 or stat2['size_bytes'] == 202
        self.assertEqual(stat2['is_dir'], False)
        self.assertTrue('path' in stat2)
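A side note on the gzip comment in Example #5: the stored filename is one of the optional gzip header fields that changes the compressed size. A minimal standalone sketch using only the standard library (not taken from the example's codebase):

import gzip
import io

data = b"x" * 1000

# GzipFile records the given filename (plus a NUL terminator) in the gzip header
buf_named = io.BytesIO()
with gzip.GzipFile(filename="test_hadoop_stat.txt", mode="wb", fileobj=buf_named) as f:
    f.write(data)

# with no filename and a fixed mtime, the header contains no variable-length fields
buf_bare = io.BytesIO()
with gzip.GzipFile(mode="wb", fileobj=buf_bare, mtime=0) as f:
    f.write(data)

# the difference is exactly len("test_hadoop_stat.txt") + 1 bytes
print(len(buf_named.getvalue()) - len(buf_bare.getvalue()))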
Example #6
 def test_hadoop_copy_log(self):
     with with_local_temp_file('log') as r:
         hl.copy_log(r)
         stats = hl.hadoop_stat(r)
         self.assertTrue(stats['size_bytes'] > 0)
Example #7
 def test_hadoop_copy_log(self):
     r = resource('copy_log_test.txt')
     hl.copy_log(r)
     stats = hl.hadoop_stat(r)
     self.assertTrue(stats['size_bytes'] > 0)
Example #8
 def modified_time(self, path):  # pylint: disable=no-self-use
     stat = hl.hadoop_stat(path)
     return datetime.datetime.strptime(stat["modification_time"],
                                       "%a %b %d %H:%M:%S %Z %Y")
Example #9
def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()
    #gtex_rnaseq_sample_metadata_df = get_gtex_rnaseq_sample_metadata_df()

    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument(
        "-b",
        "--rnaseq-batch-name",
        nargs="*",
        help="RNA-seq batch names to process (eg. -b batch1 batch2)",
        choices=set(rnaseq_sample_metadata_df['star_pipeline_batch'])
        | set(["gtex_muscle", "gtex_fibroblasts", "gtex_blood"]))
    grp.add_argument(
        "-s",
        "--rnaseq-sample-id",
        nargs="*",
        help="RNA-seq sample IDs to process (eg. -s sample1 sample2)",
        choices=set(rnaseq_sample_metadata_df['sample_id']) | set([
            'GTEX-1LG7Z-0005-SM-DKPQ6', 'GTEX-PX3G-0006-SM-5SI7E',
            'GTEX-1KXAM-0005-SM-DIPEC'
        ]))
    args = p.parse_args()

    # Generate samples_df with these columns: sample_id, bam_path, bai_path, output_dir, batch_name, sex, RIN, ancestry, etc.
    samples_df = pd.DataFrame()
    if args.rnaseq_batch_name:
        for batch_name in args.rnaseq_batch_name:
            df = rnaseq_sample_metadata_df[
                rnaseq_sample_metadata_df['star_pipeline_batch'] == batch_name]
            samples_df = transfer_metadata_columns_from_df(samples_df, df)

    elif args.rnaseq_sample_id:
        df = rnaseq_sample_metadata_df[
            rnaseq_sample_metadata_df.sample_id.isin(set(
                args.rnaseq_sample_id))]
        samples_df = transfer_metadata_columns_from_df(samples_df, df)
    else:
        p.error("Must specify -b or -s")

    logger.info(
        f"Processing {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}"
    )

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    with batch_utils.run_batch(args) as batch:
        for sample_id in samples_df.sample_id:
            metadata_row = samples_df.loc[sample_id]

            # set job inputs & outputs
            input_bam, input_bai = metadata_row['bam_path'], metadata_row[
                'bai_path']
            output_dir = metadata_row['output_dir']

            print("Input bam: ", input_bam)
            output_filename = f"{sample_id}.bigWig"
            output_file_path = os.path.join(output_dir, output_filename)

            # check if output file already exists
            if hl.hadoop_is_file(output_file_path) and not args.force:
                logger.info(
                    f"{sample_id} output file already exists: {output_file_path}. Skipping..."
                )
                continue

            # request disk roughly twice the input BAM size, in whole GB
            file_stats = hl.hadoop_stat(metadata_row['bam_path'])
            bam_size = int(round(file_stats['size_bytes'] / 10.**9))
            disk_size = bam_size * 2

            j = batch_utils.init_job(batch,
                                     f"bam=>bigWig: {sample_id}",
                                     cpu=args.cpu,
                                     memory=args.memory,
                                     disk_size=disk_size,
                                     image=DOCKER_IMAGE)
            batch_utils.switch_gcloud_auth_to_user_account(
                j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT,
                GCLOUD_PROJECT)

            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bam} {sample_id}.bam"
            )
            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bai} {sample_id}.bam.bai"
            )
            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp gs://gtex-resources/references/GRCh38.chrsizes ."
            )
            j.command(f"touch {sample_id}.bam.bai")

            j.command(f"pwd && ls && date")

            j.command(
                f"python3 /src/bam2coverage.py {sample_id}.bam GRCh38.chrsizes {sample_id}"
            )
            j.command(f"cp {output_filename} {j.output_bigWig}")
            j.command(f"echo Done: {output_file_path}")
            j.command(f"date")

            # copy output
            batch.write_output(j.output_bigWig, output_file_path)

            print("Output file path: ", output_file_path)
Example #10
def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()

    p = argparse.ArgumentParser()
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("--local", action="store_true", help="Batch: run locally")
    grp.add_argument("--cluster",
                     action="store_true",
                     help="Batch: submit to cluster")
    p.add_argument(
        "--batch-billing-project",
        default="tgg-rare-disease",
        help="Batch: billing project. Required if submitting to cluster.")
    p.add_argument("--batch-job-name", help="Batch: (optional) job name")

    p.add_argument(
        "-f",
        "--force",
        action="store_true",
        help="Recompute and overwrite cached or previously computed data")
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("-b",
                     "--rnaseq-batch-name",
                     nargs="*",
                     help="RNA-seq batch names to process",
                     choices=set(
                         rnaseq_sample_metadata_df['star_pipeline_batch']))
    grp.add_argument("-s",
                     "--rnaseq-sample-id",
                     nargs="*",
                     help="RNA-seq sample IDs to process",
                     choices=set(rnaseq_sample_metadata_df['sample_id']))
    args = p.parse_args()

    #logger.info("\n".join(df.columns))

    if args.rnaseq_batch_name:
        batch_names = args.rnaseq_batch_name
        sample_ids = rnaseq_sample_metadata_df[rnaseq_sample_metadata_df[
            'star_pipeline_batch'].isin(batch_names)].sample_id
    elif args.rnaseq_sample_id:
        sample_ids = args.rnaseq_sample_id

    logger.info(
        f"Processing {len(sample_ids)} sample ids: {', '.join(sample_ids)}")

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    if args.local:
        backend = hb.LocalBackend(gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    else:
        backend = hb.ServiceBackend(args.batch_billing_project)

    b = hb.Batch(backend=backend, name=args.batch_job_name)

    # define workflow inputs
    if args.local:
        genes_gtf = b.read_input("gencode.v26.annotation.gff3",
                                 extension=".gff3")
    else:
        genes_gtf = b.read_input(
            "gs://macarthurlab-rnaseq/ref/gencode.v26.annotation.GRCh38.gff3",
            extension=".gff3")

    # define parallel execution for samples
    for sample_id in sample_ids:
        metadata_row = rnaseq_sample_metadata_df.loc[sample_id]
        batch_name = metadata_row['star_pipeline_batch']

        # set job inputs & outputs
        input_read_data = b.read_input_group(
            bam=metadata_row['star_bam'],
            bai=metadata_row['star_bai'],
        )

        output_dir = f"gs://macarthurlab-rnaseq/{batch_name}/majiq_build/"
        output_file_path = os.path.join(output_dir,
                                        f"majiq_build_{sample_id}.tar.gz")

        # check if output file already exists
        if hl.hadoop_is_file(output_file_path) and not args.force:
            logger.info(
                f"{sample_id} output file already exists: {output_file_path}. Skipping..."
            )
            continue

        file_stats = hl.hadoop_stat(metadata_row['star_bam'])
        bam_size = int(round(file_stats['size_bytes'] / 10.**9))

        # define majiq build commands for this sample
        j = b.new_job(name=args.batch_job_name)
        j.image("weisburd/majiq:latest")
        j.storage(f'{bam_size*3}Gi')
        j.cpu(1)  # default: 1
        j.memory("15G")  # default: 3.75G
        logger.info(
            f'Requesting: {j._storage or "default"} storage, {j._cpu or "default"} CPU, {j._memory or "default"} memory'
        )

        # switch to user account
        j.command(
            f"gcloud auth activate-service-account --key-file /gsa-key/key.json"
        )
        j.command(
            f"gsutil -m cp -r {GCLOUD_CREDENTIALS_LOCATION}/.config /tmp/")
        j.command(f"rm -rf ~/.config")
        j.command(f"mv /tmp/.config ~/")
        j.command(f"gcloud config set account {GCLOUD_USER_ACCOUNT}")
        j.command(f"gcloud config set project {GCLOUD_PROJECT}")

        # run majiq build
        #j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp gs://gtex-resources/GENCODE/gencode.v26.GRCh38.ERCC.genes.collapsed_only.gtf .")
        j.command(f"mv {genes_gtf} gencode.gff3")
        j.command(f"mv {input_read_data.bam} {sample_id}.bam")
        j.command(f"mv {input_read_data.bai} {sample_id}.bam.bai")

        j.command(f"echo '[info]' >> majiq_build.cfg")
        j.command(
            f"echo 'readlen={metadata_row['read length (rnaseqc)']}' >> majiq_build.cfg"
        )
        j.command(f"echo 'bamdirs=.' >> majiq_build.cfg")
        j.command(f"echo 'genome=hg38' >> majiq_build.cfg")
        j.command(
            f"echo 'strandness={'None' if metadata_row['stranded? (rnaseqc)'] == 'no' else 'reverse'}' >> majiq_build.cfg"
        )
        j.command(f"echo '[experiments]' >> majiq_build.cfg")
        j.command(f"echo '{sample_id}={sample_id}' >> majiq_build.cfg")

        j.command(f"cat majiq_build.cfg >> {j.logfile}")
        j.command(
            f"majiq build gencode.gff3 -c majiq_build.cfg -j 1 -o majiq_build_{sample_id} >> {j.logfile}"
        )

        j.command(
            f"tar czf majiq_build_{sample_id}.tar.gz majiq_build_{sample_id}")
        j.command(f"cp majiq_build_{sample_id}.tar.gz {j.output_tar_gz}")

        #j.command(f"ls -lh . >> {j.logfile}")
        #j.command(f"echo ls majiq_build_{sample_id} >> {j.logfile}")
        #j.command(f"ls -1 majiq_build_{sample_id} >> {j.logfile}")
        j.command(f"echo --- done  {output_file_path} >> {j.logfile}")

        # copy output
        b.write_output(j.output_tar_gz, output_file_path)
        b.write_output(
            j.logfile, os.path.join(output_dir,
                                    f"majiq_build_{sample_id}.log"))

    b.run()

    if isinstance(backend, hb.ServiceBackend):
        backend.close()
Example #11
def main():
    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    p.add_argument("--with-gtex",
                   help="Use GTEX controls.",
                   action="store_true")
    p.add_argument("--skip-step1",
                   action="store_true",
                   help="Skip count-split-reads step")
    p.add_argument("--skip-step2",
                   action="store_true",
                   help="Skip compute-PSI step")
    p.add_argument("--skip-step3",
                   action="store_true",
                   help="Skip compute-best-Q step")
    p.add_argument("-m1",
                   "--memory-step1",
                   type=float,
                   help="Batch: (optional) memory in gigabytes (eg. 3.75)",
                   default=3.75)
    p.add_argument("-m2",
                   "--memory-step2",
                   type=float,
                   help="Batch: (optional) memory in gigabytes (eg. 3.75)",
                   default=3.75)
    p.add_argument(
        "--metadata-tsv-path",
        default=ALL_METADATA_TSV,
        help="Table with columns: sample_id, bam_path, bai_path, batch")
    p.add_argument("batch_name",
                   nargs="+",
                   choices=ANALYSIS_BATCHES.keys(),
                   help="Name of RNA-seq batch to process")
    args = p.parse_args()

    hl.init(log="/dev/null", quiet=True)

    with hl.hadoop_open(args.metadata_tsv_path) as f:
        samples_df_unmodified = pd.read_table(f).set_index("sample_id",
                                                           drop=False)

    batch_label = f"FRASER"
    if args.with_gtex:
        batch_label += " (with GTEx)"
    batch_label += ": "
    batch_label += ','.join(args.batch_name)
    with batch_utils.run_batch(args, batch_label) as batch:

        for batch_name in args.batch_name:
            samples_df = samples_df_unmodified
            batch_dict = ANALYSIS_BATCHES[batch_name]
            batch_tissue = batch_dict['tissue']
            batch_sex = batch_dict['sex']

            sample_ids = list(batch_dict['samples'])
            if args.with_gtex:
                batch_name += "_with_GTEX"
                samples_df_filter = (samples_df.tissue == batch_tissue)
                samples_df_filter &= samples_df.sample_id.str.startswith(
                    "GTEX")
                if batch_sex == "M" or batch_sex == "F":
                    samples_df_filter &= (samples_df.sex == batch_sex)
                sample_ids += list(samples_df[samples_df_filter].sample_id)
            else:
                batch_name += "_without_GTEX"

            samples_df = samples_df.loc[sample_ids]
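            # derive a stable label for this sample set from the md5 of the sorted sample ids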
            byte_string = ", ".join(sorted(samples_df.sample_id)).encode()
            h = hashlib.md5(byte_string).hexdigest().upper()
            sample_set_label = f"{batch_name}__{len(samples_df.sample_id)}_samples_{h[:10]}"

            logger.info(
                f"Processing {sample_set_label}: {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}"
            )

            split_reads_samples = []

            split_reads_output_files = []
            split_reads_jobs = {}

            non_split_reads_output_files = []
            non_split_reads_jobs = {}

            j_extract_splice_junctions = None
            j_calculate_psi_values = None
            j_calculate_best_q = None

            # based on docs @ https://bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf
            # step 1: count spliced reads
            # step 2: count non-spliced reads at acceptors & donors of splice junctions detected in step 1
            for step in 1, 2:
                for sample_id in samples_df.sample_id:
                    metadata_row = samples_df.loc[sample_id]

                    # set job inputs & outputs
                    input_bam, input_bai = metadata_row[
                        'bam_path'], metadata_row['bai_path']
                    if "GTEX" in sample_id:
                        output_dir_for_sample_specific_data = "gs://macarthurlab-rnaseq/gtex_v8/fraser_count_rna/"
                    else:
                        output_dir_for_sample_specific_data = f"gs://macarthurlab-rnaseq/{metadata_row['batch']}/fraser_count_rna/"

                    output_dir_for_batch_specific_data = f"gs://macarthurlab-rnaseq/gagneur/fraser/results/{sample_set_label}"

                    output_file_path_splice_junctions_RDS = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"spliceJunctions_{sample_set_label}.RDS")
                    output_file_path_calculated_psi_values_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"calculatedPSIValues_{sample_set_label}.tar.gz")
                    output_file_path_calculated_best_q_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"calculatedBestQ_{sample_set_label}.tar.gz")
                    output_file_path_fraser_analysis_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"fraserAnalysis_using_PCA_{sample_set_label}.tar.gz")
                    output_file_path_fraser_analysis_results_only_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"fraserAnalysis_using_PCA_{sample_set_label}_results_only.tar.gz"
                    )

                    print("Input bam: ", input_bam)
                    if step == 1:
                        output_file_path = os.path.join(
                            output_dir_for_sample_specific_data,
                            f"fraser_count_split_reads_{sample_id}.tar.gz")
                        memory = args.memory_step1
                    elif step == 2:
                        output_file_path = os.path.join(
                            output_dir_for_batch_specific_data,
                            f"fraser_count_non_split_reads_{sample_id}__{sample_set_label}.tar.gz"
                        )
                        memory = args.memory_step2

                    if step == 1:
                        split_reads_samples.append(sample_id)
                        split_reads_output_files.append(output_file_path)
                    elif step == 2:
                        non_split_reads_output_files.append(output_file_path)

                    if (step == 1
                            and args.skip_step1) or (step == 2
                                                     and args.skip_step2):
                        continue

                    # check if output file already exists
                    if not args.force and hl.hadoop_is_file(output_file_path):
                        logger.info(
                            f"{sample_id} output file already exists: {output_file_path}. Skipping..."
                        )
                        continue

                    if not args.local:
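                        # on the cluster, request disk roughly twice the BAM size (in whole GB); local runs use the default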
                        file_stats = hl.hadoop_stat(metadata_row['bam_path'])
                        bam_size = int(round(file_stats['size_bytes'] /
                                             10.**9))
                        disk_size = bam_size * 2
                    else:
                        disk_size = None

                    job_label = f"Count {'split' if step == 1 else 'non-split'} reads"
                    j = batch_utils.init_job(batch,
                                             f"{job_label}: {sample_id}",
                                             cpu=args.cpu,
                                             memory=memory,
                                             disk_size=disk_size,
                                             image=DOCKER_IMAGE)
                    batch_utils.switch_gcloud_auth_to_user_account(
                        j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT,
                        GCLOUD_PROJECT)

                    j.command(
                        f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bam} {sample_id}.bam"
                    )
                    j.command(
                        f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bai} {sample_id}.bam.bai"
                    )
                    j.command(f"touch {sample_id}.bam.bai")
                    bam_path = f"{sample_id}.bam"

                    j.command(f"pwd && ls -lh && date")

                    if step == 1:
                        # count split reads
                        j.command(f"""time xvfb-run Rscript -e '
library(FRASER)
library(data.table)

sampleTable = data.table(sampleID=c("{sample_id}"), bamFile=c("{bam_path}"))
print(sampleTable)
fds = FraserDataSet(colData=sampleTable, workingDir=".", bamParam=ScanBamParam(mapqFilter=0), strandSpecific=0L)

getSplitReadCountsForAllSamples(fds)  # saves results to cache/
'""")
                    elif step == 2:
                        if sample_id in split_reads_jobs:
                            j.depends_on(split_reads_jobs[sample_id])
                        if j_extract_splice_junctions:
                            j.depends_on(j_extract_splice_junctions)

                        j.command(
                            f"gsutil -m cp {output_file_path_splice_junctions_RDS} ."
                        )

                        # count non-split reads
                        j.command(f"""time xvfb-run Rscript -e '
library(FRASER)
library(data.table)

spliceJunctions = readRDS("{os.path.basename(output_file_path_splice_junctions_RDS)}")

sampleTable = data.table(sampleID=c("{sample_id}"), bamFile=c("{bam_path}"))
print(sampleTable)

fds = FraserDataSet(colData=sampleTable, workingDir=".", bamParam=ScanBamParam(mapqFilter=0), strandSpecific=0L)

getNonSplitReadCountsForAllSamples(fds, spliceJunctions)  # saves results to cache/
'""")
                    j.command(f"ls -lh .")
                    j.command(
                        f"tar czf {os.path.basename(output_file_path)} cache")
                    j.command(
                        f"gsutil -m cp {os.path.basename(output_file_path)} {output_file_path}"
                    )

                    j.command(f"echo Done: {output_file_path}")
                    j.command(f"date")

                    print("Output file path: ", output_file_path)

                    if step == 1:
                        split_reads_jobs[sample_id] = j
                    elif step == 2:
                        non_split_reads_jobs[sample_id] = j

                if len(split_reads_output_files) == 0:
                    break

                if step == 1 and not args.skip_step1:
                    if hl.hadoop_is_file(output_file_path_splice_junctions_RDS
                                         ) and not args.force:
                        logger.info(
                            f"{output_file_path_splice_junctions_RDS} file already exists. Skipping extractSpliceJunctions step..."
                        )
                        continue

                    j_extract_splice_junctions = batch_utils.init_job(
                        batch,
                        f"{sample_set_label}: Extract splice-junctions",
                        disk_size=30,
                        memory=60,
                        image=DOCKER_IMAGE)
                    for j in split_reads_jobs.values():
                        j_extract_splice_junctions.depends_on(j)

                    extract_splice_junctions(
                        j_extract_splice_junctions, split_reads_output_files,
                        args.cpu, output_file_path_splice_junctions_RDS)

                elif step == 2 and not args.skip_step2:
                    if hl.hadoop_is_file(
                            output_file_path_calculated_psi_values_tar_gz
                    ) and not args.force:
                        logger.info(
                            f"{output_file_path_calculated_psi_values_tar_gz} file already exists. Skipping calculatePSIValues step..."
                        )
                        continue

                    num_cpu = 4 if args.local else 16
                    memory = 60
                    j_calculate_psi_values = batch_utils.init_job(
                        batch,
                        f"{sample_set_label}: Calculate PSI values",
                        disk_size=50,
                        cpu=num_cpu,
                        memory=memory,
                        image=DOCKER_IMAGE)
                    if j_extract_splice_junctions:
                        j_calculate_psi_values.depends_on(
                            j_extract_splice_junctions)
                    for j in non_split_reads_jobs.values():
                        j_calculate_psi_values.depends_on(j)

                    calculate_psi_values(
                        j_calculate_psi_values, sample_set_label,
                        split_reads_output_files, non_split_reads_output_files,
                        output_file_path_splice_junctions_RDS,
                        args.metadata_tsv_path, num_cpu,
                        output_file_path_calculated_psi_values_tar_gz)

            # compute Best Q
            if args.skip_step3:
                logger.info(f"Skipping calculatedBestQ step...")
            elif hl.hadoop_is_file(output_file_path_calculated_best_q_tar_gz
                                   ) and not args.force:
                logger.info(
                    f"{output_file_path_calculated_best_q_tar_gz} file already exists. Skipping calculatedBestQ step..."
                )
            else:
                num_cpu = 4 if args.local else 16
                memory = 3.75 * num_cpu

                j_calculate_best_q = batch_utils.init_job(
                    batch,
                    f"{sample_set_label}: Calculate Best Q",
                    disk_size=50,
                    cpu=num_cpu,
                    memory=memory,
                    image=DOCKER_IMAGE)

                if j_calculate_psi_values:
                    j_calculate_best_q.depends_on(j_calculate_psi_values)

                calculate_best_q(
                    j_calculate_best_q, sample_set_label, 4,
                    output_file_path_calculated_psi_values_tar_gz,
                    output_file_path_calculated_best_q_tar_gz)

            # output_file_path_fraser_analysis_tar_gz
            if hl.hadoop_is_file(
                    output_file_path_fraser_analysis_results_only_tar_gz
            ) and not args.force:
                logger.info(
                    f"{output_file_path_fraser_analysis_results_only_tar_gz} file already exists. Skipping run_fraser_analysis step..."
                )
            else:
                num_cpu = 4 if args.local else 16
                memory = 3.75 * num_cpu

                j_fraser_analysis = batch_utils.init_job(
                    batch,
                    f"{sample_set_label}: Run Fraser Analysis",
                    disk_size=50,
                    cpu=num_cpu,
                    memory=memory,
                    image=DOCKER_IMAGE)
                if j_calculate_best_q:
                    j_fraser_analysis.depends_on(j_calculate_best_q)

                run_fraser_analysis(
                    j_fraser_analysis, sample_set_label, 4,
                    output_file_path_calculated_best_q_tar_gz,
                    output_file_path_fraser_analysis_tar_gz,
                    output_file_path_fraser_analysis_results_only_tar_gz)
Example #12
def main():

    p, args = parse_args()

    df = pd.read_table(args.cram_and_tsv_paths_table)
    if {"sample_id", "output_bamout_bam", "output_bamout_bai", "variants_tsv_bgz"} - set(df.columns):
        p.error(f"{args.tsv_path} must contain 'sample_id', 'output_bamout_bam', 'variants_tsv_bgz' columns")

    if args.num_samples_to_process:
        if args.random:
            df = df.sample(n=args.num_samples_to_process)
        else:
            df = df.iloc[:args.num_samples_to_process]

    if args.sample_to_process:
        df = df[df.sample_id.isin(set(args.sample_to_process))]

    logging.info(f"Processing {len(df)} samples")

    # check that all buckets are in "US-CENTRAL1" or are multi-regional to avoid egress charges to the Batch cluster
    batch_utils.set_gcloud_project(GCLOUD_PROJECT)
    with open("deidentify_bamout.py", "rt") as f:
        deidentify_bamouts_script = f.read()

    # process sample(s)
    if not args.sample_to_process and not args.num_samples_to_process:
        # if processing entire table, listing all files up front ends up being faster
        existing_deidentify_output_bams = subprocess.check_output(f"gsutil -m ls {args.output_dir}/*.deidentify_output.bam", shell=True, encoding="UTF-8").strip().split("\n")
        existing_deidentify_output_sorted_bams = subprocess.check_output(f"gsutil -m ls {args.output_dir}/*.deidentify_output.sorted.bam", shell=True, encoding="UTF-8").strip().split("\n")

    hl.init(log="/dev/null")
    with batch_utils.run_batch(args, batch_name=f"deidentify bamouts: {len(df)} samples") as batch:
        for _, row in tqdm.tqdm(df.iterrows(), unit=" samples"):
            output_bam_path = os.path.join(args.output_dir, f"{row.sample_id}.deidentify_output.bam")
            output_sorted_bam_path = os.path.join(args.output_dir, f"{row.sample_id}.deidentify_output.sorted.bam")

            if args.sample_to_process or args.num_samples_to_process:
                run_deidentify = args.force or not hl.hadoop_is_file(output_bam_path)
                run_sort = run_deidentify or not hl.hadoop_is_file(output_sorted_bam_path)
            else:
                run_deidentify = args.force or output_bam_path not in existing_deidentify_output_bams
                run_sort = run_deidentify or output_sorted_bam_path not in existing_deidentify_output_sorted_bams

            if run_deidentify or run_sort:
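                # scale the requested CPU with the bamout size: 0.25 CPU up to 5 GB, then 0.5, 1, or 2 CPU for files over 5, 10, or 20 GB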
                bamout_stat = hl.hadoop_stat(row.output_bamout_bam)
                cpu = 0.25
                if bamout_stat['size_bytes'] > 0.25 * 20_000_000_000:
                    cpu = 0.5
                if bamout_stat['size_bytes'] > 0.5 * 20_000_000_000:
                    cpu = 1
                if bamout_stat['size_bytes'] > 1 * 20_000_000_000:
                    cpu = 2

            if run_deidentify:
                j = batch_utils.init_job(batch, f"{row.sample_id} - deidentify - cpu:{cpu}", DOCKER_IMAGE if not args.raw else None, cpu=cpu, disk_size=21*cpu)
                batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)

                local_tsv_path = batch_utils.localize_file(j, row.variants_tsv_bgz, use_gcsfuse=True)
                local_exclude_tsv_path = batch_utils.localize_file(j, row.exclude_variants_tsv_bgz, use_gcsfuse=True)
                local_bamout_path = batch_utils.localize_file(j, row.output_bamout_bam, use_gcsfuse=True)

                batch_utils.localize_file(j, row.output_bamout_bai, use_gcsfuse=True)

                j.command(f"""echo --------------

echo "Start - time: $(date)"
df -kh

cat <<EOF > deidentify_bamout.py
{deidentify_bamouts_script}
EOF

time python3 deidentify_bamout.py -x "{local_exclude_tsv_path}" "{row.sample_id}" "{local_bamout_path}" "{local_tsv_path}"

ls -lh

gsutil -m cp "{row.sample_id}.deidentify_output.bam" {args.output_dir}/
gsutil -m cp "{row.sample_id}.deidentify_output.db"  {args.output_dir}/

echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo --------------

""")
            else:
                logger.info(f"Skipping deidentify {row.sample_id}...")

            if run_sort:
                j2 = batch_utils.init_job(batch, f"{row.sample_id} - sort - cpu:{cpu}", DOCKER_IMAGE if not args.raw else None, cpu=cpu)
                batch_utils.switch_gcloud_auth_to_user_account(j2, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)

                if run_deidentify:
                    j2.depends_on(j)

                local_bamout_path = batch_utils.localize_file(j2, output_bam_path, use_gcsfuse=True)

                j2.command(f"""echo --------------

echo "Start - time: $(date)"
df -kh

samtools sort -o "{row.sample_id}.deidentify_output.sorted.bam" "{local_bamout_path}"
samtools index "{row.sample_id}.deidentify_output.sorted.bam"

ls -lh

gsutil -m cp "{row.sample_id}.deidentify_output.sorted.bam"      {args.output_dir}/
gsutil -m cp "{row.sample_id}.deidentify_output.sorted.bam.bai"  {args.output_dir}/

echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo --------------

""")
            else:
                logger.info(f"Sorted output files exist (eg. {output_sorted_bam_path}). Skipping sort for {row.sample_id}...")