Example #1
    def test_hadoop_is_file(self):
        a_file = f'{BUCKET}/test_hadoop_is_file.txt'
        with hadoop_open(a_file, 'w') as f:
            f.write("HELLO WORLD")

        self.assertTrue(hl.hadoop_is_file(a_file))
        self.assertFalse(hl.hadoop_is_file(f'{BUCKET}/'))
        self.assertFalse(hl.hadoop_is_file(f'{BUCKET}/invalid-path'))
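For context on what the examples on this page exercise: hl.hadoop_is_file returns True only for paths that resolve to an actual file; directory paths (or object-store prefixes) and nonexistent paths return False. Below is a minimal sketch of how it relates to the neighboring helper hl.hadoop_exists; the bucket path is a placeholder, not one used by the examples here.

import hail as hl

BUCKET = 'gs://my-bucket/tmp'  # placeholder path; replace with a bucket you can write to

a_file = f'{BUCKET}/hello.txt'
with hl.hadoop_open(a_file, 'w') as f:
    f.write('HELLO WORLD')

print(hl.hadoop_is_file(a_file))                   # True: an actual file
print(hl.hadoop_is_file(f'{BUCKET}/'))             # False: a directory/prefix, not a file
print(hl.hadoop_exists(a_file))                    # True: exists, whether file or directory
print(hl.hadoop_is_file(f'{BUCKET}/missing.txt'))  # False: nothing at this path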
Example #2
def main():
    p = batch_utils.init_arg_parser(default_cpu=0.5, default_memory=1.75, gsa_key_file=os.path.expanduser("~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    p.add_argument("tsv_path", help="Table with header: sample_id, cram_path, crai_path")
    p.add_argument("sample_id", nargs="*", help="(optional) 1 or more sample_ids to process. If not specified, all rows in the .tsv will be processed.")
    args = p.parse_args()

    df = pd.read_table(args.tsv_path)
    if {"sample_id", "cram_path", "crai_path"} - set(df.columns):
        p.error(f"{args.tsv_path} must contain a 'sample_id', 'cram_path', 'crai_path' columns")

    if not args.force:
        hl.init(log="/dev/null", quiet=True)

    # process samples
    with batch_utils.run_batch(args, batch_name=f"extract chrM") as batch:
        for _, row in df.iterrows():
            if args.sample_id and row.sample_id not in set(args.sample_id):
                continue

            input_filename = os.path.basename(row.cram_path)
            prefix = input_filename.replace(".bam", "").replace(".cram", "")

            output_cram_path = os.path.join(OUTPUT_DIR, f"{prefix}.chrM.cram")
            output_crai_path = os.path.join(OUTPUT_DIR, f"{prefix}.chrM.cram.crai")

            if not args.force and hl.hadoop_is_file(output_cram_path) and hl.hadoop_is_file(output_crai_path):
                logger.info(f"Output files exist (eg. {output_cram_path}). Skipping {input_filename}...")
                continue

            j = batch_utils.init_job(batch, f"chrM: {row.sample_id}", DOCKER_IMAGE if not args.raw else None, args.cpu, args.memory)
            batch_utils.switch_gcloud_auth_to_user_account(j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT, GCLOUD_PROJECT)

            # copy inputs
            REF_PATHS = batch_utils.HG38_REF_PATHS
            fasta_filename = os.path.basename(parse.urlparse(REF_PATHS.fasta).path)

            j.command(f"""set -ex
                env
                gsutil -m cp {REF_PATHS.fasta} {REF_PATHS.fai} {REF_PATHS.dict} .
                java -Xms2g -jar /gatk.jar PrintReads \
                    -R {fasta_filename} \
                    -I {row.cram_path} \
                    --read-index {row.crai_path} \
                    -L chrM \
                    --gcs-project-for-requester-pays broad-mpg-gnomad \
                    -O {prefix}.chrM.bam
                        
                samtools view -C -T {fasta_filename} {prefix}.chrM.bam > {prefix}.chrM.cram
                samtools index {prefix}.chrM.cram {prefix}.chrM.cram.crai
                
                gsutil -m cp {prefix}.chrM.cram.crai {output_crai_path}
                gsutil -m cp {prefix}.chrM.cram {output_cram_path}
            """)

            logger.info(f"Submitted {row.sample_id}: {output_cram_path}")
Example #3
    def test_hadoop_is_file(self, prefix: Optional[str] = None):
        if prefix is None:
            prefix = self.remote_tmpdir

        a_file = f'{prefix}/test_hadoop_is_file.txt'
        with hadoop_open(a_file, 'w') as f:
            f.write("HELLO WORLD")

        self.assertTrue(hl.hadoop_is_file(a_file))
        self.assertFalse(hl.hadoop_is_file(f'{prefix}/'))
        self.assertFalse(hl.hadoop_is_file(f'{prefix}/invalid-path'))
Example #4
    def test_hadoop_is_file(self, bucket=None):
        if bucket is None:
            bucket = self.remote_bucket

        a_file = f'{bucket}/test_hadoop_is_file.txt'
        with hadoop_open(a_file, 'w') as f:
            f.write("HELLO WORLD")

        self.assertTrue(hl.hadoop_is_file(a_file))
        self.assertFalse(hl.hadoop_is_file(f'{bucket}/'))
        self.assertFalse(hl.hadoop_is_file(f'{bucket}/invalid-path'))
Example #5
def join_clump_hts(pop, not_pop, max_pops, high_quality=False, overwrite=False):
    r'''
    Wrapper for mwzj_hts_by_tree()
    '''
    assert not (not_pop and max_pops), '`not_pop` and `max_pops` cannot both be True'
    mt_path = get_clumping_results_path(pop=pop,
                                        not_pop=not_pop,
                                        max_pops=max_pops,
                                        high_quality=high_quality)
    if hl.hadoop_is_file(f'{mt_path}/_SUCCESS') and not overwrite:
        print(f'\nMT already written to {mt_path}! To overwrite, use overwrite=True')
        return
    else:
        print(f'Writing MT to {mt_path}')
    pop = pop.upper() if pop is not None else None
    
    clump_results_dir = (f'{ldprune_dir}/results{"_high_quality" if high_quality else ""}/'+
                         ('max_pops' if max_pops else f'{"not_" if not_pop else ""}{pop}'))
    ls = hl.hadoop_ls(f'{clump_results_dir}/*')
    all_hts = [x['path'] for x in ls if 'clump_results.ht' in x['path']]
    
    temp_dir = ('gs://ukbb-diverse-temp-30day/nb-temp/'+
                ('max_pops' if max_pops else f'{"not_" if not_pop else ""}{pop}')+
                f'{"-hq" if high_quality else ""}')
    globals_for_col_key = ukb_common.PHENO_KEY_FIELDS
    mt = mwzj_hts_by_tree(all_hts=all_hts,
                         temp_dir=temp_dir,
                         globals_for_col_key=globals_for_col_key)
#    mt = resume_mwzj(temp_dir=temp_dir, # NOTE: only use if all the temp hts have been created
#                     globals_for_col_key=globals_for_col_key)

    mt.write(mt_path, overwrite=overwrite)
Example #6
    def handle(self, *args, **options):
        samples = (IgvSample.objects.filter(
            individual__family__project__name__in=args
        ) if args else IgvSample.objects.all()).filter(
            file_path__startswith='gs://'
        ).prefetch_related('individual', 'individual__family__project')

        missing_counter = collections.defaultdict(int)
        guids_of_samples_with_missing_file = set()
        for sample in tqdm.tqdm(samples, unit=" samples"):
            if not hl.hadoop_is_file(sample.file_path):
                individual_id = sample.individual.individual_id
                project = sample.individual.family.project.name
                missing_counter[project] += 1
                logger.info('Individual: {}  file not found: {}'.format(individual_id, sample.file_path))
                if not options.get('dry_run'):
                    guids_of_samples_with_missing_file.add(sample.guid)

        if len(guids_of_samples_with_missing_file) > 0:
            IgvSample.bulk_update(user=None, update_json={'file_path': ''}, guid__in=guids_of_samples_with_missing_file)

        logger.info('---- DONE ----')
        logger.info('Checked {} samples'.format(len(samples)))
        if missing_counter:
            logger.info('{} files not found:'.format(sum(missing_counter.values())))
            for project_name, c in sorted(missing_counter.items(), key=lambda t: -t[1]):
                logger.info('   {} in {}'.format(c, project_name))
Example #7
def create_full_results_file(prune, overwrite=False):
    r'''
    Concatenates PRS-phenotype regression results into a single table.
    '''
    reg_path_regex = prs_dir + f'prs_phen_reg.*.*.n_remove_{int(n_remove_per_sex)}.seed_*.{"" if prune else "not_"}pruned*.tsv'
    ls = hl.hadoop_ls(reg_path_regex)
    reg_paths = sorted([f['path'] for f in ls])
    df_list = []
    for reg_path in reg_paths:
        with hl.hadoop_open(reg_path) as f:
            df_list.append(pd.read_csv(f, sep='\t'))
    df = pd.concat(df_list, sort=False)
    df.insert(1, 'phen_desc',
              df.phen.astype(str).apply(lambda x: phen_dict[x][0])
              )  # add phenotype description to dataframe

    all_reg_results_path = prs_dir + f'prs_phen_reg.all_phens.n_remove_{int(n_remove_per_sex)}.{"" if prune else "not_"}pruned.tsv'
    if hl.hadoop_is_file(all_reg_results_path) and not overwrite:
        print('\n... Full PRS-phen regression results already written! ...')
        print(all_reg_results_path)
    else:
        print('\n... Writing PRS-phen regression results ...')
        print(all_reg_results_path)
        with hl.hadoop_open(all_reg_results_path, 'w') as f:
            df.to_csv(f, sep='\t', index=False)
Example #8
def get_test_genotypes_mt(chrom, genotype_samples_ht_path, genotype_mt_path,
                          cases_only):
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    #
    if chrom == 'all':
        mt = get_filtered_mt_with_x()
    else:
        mt = get_filtered_mt(chrom=chrom, entry_fields=('dosage', ))

    if cases_only:  # restrict to T2D cases
        t2d_ht = hl.read_table(
            f'gs://ukbb-diverse-temp-30day/nb-scratch/t2d.ht/')
        t2d_ht = t2d_ht.filter(t2d_ht.both_sexes == 1)
        t2d_ht = t2d_ht.key_by('userId')
        mt = mt.filter_cols(hl.is_defined(t2d_ht[hl.int32(mt.s)]))

    mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))

    if not hl.hadoop_is_file(f'{genotype_samples_ht_path}/_SUCCESS'):
        samples = mt.s.take(10)
        mt = mt.filter_cols(hl.literal(samples).contains(mt.s))
        mt = mt.key_cols_by(userId=hl.int32(mt.s))
        mt.cols().add_index().write(genotype_samples_ht_path, overwrite=True)
    else:
        samples_ht = hl.read_table(genotype_samples_ht_path)
        samples = samples_ht.s.collect()
        mt = mt.filter_cols(hl.literal(samples).contains(mt.s))
        mt = mt.key_cols_by(userId=hl.int32(mt.s))

    mt = mt.select_entries('dosage')
    mt = mt.select_rows()
    mt = mt.select_cols()
    mt = mt.repartition(10)
    mt.write(genotype_mt_path)
Example #9
def add_seqr_sample_to_locals3(sample: SeqrSample):
    parts = parse_vcf_s3_path(sample.path_to_vcf)
    local_filename = "vcfs/" + str(sample.project) + "/" + parts['filename']
    if not hl.hadoop_is_file("hdfs:///user/hdfs/" + local_filename):
        os.system('aws s3 cp ' + sample.path_to_vcf + ' .')
        os.system('hdfs dfs -put ' + parts['filename'] + ' ' + local_filename)
        os.system('rm ' + parts['filename'])
    return local_filename
Example #10
def main(args):

    hl.init(log='/tmp/hail.log')

    n_max = 5000  # maximum number of samples in subset (equal to final sample size if there are sufficient samples for each population)
    subsets_dir = f'{bucket}/ld_prune/subsets_{round(n_max/1e3)}k'

    pops_list = get_pops_list(args)
    print(f'overwrite_plink: {args.overwrite_plink}')

    for pops in pops_list:
        pops_str = '-'.join(pops)
        ht_sample_path = f'{subsets_dir}/{pops_str}/{pops_str}.ht'
        bfile_prefix = f'{subsets_dir}/{pops_str}/{pops_str}'

        master_bfile_paths = [
            f'{bfile_prefix}.{suffix}' for suffix in ['bed', 'bim', 'fam']
        ]

        if not args.overwrite_plink and all(
                map(hl.hadoop_is_file,
                    [f'{ht_sample_path}/_SUCCESS'] + master_bfile_paths)):
            continue
        else:
            print(f'\n... Starting PLINK exports for {pops_str} ...')
            mt_pop = get_mt_filtered_by_pops(
                pops=pops,
                chrom='all',  # chrom='all' includes autosomes and chrX
                entry_fields=('GT', )
            )  # default entry_fields will be 'GP', we need 'GT' for exporting to PLINK
            if hl.hadoop_is_file(f'{ht_sample_path}/_SUCCESS'):
                ht_sample = hl.read_table(ht_sample_path)
                ht_sample_ct = ht_sample.count()
                print(f'... Subset ht already exists for pops={pops_str} ...')
                print(f'\nSubset ht sample ct: {ht_sample_ct}\n\n')
            else:

                print(f'\n\n... Getting sample subset ({pops_str}) ...\n')

                ht_sample = get_subset(mt_pop=mt_pop,
                                       pop_dict=pop_dict,
                                       pops=pops,
                                       n_max=n_max)

                ht_sample_ct = ht_sample.count()
                print(f'\n\nht_sample_ct: {ht_sample_ct}\n\n')
                ht_sample = ht_sample.checkpoint(ht_sample_path)

            print(f'... Exporting to PLINK ({pops_str}) ...')
            to_plink(pops=pops,
                     subsets_dir=subsets_dir,
                     mt=mt_pop,
                     ht_sample=ht_sample,
                     bfile_path=bfile_prefix,
                     overwrite=args.overwrite_plink)
Example #11
def remap_samples(
    original_mt_path: str,
    input_mt: hl.MatrixTable,
    pedigree: hl.Table,
    inferred_sex: str,
) -> Tuple[hl.MatrixTable, hl.Table]:
    """
    Rename `s` col in the MatrixTable and inferred sex ht.

    :param original_mt_path: Path to original MatrixTable location
    :param input_mt: MatrixTable 
    :param pedigree: Pedigree file from seqr loaded as a Hail Table
    :param inferred_sex: Path to text file of inferred sexes
    :return: mt and sex ht with sample names remapped
    """
    base_path = "/".join(
        dirname(original_mt_path).split("/")[:-1]) + ("/base/projects")
    project_list = list(set(pedigree.Project_GUID.collect()))

    # Get the list of hts containing sample remapping information for each project
    remap_hts = []

    sex_ht = hl.import_table(inferred_sex)

    for i in project_list:
        remap = f"{base_path}/{i}/{i}_remap.tsv"
        if hl.hadoop_is_file(remap):
            remap_ht = hl.import_table(remap)
            remap_ht = remap_ht.key_by("s", "seqr_id")
            remap_hts.append(remap_ht)

    logger.info("Found %d projects that need to be remapped.", len(remap_hts))

    if len(remap_hts) > 0:
        ht = remap_hts[0]
        for next_ht in remap_hts[1:]:
            ht = ht.join(next_ht, how="outer")

        # If a sample has a non-missing value for seqr_id, rename it to the sample name for the mt and sex ht
        ht = ht.key_by("s")
        input_mt = input_mt.annotate_cols(seqr_id=ht[input_mt.s].seqr_id)
        input_mt = input_mt.key_cols_by(s=hl.if_else(
            hl.is_missing(input_mt.seqr_id), input_mt.s, input_mt.seqr_id))

        sex_ht = sex_ht.annotate(seqr_id=ht[sex_ht.s].seqr_id).key_by("s")
        sex_ht = sex_ht.key_by(s=hl.if_else(hl.is_missing(sex_ht.seqr_id),
                                            sex_ht.s, sex_ht.seqr_id))
    else:
        sex_ht = sex_ht.key_by("s")

    return input_mt, sex_ht
Example #12
def run_gwas(mt,
             phen: str,
             sim_name: str,
             subset_idx: int,
             param_suffix: str,
             wd: str,
             is_logreg=True):
    assert {'GT', 'dosage'}.intersection(
        mt.entry
    ), "mt does not have an entry field named 'dosage' or 'GT' corresponding to genotype data"

    mt = mt.filter_cols(mt.subset_idx == subset_idx)
    mt = mt.filter_cols(hl.is_defined(mt[phen]))
    print(
        f'\n\ngwas sample count (subset {subset_idx}): {mt.count_cols()}\n\n')

    if 'dosage' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.dosage) / 2)
    elif 'GT' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)

    gwas_path = f'{wd}/gwas.{"logreg" if is_logreg else "linreg"}.{sim_name}.subset_{subset_idx}.{param_suffix}.tsv.gz'

    if not hl.hadoop_is_file(gwas_path):
        gt_field = mt.dosage if 'dosage' in mt.entry else mt.GT.n_alt_alleles()

        if is_logreg:
            gwas_ht = hl.logistic_regression_rows(test='wald',
                                                  y=mt[phen],
                                                  x=gt_field,
                                                  covariates=[1],
                                                  pass_through=['EAF'])
        else:
            gwas_ht = hl.linear_regression_rows(y=mt[phen],
                                                x=gt_field,
                                                covariates=[1],
                                                pass_through=['EAF'])
        gwas_ht.select('EAF', 'beta', 'standard_error',
                       'p_value').export(gwas_path)

    else:
        print(f'GWAS already run! ({gwas_path})')
        gwas_ht = hl.import_table(gwas_path, impute=True, force=True)
        gwas_ht = gwas_ht.annotate(locus=hl.parse_locus(gwas_ht.locus),
                                   alleles=gwas_ht.alleles.replace(
                                       r'\["', '').replace(r'"\]',
                                                           '').split('","'))
        gwas_ht = gwas_ht.key_by('locus', 'alleles')

    return gwas_ht
Example #13
def get_adj_betas(p, pop, not_pop, max_pops, pheno_key_dict, pheno_id,
                  high_quality, hail_script):
    r'''
    Wrapper method for both PLINK clumping and SBayesR
    '''
    output_dir = (
        f'{ldprune_dir}/results{"_high_quality" if high_quality else ""}/' +
        ('max_pops' if max_pops else f'{"not_" if not_pop else ""}{pop}') +
        f'/{pheno_id}')

    clump_output_txt = f'{output_dir}/clump_results.txt'  # PLINK clump output txt file
    clump_output_ht = f'{output_dir}/clump_results.ht'  # PLINK clump output hail table

    #    sbayesr_output_txt = f'{output_dir}/sbayesr_results-test.txt' # SBayesR output txt file
    #    sbayesr_output_ht = f'{output_dir}/sbayesr_results-test.ht' # SBayesR output hail table

    overwrite = False

    clump_file_exists = hl.hadoop_is_file(f'{clump_output_ht}/_SUCCESS')
    if not clump_file_exists or overwrite:
        if clump_file_exists and overwrite:
            print(
                f'\n\nWARNING: Existing results will be overwritten for {pheno_id} in {output_dir}!\n'
            )

        ss_dict = get_sumstats(p=p,
                               pop=pop,
                               not_pop=not_pop,
                               max_pops=max_pops,
                               pops=pheno_key_dict['pops'],
                               high_quality=high_quality,
                               pheno_id=pheno_id,
                               method='clump')

        run_method(p=p,
                   pop=pop,
                   not_pop=not_pop,
                   max_pops=max_pops,
                   pheno_id=pheno_id,
                   pheno_key_dict=pheno_key_dict,
                   hail_script=hail_script,
                   output_txt=clump_output_txt,
                   output_ht=clump_output_ht,
                   ss_dict=ss_dict,
                   method='clump')
    else:
        print(
            f'\n\nSkipping {pheno_id} because results ht exists and overwrite=False\n'
        )
Example #14
def check_vcf_existence(participant_data: str, vcf_col: str, sample_map: str,
                        output_bucket: str) -> Dict[str, str]:
    """For each participant specified in sample_map, checks that the vcf file exists, and if so, add the sample and vcf path to a dictionary

    :param str participant_data: participant data (downloaded data tab from terra)
    :param str vcf_col: name of column that contains vcf output
    :param str sample_map: path to file of samples to subset (tab-delimited participant_id and sample)
    :param str output_bucket: path to bucket to which results should be written

    :return: dictionary of samples for which the vcf existence was confirmed (sample as key, path to vcf as value)
    :rtype: Dict[str, str]
    """

    # create file that will contain the samples with confirmed vcfs and their paths
    out_vcf = hl.hadoop_open(f"{output_bucket}/vcfs_to_combine.list", "w")

    # create participants_of_interest dictionary which will contain the samples to which the results should be subset
    participants_of_interest = {}
    confirmed_vcfs = {}
    with hl.hadoop_open(sample_map, "r") as f:
        next(f)
        for line in f:
            line = line.rstrip()
            items = line.split("\t")
            participant, sample = items[0:2]
            participants_of_interest[participant] = 0

    # load in data from terra
    participant_info = hl.import_table(participant_data)
    df = participant_info.to_pandas()

    # check if the sample is in participants_of_interest, check that the vcf exists, and if yes to both, add to confirmed_vcfs dictionary
    for _, row in df.iterrows():
        participant_id = row["entity:participant_id"]
        sample = row["s"]
        vcf = row[vcf_col]

        if participant_id in participants_of_interest and vcf != "":
            if hl.hadoop_is_file(vcf):
                out_vcf.write(f"{sample}\t{vcf}\n")
                confirmed_vcfs[sample] = vcf

    out_vcf.close()

    return confirmed_vcfs
Example #15
def combine_all_dbs_for_chrom(args,
                              batch,
                              output_filename_prefix,
                              chrom_to_combined_db_paths,
                              chrom_to_combine_db_jobs,
                              temp_dir="./temp"):
    for chrom, combined_db_paths in chrom_to_combined_db_paths.items():
        output_filename = f"all_variants_{output_filename_prefix}.chr{chrom}.db"
        combine_db_jobs = chrom_to_combine_db_jobs[chrom]

        if not args.force and hl.hadoop_is_file(
                f"{args.output_dir}/{output_filename}"):
            logger.info(f"{output_filename} already exists. Skipping...")
            continue

        cpu = 2
        j3 = batch_utils.init_job(
            batch,
            f"combine all dbs (cpu: {cpu}): {output_filename}",
            DOCKER_IMAGE if not args.raw else None,
            cpu=cpu,
            disk_size=cpu * 21)
        batch_utils.switch_gcloud_auth_to_user_account(
            j3, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)

        # don't use batch_utils.localize here because the command becomes too large
        j3.command("gsutil -m cp " + " ".join(combined_db_paths) + " .")
        local_input_db_paths = [os.path.basename(p) for p in combined_db_paths]

        add_command_to_combine_dbs(j3,
                                   output_filename,
                                   local_input_db_paths,
                                   select_chrom=None,
                                   set_combined_bamout_id=None,
                                   create_index=True,
                                   temp_dir=temp_dir)
        j3.command(f"gsutil -m cp {output_filename} {args.output_dir}/")

        for j2 in combine_db_jobs:
            j3.depends_on(j2)
Example #16
def get_mt(overwrite=False):
    mt_path = 'gs://ukbb-temp-30day/nbaya/ukb31063.hm3_variants.gwas_samples.mt'
    if hl.hadoop_is_file(f'{mt_path}/_SUCCESS'):
        mt = hl.read_matrix_table(mt_path)
    else:
        tmp_mt_path = mt_path.replace('nbaya/', 'nbaya/tmp-')

        def _write_tmp_mt():
            ## part 1: about 1 hr with 30 workers (possibly starting with 10 then increasing to 30 if progress stalls)
            variants = hl.import_table(
                'gs://nbaya/hapmap3_variants.tsv.gz', force=True
            )  # download here: https://github.com/nikbaya/split/blob/master/hapmap3_variants.tsv.gz
            variants = variants.key_by(**hl.parse_variant(variants.v))
            mt = get_ukb_imputed_data(
                'all', variant_list=variants,
                entry_fields=('dosage', ))  # 'all' = autosomes only
            # print(mt.count()) # (1089172, 487409)
            # mt = mt.checkpoint(mt_path.replace('nbaya/','nbaya/tmp-'), overwrite=overwrite)
            mt.checkpoint(tmp_mt_path, _read_if_exists=True)  # checkpoint() accepts _read_if_exists; write() does not

        def _repartition():
            ## part 2: 5 min with 100 preemptibles
            mt = hl.read_matrix_table(tmp_mt_path)
            mt = mt.repartition(1000)
            withdrawn = hl.read_table(
                'gs://ukb31063/ukb31063.withdrawn_samples.ht')
            mt = mt.anti_join_cols(withdrawn)
            # print(mt.count()) # (1089172, 487409)
            covs = hl.read_table(
                'gs://ukb31063/ukb31063.neale_gwas_covariates.both_sexes.ht')
            mt = mt.annotate_cols(**covs[mt.s])
            mt = mt.filter_cols(hl.is_defined(mt.PC1))
            # print(mt.count())  # (1089172, 361144)
            return mt.checkpoint(mt_path, overwrite=overwrite)

        _write_tmp_mt()
        mt = _repartition()

    return mt
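A pattern that recurs in several of these examples (for instance #16 above and #18 below) is checking for the _SUCCESS marker that Hail writes next to a Table or MatrixTable once a write completes, and only recomputing when that marker is absent. Here is a minimal sketch of that guard, with a placeholder path and a hypothetical make_mt() builder:

import hail as hl

def read_or_write_mt(mt_path, make_mt, overwrite=False):
    # mt_path is a placeholder; make_mt is any function that builds the MatrixTable.
    if hl.hadoop_is_file(f'{mt_path}/_SUCCESS') and not overwrite:
        # A previous run finished writing this MatrixTable, so just read it back.
        return hl.read_matrix_table(mt_path)
    mt = make_mt()
    mt.write(mt_path, overwrite=overwrite)
    return hl.read_matrix_table(mt_path)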
Example #17
    def handle(self, *args, **options):
        samples = (IgvSample.objects.filter(
            individual__family__project__name__in=args
        ) if args else IgvSample.objects.all()).filter(
            file_path__startswith='gs://'
        ).prefetch_related('individual', 'individual__family__project')

        missing_counter = collections.defaultdict(int)
        guids_of_samples_with_missing_file = set()
        project_name_to_missing_paths = collections.defaultdict(list)
        for sample in tqdm.tqdm(samples, unit=" samples"):
            if not hl.hadoop_is_file(sample.file_path):
                individual_id = sample.individual.individual_id
                project_name = sample.individual.family.project.name
                missing_counter[project_name] += 1
                project_name_to_missing_paths[project_name].append((individual_id, sample.file_path))
                logger.info('Individual: {}  file not found: {}'.format(individual_id, sample.file_path))
                if not options.get('dry_run'):
                    guids_of_samples_with_missing_file.add(sample.guid)

        if len(guids_of_samples_with_missing_file) > 0:
            IgvSample.bulk_update(user=None, update_json={'file_path': ''}, guid__in=guids_of_samples_with_missing_file)

        logger.info('---- DONE ----')
        logger.info('Checked {} samples'.format(len(samples)))
        if missing_counter:
            logger.info('{} files not found:'.format(sum(missing_counter.values())))
            for project_name, c in sorted(missing_counter.items(), key=lambda t: -t[1]):
                logger.info('   {} in {}'.format(c, project_name))

            # post to slack
            if not options.get('dry_run'):
                slack_message = 'Found {} broken bam/cram path(s)\n'.format(sum(missing_counter.values()))
                for project_name, missing_paths_list in project_name_to_missing_paths.items():
                    slack_message += "\nIn project {}:\n".format(project_name)
                    slack_message += "\n".join([
                        "  {}   {}".format(individual_id, path) for individual_id, path in missing_paths_list
                    ])
                communication_utils.safe_post_to_slack(SEQR_SLACK_DATA_ALERTS_NOTIFICATION_CHANNEL, slack_message)
Example #18
def get_ref_X(ref_panel, overwrite=False):
    r'''
    Returns N_ref x M dim matrix of column-standardized genotypes of LD ref panel
    '''
    X_bm_path = f'{bucket}/{ref_panel}.X.bm'

    if overwrite or not hl.hadoop_is_file(f'{X_bm_path}/_SUCCESS'):
        mt = hl.import_plink(bed=f'{bucket}/{ref_panel}.bed',
                             bim=f'{bucket}/{ref_panel}.bim',
                             fam=f'{bucket}/{ref_panel}.fam')

        mt = mt.annotate_rows(stats=hl.agg.stats(mt.GT.n_alt_alleles()))
        mt = mt.annotate_entries(X=(mt.GT.n_alt_alleles() - mt.stats.mean) /
                                 mt.stats.stdev)

        X = BlockMatrix.from_entry_expr(mt.X)
        X = X.T

        X.write(X_bm_path, overwrite=True)

    X = BlockMatrix.read(X_bm_path)

    return X
Example #19
    param_suffix = f'{gt_sim_suffix}.h2_{h2}.pi_{pi}.K_{K}.seed_{seed}'
    betas_path = f'{smiles_wd}/betas.{param_suffix}.tsv.gz'
    phens_path = f'{smiles_wd}/phens.{param_suffix}.tsv.gz'

    if sim_name[:3] == 'bn_':
        mt = hl.balding_nichols_model(n_populations=n_pops,
                                      n_samples=n_sim,
                                      n_variants=n_vars,
                                      fst=fst)

        mt = mt.filter_rows(
            (hl.abs(hl.agg.mean(mt.GT.n_alt_alleles()) / 2 - 0.5) <
             0.5))  # remove invariant SNPs
        mt = mt.annotate_cols(s=hl.str(mt.sample_idx))

        if hl.hadoop_is_file(betas_path) and hl.hadoop_is_file(phens_path):
            #            betas = hl.import_table(betas_path, impute=True, force=True)
            #            betas = betas.annotate(locus = hl.parse_locus(betas.locus),
            #                                   alleles = betas.alleles.replace('\[\"','').replace('\"\]','').split('\",\"'))
            #            betas = betas.key_by('locus','alleles')

            phens = hl.import_table(phens_path,
                                    key=['s'],
                                    types={'s': hl.tstr},
                                    impute=True,
                                    force=True)

            sim_mt = mt.annotate_cols(y_binarized=phens[mt.s].y_binarized)

        else:
            sim_mt = get_sim_mt(mt=mt, h2=h2, pi=pi, K=K)
Example #20
def run_method(p, pop, not_pop, max_pops, pheno_key_dict, pheno_id,
               hail_script, output_txt, output_ht, ss_dict, method):
    r'''
    Runs either PLINK clump (method = 'clump') or SBayesR (method = 'sbayesr')
    '''
    assert method in {'clump', 'sbayesr'}

    task_suffix = (f'{"not_" if not_pop else ""}{pop}'
                   if not max_pops else 'max_pops') + f'-{pheno_id}'
    # TODO: if method = 'sbayesr' check if LD matrix has already been calculated

    tasks = []

    ref_subset = '-'.join(pheno_key_dict['pops'] if max_pops else (
        [p2 for p2 in POPS if p2 != pop] if not_pop else [pop]))
    print(f'Using LD reference panel of {ref_subset}')

    ## run plink clumping
    for chrom, ss_chrom in ss_dict.items():
        ## read ref ld plink files
        bfile = read_plink_input_group_chrom(p=p,
                                             method=method,
                                             subset=ref_subset,
                                             chrom=chrom)

        get_betas = p.new_job(name=f'{method}_{task_suffix}_chr{chrom}')

        # TODO: change image to include GCTB if running SBayesR?
        get_betas.cpu(1)  # plink clump cannot multithread

        get_betas.command('set -ex')

        if method == 'clump':
            get_betas.storage('5G')  # default: 5G
            #            clump_memory = -15*(chrom-1)+400 # Memory requested for PLINK clumping in MB. equation: -15*(chrom-1) + 500 is based on 400 MB for chr 1, 80 MB for chr 22
            clump_memory = 3.75  # in GB
            get_betas.memory(clump_memory)  # default: 30G
            get_betas.command(f'head {ss_chrom}')
            get_betas.command(' '.join([
                'plink',
                '--bfile',
                str(bfile),
                '--memory',
                str(clump_memory * 1000),  # memory in MB
                '--threads',
                '1',  # explicitly set threads to 1
                '--clump',
                ss_chrom,
                '--clump-field P',
                '--clump-snp-field SNP',
                '--clump-p1 1',
                '--clump-p2 1',
                '--clump-r2 0.1',
                '--clump-kb 500',
                '--output-chr M',  # necessary to code chr X as 'X' instead of '23', which isn't allowed as a contig in Hail's GRCh37 locus
                '--chr',
                str(chrom),
                '--out',
                f'{get_betas.ofile}_tmp'
            ]))
            get_betas.command(' '.join([
                'awk',
                "'{ print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12 }'",
                "OFS='\t'",
                f'{get_betas.ofile}_tmp.clumped',
                '2>',
                '/dev/null',
                '|',
                'tail -n+2',
                '|',  # don't include header
                "sed '/^[[:space:]]*$/d'",  # remove 2 empty lines created by PLINK at the end of the output file 
                '>',
                str(get_betas.ofile)
            ]))
        elif method == 'sbayesr':
            ldm_type = 'full'  # options: full, sparse
            ldm_path = f'{ldprune_dir}/subsets_50k/not_{pop}/ldm/not_{pop}.hm3.chr{chrom}.maf_gt_0.ldm.{ldm_type}'
            #            ldm_path = f'{ldprune_dir}/subsets_50k/not_{pop}/ldm/not_{pop}.hm3.chr{chrom}.maf_gt_0.chisq_5.ldm.sparse'

            if hl.hadoop_is_file(f'{ldm_path}.info') and hl.hadoop_is_file(
                    f'{ldm_path}.bin'):
                ldm = p.read_input_group(info=f'{ldm_path}.info',
                                         bin=f'{ldm_path}.bin')
            else:
                make_ldm = p.new_job(
                    name=f'make_{ldm_type}_ldm_{task_suffix}.chr{chrom}')
                make_ldm.memory('60G')
                make_ldm.command(' '.join([
                    'wget',
                    'https://cnsgenomics.com/software/gctb/download/gctb_2.0_Linux.zip',
                    '-P', '~/'
                ]))
                make_ldm.command(' '.join(
                    ['unzip', '~/gctb_2.0_Linux.zip', '-d', '~/']))
                make_ldm.command(' '.join(['ls', '-ltrR', '~/']))
                make_ldm.command(' '.join(
                    ['mv', '~/gctb_2.0_Linux/gctb', '/usr/local/bin/']))
                make_ldm.command(' '.join([
                    'plink', '--bfile',
                    str(bfile), '--maf 0.0000000001', '--make-bed', '--out',
                    f'{make_ldm.ofile}_tmp1'
                ]))
                make_ldm.command(' '.join([
                    'gctb',
                    '--bfile',
                    f'{make_ldm.ofile}_tmp1',
                    #                                            '--snp 1-1000',
                    f'--make-{ldm_type}-ldm',
                    '--out',
                    f'{make_ldm.ofile}_tmp2'
                ]))
                # TODO: use both .bin and .info files
                make_ldm.command(' '.join([
                    'mv', f'{make_ldm.ofile}_tmp2.ldm.{ldm_type}',
                    str(make_ldm.ofile)
                ]))
                p.write_output(make_ldm.ofile, ldm_path)
                ldm = make_ldm.ofile

            get_betas.declare_resource_group(
                out={
                    'log': '{root}.log',
                    'snpRes': '{root}.snpRes',
                    'parRes': '{root}.parRes',
                    'mcmcsamples.SnpEffects': '{root}.mcmcsamples.SnpEffects',
                    'mcmcsamples.Par': '{root}.mcmcsamples.Par'
                })
            get_betas.command(' '.join([
                'wget',
                'https://cnsgenomics.com/software/gctb/download/gctb_2.0_Linux.zip',
                '-P', '~/'
            ]))
            get_betas.memory('18G')
            get_betas.command(' '.join(
                ['unzip', '~/gctb_2.0_Linux.zip', '-d', '~/']))
            get_betas.command(' '.join(['ls', '-ltrR', '~/']))
            get_betas.command(' '.join(
                ['mv', '~/gctb_2.0_Linux/gctb', '/usr/local/bin/']))
            get_betas.command(' '.join([
                'gctb', '--sbayes R', '--ldm',
                str(ldm), '--pi 0.95,0.02,0.02,0.01', '--gamma 0.0,0.01,0.1,1',
                '--gwas-summary', f' <( gunzip -c {ss_chrom} | grep -v "NA" )',
                '--chain-length 10000', '--burn-in 2000', '--out-freq 10',
                '--out', f'{get_betas.out}'
            ]))
            get_betas.command(' '.join(['head', f'{get_betas.out}.snpRes']))
            get_betas.command(' '.join(
                ['mv', f'{get_betas.out}.snpRes',
                 str(get_betas.ofile)]))

        tasks.append(get_betas)

    get_betas_sink = p.new_job(name=f'{method}_sink_{task_suffix}')
    get_betas_sink.command(
        f'cat {" ".join([t.ofile for t in tasks])} > {get_betas_sink.ofile}'
    )  # this task implicitly depends on the chromosome scatter tasks
    p.write_output(get_betas_sink.ofile, output_txt)

    ## import as hail table and save
    n_threads = 8
    tsv_to_ht = p.new_job(name=f'{method}_to_ht_{task_suffix}')
    tsv_to_ht = tsv_to_ht.image(
        'gcr.io/ukbb-diversepops-neale/nbaya_hail:latest')
    tsv_to_ht.storage('1G')
    tsv_to_ht.memory('100M')
    tsv_to_ht.cpu(n_threads)
    tsv_to_ht.depends_on(get_betas_sink)
    tsv_to_ht.command('set -ex')
    tsv_to_ht.command(' '.join([
        'PYTHONPATH=$PYTHONPATH:/',
        'PYSPARK_SUBMIT_ARGS="--conf spark.driver.memory=4g --conf spark.executor.memory=24g pyspark-shell"'
    ]))
    tsv_to_ht.command(' '.join([
        'python3',
        str(hail_script),
        '--input_file',
        f'"{output_txt}"',  # output_txt must be doubly enclosed by quotes needed for files with "|" in their pheno_id
        '--tsv_to_ht',
        '--trait_type',
        f'''"{pheno_key_dict['trait_type']}"''',
        '--phenocode',
        f'''"{pheno_key_dict['phenocode']}"''',
        '--pheno_sex',
        f'''"{pheno_key_dict['pheno_sex']}"''',
        '--output_file',
        f'"{output_ht}"',  # output_ht must be doubly enclosed by quotes needed for files with "|" in their pheno_id
        '--overwrite'
    ] + (['--coding', f'''"{pheno_key_dict['coding']}"''']
         if pheno_key_dict['coding'] != '' else []) + (
             ['--modifier', f'''"{pheno_key_dict['modifier']}"''']
             if pheno_key_dict['modifier'] != '' else [])))
Example #21
def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()

    p = argparse.ArgumentParser()
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("--local", action="store_true", help="Batch: run locally")
    grp.add_argument("--cluster",
                     action="store_true",
                     help="Batch: submit to cluster")
    p.add_argument(
        "--batch-billing-project",
        default="tgg-rare-disease",
        help="Batch: billing project. Required if submitting to cluster.")
    p.add_argument("--batch-job-name", help="Batch: (optional) job name")

    p.add_argument(
        "-f",
        "--force",
        action="store_true",
        help="Recompute and overwrite cached or previously computed data")
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument("-b",
                     "--rnaseq-batch-name",
                     nargs="*",
                     help="RNA-seq batch names to process",
                     choices=set(
                         rnaseq_sample_metadata_df['star_pipeline_batch']))
    grp.add_argument("-s",
                     "--rnaseq-sample-id",
                     nargs="*",
                     help="RNA-seq sample IDs to process",
                     choices=set(rnaseq_sample_metadata_df['sample_id']))
    args = p.parse_args()

    #logger.info("\n".join(df.columns))

    if args.rnaseq_batch_name:
        batch_names = args.rnaseq_batch_name
        sample_ids = rnaseq_sample_metadata_df[rnaseq_sample_metadata_df[
            'star_pipeline_batch'].isin(batch_names)].sample_id
    elif args.rnaseq_sample_id:
        sample_ids = args.rnaseq_sample_id

    logger.info(
        f"Processing {len(sample_ids)} sample ids: {', '.join(sample_ids)}")

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    if args.local:
        backend = hb.LocalBackend(gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    else:
        backend = hb.ServiceBackend(args.batch_billing_project)

    b = hb.Batch(backend=backend, name=args.batch_job_name)

    # define workflow inputs
    if args.local:
        genes_gtf = b.read_input("gencode.v26.annotation.gff3",
                                 extension=".gff3")
    else:
        genes_gtf = b.read_input(
            "gs://macarthurlab-rnaseq/ref/gencode.v26.annotation.GRCh38.gff3",
            extension=".gff3")

    # define parallel execution for samples
    for sample_id in sample_ids:
        metadata_row = rnaseq_sample_metadata_df.loc[sample_id]
        batch_name = metadata_row['star_pipeline_batch']

        # set job inputs & outputs
        input_read_data = b.read_input_group(
            bam=metadata_row['star_bam'],
            bai=metadata_row['star_bai'],
        )

        output_dir = f"gs://macarthurlab-rnaseq/{batch_name}/majiq_build/"
        output_file_path = os.path.join(output_dir,
                                        f"majiq_build_{sample_id}.tar.gz")

        # check if output file already exists
        if hl.hadoop_is_file(output_file_path) and not args.force:
            logger.info(
                f"{sample_id} output file already exists: {output_file_path}. Skipping..."
            )
            continue

        file_stats = hl.hadoop_stat(metadata_row['star_bam'])
        bam_size = int(round(file_stats['size_bytes'] / 10.**9))

        # define majiq build commands for this sample
        j = b.new_job(name=args.batch_job_name)
        j.image("weisburd/majiq:latest")
        j.storage(f'{bam_size*3}Gi')
        j.cpu(1)  # default: 1
        j.memory("15G")  # default: 3.75G
        logger.info(
            f'Requesting: {j._storage or "default"} storage, {j._cpu or "default"} CPU, {j._memory or "default"} memory'
        )

        # switch to user account
        j.command(
            f"gcloud auth activate-service-account --key-file /gsa-key/key.json"
        )
        j.command(
            f"gsutil -m cp -r {GCLOUD_CREDENTIALS_LOCATION}/.config /tmp/")
        j.command(f"rm -rf ~/.config")
        j.command(f"mv /tmp/.config ~/")
        j.command(f"gcloud config set account {GCLOUD_USER_ACCOUNT}")
        j.command(f"gcloud config set project {GCLOUD_PROJECT}")

        # run majiq build
        #j.command(f"gsutil -u {GCLOUD_PROJECT} -m cp gs://gtex-resources/GENCODE/gencode.v26.GRCh38.ERCC.genes.collapsed_only.gtf .")
        j.command(f"mv {genes_gtf} gencode.gff3")
        j.command(f"mv {input_read_data.bam} {sample_id}.bam")
        j.command(f"mv {input_read_data.bai} {sample_id}.bam.bai")

        j.command(f"echo '[info]' >> majiq_build.cfg")
        j.command(
            f"echo 'readlen={metadata_row['read length (rnaseqc)']}' >> majiq_build.cfg"
        )
        j.command(f"echo 'bamdirs=.' >> majiq_build.cfg")
        j.command(f"echo 'genome=hg38' >> majiq_build.cfg")
        j.command(
            f"echo 'strandness={'None' if metadata_row['stranded? (rnaseqc)'] == 'no' else 'reverse'}' >> majiq_build.cfg"
        )
        j.command(f"echo '[experiments]' >> majiq_build.cfg")
        j.command(f"echo '{sample_id}={sample_id}' >> majiq_build.cfg")

        j.command(f"cat majiq_build.cfg >> {j.logfile}")
        j.command(
            f"majiq build gencode.gff3 -c majiq_build.cfg -j 1 -o majiq_build_{sample_id} >> {j.logfile}"
        )

        j.command(
            f"tar czf majiq_build_{sample_id}.tar.gz majiq_build_{sample_id}")
        j.command(f"cp majiq_build_{sample_id}.tar.gz {j.output_tar_gz}")

        #j.command(f"ls -lh . >> {j.logfile}")
        #j.command(f"echo ls majiq_build_{sample_id} >> {j.logfile}")
        #j.command(f"ls -1 majiq_build_{sample_id} >> {j.logfile}")
        j.command(f"echo --- done  {output_file_path} >> {j.logfile}")

        # copy output
        b.write_output(j.output_tar_gz, output_file_path)
        b.write_output(
            j.logfile, os.path.join(output_dir,
                                    f"majiq_build_{sample_id}.log"))

    b.run()

    if isinstance(backend, hb.ServiceBackend):
        backend.close()
Example #22
def get_sumstats(p,
                 pop: str,
                 not_pop: bool,
                 max_pops: bool,
                 pops: list,
                 high_quality: bool,
                 pheno_id: str,
                 method: str,
                 chromosomes: list = all_chromosomes):
    r'''
    Returns a dict of per-chromosome summary statistics output files.
    '''
    assert not (not_pop
                and max_pops), '`not_pop` and `max_pops` cannot both be True'
    assert method in {'clump', 'sbayesr'}
    if max_pops and len(pops) == 1 and pop is None:
        pop = pops[
            0]  # need to set this variable in order to find column indices later
    num_pops = len(pops)
    filename = f'{pheno_id}.tsv.bgz'
    trait_type = pheno_id.split('-')[0]
    trait_category = 'quant' if trait_type in ['continuous', 'biomarkers'
                                               ] else 'binary'

    variant_manifest = p.read_input(
        f'{ldprune_dir}/variant_qc/full_variant_qc_metrics.txt.bgz')
    variant_manifest_tabix = p.read_input(
        f'{ldprune_dir}/variant_qc_tabix/full_variant_qc_metrics.txt.bgz.tbi')

    loo_6pop_dir = f'{ldprune_dir}/loo/sumstats/batch2'
    loo_6pop_ss_fname = f'{loo_6pop_dir}/{filename}'
    loo_6pop_tabix_fname = f'{loo_6pop_dir}_tabix/{filename}.tbi'

    ss_dir = f'{bucket}/sumstats_flat_files'
    ss_fname = f'{ss_dir}/{filename}'
    tabix_fname = f'{ss_dir}_tabix/{filename}.tbi'

    get_ss = p.new_job(name=f'get_ss_{pheno_id}')
    get_ss = get_ss.image('gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest')
    get_ss.storage('100M')  # default: 1G
    get_ss.cpu(1)
    bgz_fname = f'{get_ss.ofile}.bgz'
    tbi_fname = f'{get_ss.ofile}.bgz.tbi'
    get_ss.command('set -ex')
    variant_manifest_bgz = f'{get_ss.ofile}.variants.bgz'
    variant_manifest_tbi = f'{get_ss.ofile}.variants.bgz.tbi'
    get_ss.command(' '.join(['mv', variant_manifest, variant_manifest_bgz]))
    get_ss.command(' '.join(
        ['mv', variant_manifest_tabix, variant_manifest_tbi]))

    if not_pop and hl.hadoop_is_file(loo_6pop_ss_fname) and hl.hadoop_is_file(
            loo_6pop_tabix_fname
    ):  # phenotype is 6-pop and has leave-one-out sumstats generated.
        #        assert False, "don't run 6-pop LOO"
        print(
            f'Using 6-pop LOO sumstats for {pheno_id} ({"not_" if not_pop else ""}{pop})'
        )
        ss = p.read_input(loo_6pop_ss_fname)
        tabix = p.read_input(loo_6pop_tabix_fname)

        get_ss.command(' '.join([
            'mv', ss, bgz_fname
        ]))  # necessary instead of changing path extension for input files
        get_ss.command(' '.join([
            'mv', tabix, tbi_fname
        ]))  # necessary instead of changing path extension for input files
        get_ss.command('\n'.join(f'''
                tabix -h {bgz_fname} {chrom} | \\
                cut -f5,{6+POPS.index(pop)} | \\
                sed 's/pval_not_{pop}/P/g' | \\
                awk '$2!="NA" {{print}}' > {get_ss[f'ofile_{chrom}']}
                ''' for chrom in chromosomes))
    elif hl.hadoop_is_file(ss_fname) and hl.hadoop_is_file(
            tabix_fname
    ):  # this conditional block must come after checking for 6-pop LOO results
        print(f'Using {num_pops}-pop sumstats for {pheno_id} ' +
              (f'({"not_" if not_pop else ""}{pop})'
               if not max_pops else '(max_pops=True)'))
        ss = p.read_input(ss_fname)
        tabix = p.read_input(tabix_fname)

        get_ss.command(' '.join([
            'mv', ss, bgz_fname
        ]))  # necessary instead of changing path extension for input files
        get_ss.command(' '.join([
            'mv', tabix, tbi_fname
        ]))  # necessary instead of changing path extension for input files

        if not_pop or (max_pops and
                       len(pops) > 1):  # if clumping on meta-analyzed sumstats
            pval_col_idx = 8 if trait_category == 'quant' else 9  # due to additional AF columns in binary traits, pvalue column location may change
            awk_arg1 = ''
            awk_arg2 = '$2!="NA"' + (
                '&& $3!="false"' if high_quality else ''
            )  # exclude pval(col 2)=NA; if high_quality: exclude high_quality(col 3)=false
            sed_arg = "-e 's/pval_meta/P/g'"
        else:  # if clumping single population results
            pval_col_idx = (4 + (
                (4 +
                 (trait_category == 'binary') + 1) if num_pops > 1 else 0) +
                            ((trait_category == 'binary') + 3) * num_pops +
                            pops.index(pop) + 1)

            low_confidence_col_idx = (
                4 +  # first 4 cols
                ((4 + (trait_category == 'binary') + 1) if num_pops > 1 else 0)
                +  # meta-analysis fields
                ((trait_category == 'binary') + 4) * num_pops
                +  # per-pop fields
                pops.index(pop) + 1)
            awk_arg1 = f', $3=${low_confidence_col_idx}'
            awk_arg2 = '$2!="NA" && $3!="true"' + (
                ' && $4!="false"' if high_quality else ''
            )  # exclude pval(col 2)=NA, low_confidence(col 3)=True; if high_quality: exclude high_quality(col 4)=False
            sed_arg = f"-e 's/pval_{pop}/P/g'"  # sed argument for replacing column name

        # TODO: If possible, consolidate the following blocks
        if high_quality:
            get_ss.command('\n'.join(f'''
                    paste <( tabix -h {bgz_fname} {chrom} | \\
                            awk '{{print $1=$1":"$2":"$3":"$4, $2=${pval_col_idx}{awk_arg1}}}' | \\
                            sed -e 's/chr:pos:ref:alt/SNP/g' {sed_arg} ) \\
                          <( tabix -h {variant_manifest_bgz} {chrom} | \\
                            awk '{{ print $9 }}' ) | \\
                    awk '{{if({awk_arg2}) print $1,$2}}' > {get_ss[f"ofile_{chrom}"]}
                    ''' for chrom in chromosomes))
        else:
            get_ss.command('\n'.join(f'''
                    tabix -h {bgz_fname} {chrom} | \\
                    awk '{{print $1=$1":"$2":"$3":"$4, $2=${pval_col_idx}{awk_arg1}}}' | \\
                    sed -e 's/chr:pos:ref:alt/SNP/g' {sed_arg} | \\
                    awk '{{if({awk_arg2}) print $1,$2}}' > {get_ss[f"ofile_{chrom}"]}
                    ''' for chrom in chromosomes))

    ss_dict = {chrom: get_ss[f'ofile_{chrom}'] for chrom in chromosomes}

    return ss_dict
Example #23
def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()
    #gtex_rnaseq_sample_metadata_df = get_gtex_rnaseq_sample_metadata_df()

    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument(
        "-b",
        "--rnaseq-batch-name",
        nargs="*",
        help="RNA-seq batch names to process (eg. -b batch1 batch2)",
        choices=set(rnaseq_sample_metadata_df['star_pipeline_batch'])
        | set(["gtex_muscle", "gtex_fibroblasts", "gtex_blood"]))
    grp.add_argument(
        "-s",
        "--rnaseq-sample-id",
        nargs="*",
        help="RNA-seq sample IDs to process (eg. -s sample1 sample2)",
        choices=set(rnaseq_sample_metadata_df['sample_id']) | set([
            'GTEX-1LG7Z-0005-SM-DKPQ6', 'GTEX-PX3G-0006-SM-5SI7E',
            'GTEX-1KXAM-0005-SM-DIPEC'
        ]))
    args = p.parse_args()

    # Generate samples_df with these columns: sample_id, star_SJ_out_tab, output_dir, batch_name
    samples_df = pd.DataFrame()
    if args.rnaseq_batch_name:
        for batch_name in args.rnaseq_batch_name:
            df = rnaseq_sample_metadata_df[
                rnaseq_sample_metadata_df['star_pipeline_batch'] == batch_name]
            samples_df = transfer_metadata_columns_from_df(samples_df, df)

    elif args.rnaseq_sample_id:
        df = rnaseq_sample_metadata_df[
            rnaseq_sample_metadata_df.sample_id.isin(set(
                args.rnaseq_sample_id))]
        samples_df = transfer_metadata_columns_from_df(samples_df, df)
    else:
        p.error("Must specify -b or -s")

    logger.info(
        f"Processing {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}"
    )

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    with batch_utils.run_batch(args) as batch:
        for sample_id in samples_df.sample_id:
            metadata_row = samples_df.loc[sample_id]

            # set job inputs & outputs
            output_dir = metadata_row['output_dir']

            print("Input file: ", metadata_row['star_SJ_out_tab'])
            output_filename = f"{sample_id}.junctions.bed.gz"
            output_bed_gz_file_path = os.path.join(output_dir, output_filename)

            # check if output file already exists
            if hl.hadoop_is_file(output_bed_gz_file_path) and not args.force:
                logger.info(
                    f"{sample_id} output file already exists: {output_bed_gz_file_path}. Skipping..."
                )
                continue

            j = batch_utils.init_job(batch,
                                     name=f"tab=>bed: {sample_id}",
                                     cpu=args.cpu,
                                     memory=args.memory,
                                     disk_size=5,
                                     image=DOCKER_IMAGE)
            batch_utils.switch_gcloud_auth_to_user_account(
                j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT,
                GCLOUD_PROJECT)

            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp {metadata_row['star_SJ_out_tab']} ."
            )
            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp gs://macarthurlab-rnaseq/ref/gencode.v26.annotation.gff3.gz ."
            )
            j.command(f"pwd && ls && date")

            j.command(
                f"python3 /convert_SJ_out_tab_to_junctions_bed.py -g gencode.v26.annotation.gff3.gz {os.path.basename(metadata_row['star_SJ_out_tab'])}"
            )
            j.command(f"cp {output_filename} {j.output_bed_gz}")
            j.command(f"cp {output_filename}.tbi {j.output_bed_gz_tbi}")
            j.command(f"echo Done: {output_bed_gz_file_path}")
            j.command(f"date")

            # copy output
            batch.write_output(j.output_bed_gz, output_bed_gz_file_path)
            batch.write_output(j.output_bed_gz_tbi,
                               f"{output_bed_gz_file_path}.tbi")

            print("Output file path: ", output_bed_gz_file_path)
Example #24
def main():
    p, args = parse_args()

    df = pd.read_table(args.cram_and_tsv_paths_table)
    if {"sample_id", "cram_path", "crai_path", "variants_tsv_bgz"} - set(
            df.columns):
        p.error(
            f"{args.cram_and_tsv_paths_table} must contain 'sample_id', 'cram_path', "
            f"'crai_path', and 'variants_tsv_bgz' columns")

    # check that all buckets are in "US-CENTRAL1" or are multi-regional to avoid egress charges to the Batch cluster
    batch_utils.set_gcloud_project(GCLOUD_PROJECT)
    if args.cluster:
        batch_utils.check_storage_bucket_region(df.cram_path)

    if not args.force:
        hl.init(log="/dev/null", quiet=True)

    # process samples
    with batch_utils.run_batch(args,
                               batch_name=f"HaplotypeCaller -bamout") as batch:
        counter = 0
        for _, row in tqdm.tqdm(df.iterrows(), unit=" rows", total=len(df)):
            if args.sample_to_process and row.sample_id not in set(
                    args.sample_to_process):
                continue

            input_filename = os.path.basename(row.cram_path)
            output_prefix = input_filename.replace(".bam",
                                                   "").replace(".cram", "")

            output_bam_path = os.path.join(args.output_dir,
                                           f"{output_prefix}.bamout.bam")
            output_bai_path = os.path.join(args.output_dir,
                                           f"{output_prefix}.bamout.bai")

            if not args.force and hl.hadoop_is_file(
                    output_bam_path) and hl.hadoop_is_file(output_bai_path):
                logger.info(
                    f"Output files exist (eg. {output_bam_path}). Skipping {input_filename}..."
                )
                continue

            counter += 1
            if args.num_samples_to_process and counter > args.num_samples_to_process:
                break

            j = batch_utils.init_job(batch, f"readviz: {row.sample_id}",
                                     DOCKER_IMAGE if not args.raw else None,
                                     args.cpu, args.memory)
            batch_utils.switch_gcloud_auth_to_user_account(
                j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT)

            local_exclude_intervals = batch_utils.localize_file(
                j, EXCLUDE_INTERVALS)
            local_fasta = batch_utils.localize_file(
                j, batch_utils.HG38_REF_PATHS.fasta, use_gcsfuse=True)
            local_fasta_fai = batch_utils.localize_file(
                j, batch_utils.HG38_REF_PATHS.fai, use_gcsfuse=True)
            batch_utils.localize_file(j,
                                      batch_utils.HG38_REF_PATHS.dict,
                                      use_gcsfuse=True)
            local_tsv_bgz = batch_utils.localize_file(j, row.variants_tsv_bgz)
            local_cram_path = batch_utils.localize_file(j, row.cram_path)
            local_crai_path = batch_utils.localize_file(j, row.crai_path)

            j.command(f"""echo --------------

echo "Start - time: $(date)"
df -kh


# 1) Convert variants_tsv_bgz to sorted interval list

gunzip -c "{local_tsv_bgz}" | awk '{{ OFS="\t" }} {{ print( "chr"$1, $2, $2 ) }}' | bedtools slop -b {PADDING_AROUND_VARIANT} -g {local_fasta_fai} > variant_windows.bed

# Sort the .bed file so that chromosomes are in the same order as in the input_cram file.
# Without this, if the input_cram has a different chromosome ordering (eg. chr1, chr10, .. vs. chr1, chr2, ..)
# than the interval list passed to GATK tools' -L arg, then GATK may silently skip some of the regions in the -L intervals.
# The sort is done by first retrieving the input_cram header and passing it to GATK BedToIntervalList.

java -Xms2g -jar /gatk/gatk.jar PrintReadsHeader \
	--gcs-project-for-requester-pays {GCLOUD_PROJECT} \
	-R {local_fasta} \
	-I "{local_cram_path}" \
	-O header.bam

java -Xms2g -jar /gatk/gatk.jar BedToIntervalList \
	--SORT true \
	--SEQUENCE_DICTIONARY header.bam \
	--INPUT variant_windows.bed \
	--OUTPUT variant_windows.interval_list

# 2) Get reads from the input_cram for the intervals in variant_windows.interval_list

time java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+DisableAttachMechanism -XX:MaxHeapSize=2000m -Xmx30000m \
	-jar /gatk/GATK35.jar \
	-T HaplotypeCaller \
	-R {local_fasta} \
	-I "{local_cram_path}" \
	-L variant_windows.interval_list \
	-XL {local_exclude_intervals} \
	--disable_auto_index_creation_and_locking_when_reading_rods \
	-ERC GVCF \
	--max_alternate_alleles 3 \
	-variant_index_parameter 128000 \
	-variant_index_type LINEAR \
	--read_filter OverclippedRead \
	-bamout "{output_prefix}.bamout.bam" \
	-o "{output_prefix}.gvcf"  |& grep -v "^DEBUG"

bgzip "{output_prefix}.gvcf"
tabix "{output_prefix}.gvcf.gz"

gsutil -m cp "{output_prefix}.bamout.bam" {args.output_dir}
gsutil -m cp "{output_prefix}.bamout.bai" {args.output_dir}
gsutil -m cp "{output_prefix}.gvcf.gz" {args.output_dir}
gsutil -m cp "{output_prefix}.gvcf.gz.tbi" {args.output_dir}

ls -lh
echo --------------; free -h; df -kh; uptime; set +xe; echo "Done - time: $(date)"; echo --------------

""")
Exemple #25
    def test_hadoop_is_file(self):
        self.assertTrue(hl.hadoop_is_file(resource('ls_test/f_50')))
        self.assertFalse(hl.hadoop_is_file(resource('ls_test/subdir')))
        self.assertFalse(hl.hadoop_is_file(resource('ls_test/invalid-path')))
Exemple #26
    os.system(command)


#grouped_gcnv_cluster_to_sample_bed_paths

#%%

for cluster_name, paths in sorted(
        grouped_gcnv_cluster_to_sample_bed_paths.items(),
        key=lambda t: len(t[1])):  #, reverse=True):
    print(f"Processing {cluster_name} which has {len(paths)} samples")
    #for i in range(len(paths)//250):
    #    cluster_df = None
    #    for path in tqdm.tqdm(paths[i*250:(i+1)*250], unit=" paths"):
    cluster_bed_bucket_path = f"gs://seqr-datasets-gcnv/GRCh38/RDG_WES_Broad_Internal/v3/beds/{cluster_name}.bed.gz"
    if hl.hadoop_is_file(f"{cluster_bed_bucket_path}.tbi"):
        print(f"{cluster_bed_bucket_path} already exists. Skipping..")
        continue

    cluster_df = None
    paths.sort(key=lambda path: os.path.basename(path))
    for path in tqdm.tqdm(paths, unit=" paths"):
        sample_name = os.path.basename(path).replace("denoised_copy_ratios-",
                                                     "")
        sample_name = re.sub(".tsv$", "", sample_name)

        column_name = sample_name
        if cluster_df is not None:
            while column_name in set(cluster_df.columns):
                print(f"WARNING: Duplicate sample name: {column_name}  {path}")
                column_name += "_2"
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                 overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:

        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')
        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                 overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            'Writing qc-filtered MT filtered to external MAF to disk...')
        mt.write(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                 overwrite=True)

    # 4. ##### Burden Test ######

    logger.info('Running burden test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()

    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Add cases/controls sample annotations
    tb_sample = get_sample_meta_data()
    mt = (mt.annotate_cols(**tb_sample[mt.s]))

    mt = (mt.filter_cols(mt['phe.is_case'] | mt['phe.is_control']))

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variant into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update dict expr annotations with combinations of variant consequences categories
    score_expr_ann.update({
        'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                          (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                          (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group and convert dict to set where the group is defined
    # (easier to explode and grouping later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # print('Number of samples/variants: ')
    # print(mt.count())

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(mt.GT.is_het()) >= 2,
        homs_chets=(hl.agg.count_where(mt.GT.is_het()) >= 2) |
        (hl.agg.any(mt.GT.is_hom_var()))).repartition(100).persist())
    mts = []

    if args.homs:
        # select homs genotypes.

        mt_homs = (mt_grouped.select_entries(
            mac=mt_grouped.homs).annotate_rows(agg_genotype='homs'))

        mts.append(mt_homs)

    if args.chets:
        # select compound hets (chets) genotypes.
        mt_chets = (mt_grouped.select_entries(
            mac=mt_grouped.chets).annotate_rows(agg_genotype='chets'))

        mts.append(mt_chets)

    if args.homs_chets:
        # select chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.select_entries(
            mac=mt_grouped.homs_chets).annotate_rows(
                agg_genotype='homs_chets'))

        mts.append(mt_homs_chets)

    if args.hets:
        # select hets genotypes
        mt_hets = (mt_grouped.select_entries(
            mac=mt_grouped.hets).annotate_rows(agg_genotype='hets'))

        mts.append(mt_hets)

    ## Joint MatrixTables
    mt_grouped = hl.MatrixTable.union_rows(*mts)

    # Generate table of counts
    tb_gene = (mt_grouped.annotate_rows(
        n_cases=hl.agg.filter(mt_grouped['phe.is_case'],
                              hl.agg.sum(mt_grouped.mac)),
        n_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                  hl.agg.sum(mt_grouped.mac)),
        n_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                     hl.agg.sum(mt_grouped.mac)),
        n_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                 hl.agg.sum(mt_grouped.mac)),
        n_total_cases=hl.agg.filter(mt_grouped['phe.is_case'], hl.agg.count()),
        n_total_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                        hl.agg.count()),
        n_total_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                           hl.agg.count()),
        n_total_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                       hl.agg.count())).rows())

    # run fet stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']

    tbs = []
    for proband in analysis:
        logger.info(f'Running test for {proband}...')
        colCases = None
        colTotalCases = None
        colControls = 'n_controls'
        colTotalControls = 'n_total_controls'
        if proband == 'all_cases':
            colCases = 'n_cases'
            colTotalCases = 'n_total_cases'
        if proband == 'syndromic':
            colCases = 'n_syndromic'
            colTotalCases = 'n_total_syndromic'
        if proband == 'nonsyndromic':
            colCases = 'n_nonsyndromic'
            colTotalCases = 'n_total_nonsyndromic'

        tb_fet = compute_fisher_exact(tb=tb_gene,
                                      n_cases_col=colCases,
                                      n_control_col=colControls,
                                      total_cases_col=colTotalCases,
                                      total_controls_col=colTotalControls,
                                      correct_total_counts=True,
                                      root_col_name='fet',
                                      extra_fields={
                                          'analysis': proband,
                                          'maf': maf_cutoff
                                      })

        # filter out zero-count genes
        tb_fet = (tb_fet.filter(
            hl.sum([tb_fet[colCases], tb_fet[colControls]]) > 0, keep=True))

        tbs.append(tb_fet)

    tb_final = hl.Table.union(*tbs)

    tb_final.describe()

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.fet_burden.{run_hash}.ht'

    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
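
The pipeline above resumes from intermediate MatrixTables by probing for the _SUCCESS marker that Hail writes once a MatrixTable directory is completely written. A minimal sketch of that resume pattern, with a hypothetical helper name read_if_complete:

import hail as hl

def read_if_complete(mt, checkpoint_path):
    # Prefer the on-disk MatrixTable when a previous run finished writing it;
    # a partial write has no _SUCCESS file and is therefore ignored.
    if hl.hadoop_is_file(f'{checkpoint_path}/_SUCCESS'):
        return hl.read_matrix_table(checkpoint_path)
    return mt
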
Exemple #28
    def test_hadoop_is_file(self):
        self.assertTrue(hl.hadoop_is_file(resource('ls_test/f_50')))
        self.assertFalse(hl.hadoop_is_file(resource('ls_test/subdir')))
        self.assertFalse(hl.hadoop_is_file(resource('ls_test/invalid-path')))
def main():
    rnaseq_sample_metadata_df = get_joined_metadata_df()
    #gtex_rnaseq_sample_metadata_df = get_gtex_rnaseq_sample_metadata_df()

    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    grp = p.add_mutually_exclusive_group(required=True)
    grp.add_argument(
        "-b",
        "--rnaseq-batch-name",
        nargs="*",
        help="RNA-seq batch names to process (eg. -b batch1 batch2)",
        choices=set(rnaseq_sample_metadata_df['star_pipeline_batch'])
        | set(["gtex_muscle", "gtex_fibroblasts", "gtex_blood"]))
    grp.add_argument(
        "-s",
        "--rnaseq-sample-id",
        nargs="*",
        help="RNA-seq sample IDs to process (eg. -s sample1 sample2)",
        choices=set(rnaseq_sample_metadata_df['sample_id']) | set([
            'GTEX-1LG7Z-0005-SM-DKPQ6', 'GTEX-PX3G-0006-SM-5SI7E',
            'GTEX-1KXAM-0005-SM-DIPEC'
        ]))
    args = p.parse_args()

    # Generate samples_df with these columns: sample_id, bam_path, bai_path, output_dir, batch_name, sex, RIN, ancestry, etc.
    samples_df = pd.DataFrame()
    if args.rnaseq_batch_name:
        for batch_name in args.rnaseq_batch_name:
            df = rnaseq_sample_metadata_df[
                rnaseq_sample_metadata_df['star_pipeline_batch'] == batch_name]
            samples_df = transfer_metadata_columns_from_df(samples_df, df)

    elif args.rnaseq_sample_id:
        df = rnaseq_sample_metadata_df[
            rnaseq_sample_metadata_df.sample_id.isin(set(
                args.rnaseq_sample_id))]
        samples_df = transfer_metadata_columns_from_df(samples_df, df)
    else:
        p.error("Must specify -b or -s")

    logger.info(
        f"Processing {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}"
    )

    # see https://hail.zulipchat.com/#narrow/stream/223457-Batch-support/topic/auth.20as.20user.20account for more details
    with batch_utils.run_batch(args) as batch:
        for sample_id in samples_df.sample_id:
            metadata_row = samples_df.loc[sample_id]

            # set job inputs & outputs
            input_bam, input_bai = metadata_row['bam_path'], metadata_row[
                'bai_path']
            output_dir = metadata_row['output_dir']

            print("Input bam: ", input_bam)
            output_filename = f"{sample_id}.bigWig"
            output_file_path = os.path.join(output_dir, output_filename)

            # check if output file already exists
            if hl.hadoop_is_file(output_file_path) and not args.force:
                logger.info(
                    f"{sample_id} output file already exists: {output_file_path}. Skipping..."
                )
                continue

            file_stats = hl.hadoop_stat(metadata_row['bam_path'])
            bam_size = int(round(file_stats['size_bytes'] / 10.**9))
            disk_size = bam_size * 2

            j = batch_utils.init_job(batch,
                                     f"bam=>bigWig: {sample_id}",
                                     cpu=args.cpu,
                                     memory=args.memory,
                                     disk_size=disk_size,
                                     image=DOCKER_IMAGE)
            batch_utils.switch_gcloud_auth_to_user_account(
                j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT,
                GCLOUD_PROJECT)

            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bam} {sample_id}.bam"
            )
            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bai} {sample_id}.bam.bai"
            )
            j.command(
                f"gsutil -u {GCLOUD_PROJECT} -m cp gs://gtex-resources/references/GRCh38.chrsizes ."
            )
            j.command(f"touch {sample_id}.bam.bai")

            j.command(f"pwd && ls && date")

            j.command(
                f"python3 /src/bam2coverage.py {sample_id}.bam GRCh38.chrsizes {sample_id}"
            )
            j.command(f"cp {output_filename} {j.output_bigWig}")
            j.command(f"echo Done: {output_file_path}")
            j.command(f"date")

            # copy output
            batch.write_output(j.output_bigWig, output_file_path)

            print("Output file path: ", output_file_path)
def main():
    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    p.add_argument(
        "--metadata-tsv-path",
        default=ALL_METADATA_TSV,
        help="Table with columns: sample_id, bam_path, bai_path, batch")
    p.add_argument("--counts-tsv-path",
                   default=ALL_COUNTS_TSV_GZ,
                   help="Counts .tsv")

    g = p.add_mutually_exclusive_group()
    g.add_argument("--with-gtex",
                   help="Use GTEX controls.",
                   action="store_true")
    g.add_argument(
        "--only-gtex",
        help="Run on just the GTEX control samples to test FP rate.",
        action="store_true")

    p.add_argument("batch_name",
                   nargs="+",
                   choices=ANALYSIS_BATCHES.keys(),
                   help="Name of RNA-seq batch to process")
    args = p.parse_args()

    if not args.force:
        hl.init(log="/dev/null", quiet=True)

    # process samples
    batch_label = f"OUTRIDER"
    if args.with_gtex:
        batch_label += " (with GTEx)"
    batch_label += ": "
    batch_label += ','.join(args.batch_name)
    with batch_utils.run_batch(args, batch_label) as batch:

        for batch_name in args.batch_name:
            batch_dict = ANALYSIS_BATCHES[batch_name]
            batch_tissue = batch_dict['tissue']
            batch_sex = batch_dict['sex']

            c_vector_of_sample_names = 'c("' + '", "'.join(
                batch_dict['samples']) + '")'
            if args.with_gtex:
                batch_include_GTEX_samples = "TRUE"
                batch_name += "_with_GTEX"
            elif args.only_gtex:
                c_vector_of_sample_names = "c()"
                batch_include_GTEX_samples = "TRUE"
                batch_name += "_only_GTEX"
            else:
                batch_include_GTEX_samples = "FALSE"
                batch_name += "_without_GTEX"

            output_file = os.path.join(OUTPUT_BASE_DIR, f"{batch_name}.RDS")

            # skip this batch if its output already exists (check before
            # creating the job so no work is submitted for it)
            if not args.force and hl.hadoop_is_file(output_file):
                logger.info(
                    f"Output file exists: {output_file} . Skipping {batch_name}..."
                )
                continue

            j = batch_utils.init_job(batch,
                                     batch_name,
                                     DOCKER_IMAGE if not args.raw else None,
                                     args.cpu,
                                     args.memory,
                                     disk_size=10)
            batch_utils.switch_gcloud_auth_to_user_account(
                j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT,
                GCLOUD_PROJECT)
            # copy inputs
            j.command(f"""gsutil -m cp {GENCODE_TXDB} .""")
            j.command(
                f"""gsutil -m cp {args.metadata_tsv_path} {args.counts_tsv_path} ."""
            )

            j.command(f"""time xvfb-run Rscript -e '

# outrider 
library(OUTRIDER)
library(annotables)
library(data.table)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(purrr)
library(ggrepel)
library(plotly)
library(stringr)
library(RColorBrewer)
library(ggsci)
library(ggplot2)
library(gtable)
library(grid)
library(gridExtra)

possibleConfounders = c("tissue", "sex", "stranded", "read_length", "batch")    # "RIN"

# input tables generated by ~/project__rnaseq/code/rnaseq_methods/pipelines/gagneurlab/metadata/export_gagneur_metadata_table.py
# batches generated by ~/project__rnaseq/code/rnaseq_methods/pipelines/gagneurlab/metadata/metadata_notebook.py

sampleInfo = fread("{os.path.basename(args.metadata_tsv_path)}")
sampleInfo$read_length = as.character(sampleInfo$read_length)

GTEX_sampleIds = c()
if ({batch_include_GTEX_samples}) {{
    if (("{batch_sex}" == "M") || ("{batch_sex}" == "F")) {{
        GTEX_sampleIds = sampleInfo[(sampleInfo$sex == "{batch_sex}") & (sampleInfo$tissue == "{batch_tissue}") & grepl("GTEX", sampleInfo$sample_id)]$sample_id
    }} else {{
        GTEX_sampleIds = sampleInfo[(sampleInfo$tissue == "{batch_tissue}") & grepl("GTEX", sampleInfo$sample_id)]$sample_id    
    }}
}}


sampleLabel = "{batch_name}_"
sampleSubset = {c_vector_of_sample_names}
sampleSubset = c(sampleSubset, GTEX_sampleIds)
print("sampleSubset: ")
print(sampleSubset)

sampleInfo = sampleInfo[sampleInfo$sample_id %in% sampleSubset]
if (nrow(sampleInfo) != length(sampleSubset)) {{
    print(paste("ERROR: length(sampleInfo) != length(sampleSubset):", length(sampleInfo), length(sampleSubset)))
    quit("yes")
}}

geneReadCounts = fread("{os.path.basename(args.counts_tsv_path)}", select=c("gene_id", sampleSubset))
geneReadCounts = geneReadCounts[!grepl("ERCC", geneReadCounts$gene_id),]


geneIds = geneReadCounts$gene_id
colsMinusGeneId = colnames(geneReadCounts)[!colnames(geneReadCounts) %in% c("gene_id")]
geneReadCounts = geneReadCounts[,..colsMinusGeneId]
rownames(geneReadCounts) = geneIds

cnts = as.matrix(geneReadCounts)
rownames(cnts) = geneIds
ncol(cnts)
nrow(cnts)
if (ncol(cnts) != length(sampleSubset)) {{
    print(paste("ERROR: ncol(cnts) != length(sampleSubset):", ncol(cnts), length(sampleSubset)))
    quit("yes")
}}

sampleInfo[,sampleID:=sample_id]
ods <- OutriderDataSet(countData=cnts, colData=sampleInfo)

txdb <- loadDb("{os.path.basename(GENCODE_TXDB)}")
ods <- filterExpression(ods, gtfFile=txdb, filterGenes=FALSE)   #, fpkmCutoff=100)

g = plotFPKM(ods) + theme_bw() + theme(legend.position="bottom")
ggsave(file=paste(sampleLabel, "_plotFPKM.png", sep=""), g, device="png", type="cairo")

#plotExpressedGenes(ods)

ods <- estimateSizeFactors(ods)
sortedSizeFactors = sort(sizeFactors(ods))
g = ggplot(data=NULL, aes(y=sortedSizeFactors, x=1:ncol(ods))) + 
  geom_point(color="blue", size=1) + 
  labs(x="Sample rank", y="Size factors", title="Size factor distribution") + 
  geom_label_repel(aes(label=ifelse(sortedSizeFactors > 1.5, names(sortedSizeFactors), "")), 
                   nudge_x = -35, box.padding = 0.35, point.padding = 0.5, segment.color = "grey50") +
  geom_label_repel(aes(label=ifelse(sortedSizeFactors < 0.5, names(sortedSizeFactors), "")), 
                   nudge_x = 35, box.padding   = 0.35, point.padding = 0.5, segment.color = "grey50") +
  theme_bw()

ggsave(file=paste(sampleLabel, "_sizeFactors.png", sep=""), g, type="cairo")

print(sort(sizeFactors(ods))[1:5])

print(paste(length(ods), "genes before filtering"))
ods <- ods[mcols(ods)$passedFilter,]
print(paste(length(ods), "genes after filtering"))
plotCountCorHeatmap(ods, colGroups=possibleConfounders, normalized=FALSE, device="pdf", type="cairo", nRowCluster=1, nColCluster=1, filename=paste(sampleLabel, "_plotCountCorHeatmap_before_correction.pdf", sep=""))
plotCountGeneSampleHeatmap(ods, colGroups=possibleConfounders, normalized=FALSE, device="pdf", type="cairo", filename=paste(sampleLabel, "_plotCountGeneSampleHeatmap_before_correction.pdf", sep=""))

if (length(sampleSubset) > 5) {{
    ods = findEncodingDim(ods, BPPARAM=MulticoreParam(4, progressbar=TRUE))
    g = plotEncDimSearch(ods)
    ggsave(file=paste(sampleLabel, "_plotEncDimSearch", ".png", sep=""), g, type="cairo")
    optimal_q = metadata(ods)$opt
}} else {{
    optimal_q = length(sampleSubset)
}}

# increase / decrease by 25%

q = optimal_q
original_ods = ods

ods = OUTRIDER(original_ods, verbose=TRUE, iterations=15, q=q, BPPARAM=MulticoreParam(4, progressbar=TRUE))
saveRDS(ods, paste(sampleLabel, "_ods.RDS", sep=""))

plotCountCorHeatmap(ods, colGroups=possibleConfounders, normalized=TRUE, device="pdf", type="cairo", nRowCluster=1, nColCluster=1, main=paste("Count correlation heatmap q=", q, sep=""), filename=paste(sampleLabel, "_plotCountCorHeatmap_after_correction.pdf", sep=""))

plotCountGeneSampleHeatmap(ods, colGroups=possibleConfounders, normalized=TRUE, device="pdf", type="cairo", main=paste("Count Gene vs Sample Heatmap q=", q, sep=""), filename=paste(sampleLabel, "_plotCountGeneSampleHeatmap_after_correction.pdf", sep=""))

res = results(ods, padjCutoff=1)
res = res[,c("sampleID", "geneID", "pValue", "padjust", "zScore", "rawcounts")][order(padjust),]
res[, "q"] = q
write.table(res, file=paste(sampleLabel, "_ods__", "q", q, "_results.tsv", sep=""), quote=FALSE, sep="\\t", row.names=FALSE)
'""")

            j.command("gzip *.tsv")
            j.command(
                f"gsutil -m cp  *.tsv.gz *.pdf *.png *.RDS {OUTPUT_BASE_DIR}")

            logger.info(f"Output: {output_file}")
Exemple #31
def main():
    p = batch_utils.init_arg_parser(
        default_cpu=4,
        gsa_key_file=os.path.expanduser(
            "~/.config/gcloud/misc-270914-cb9992ec9b25.json"))
    p.add_argument("--with-gtex",
                   help="Use GTEX controls.",
                   action="store_true")
    p.add_argument("--skip-step1",
                   action="store_true",
                   help="Skip count-split-reads step")
    p.add_argument("--skip-step2",
                   action="store_true",
                   help="Skip compute-PSI step")
    p.add_argument("--skip-step3",
                   action="store_true",
                   help="Skip compute-best-Q step")
    p.add_argument("-m1",
                   "--memory-step1",
                   type=float,
                   help="Batch: (optional) memory in gigabytes (eg. 3.75)",
                   default=3.75)
    p.add_argument("-m2",
                   "--memory-step2",
                   type=float,
                   help="Batch: (optional) memory in gigabytes (eg. 3.75)",
                   default=3.75)
    p.add_argument(
        "--metadata-tsv-path",
        default=ALL_METADATA_TSV,
        help="Table with columns: sample_id, bam_path, bai_path, batch")
    p.add_argument("batch_name",
                   nargs="+",
                   choices=ANALYSIS_BATCHES.keys(),
                   help="Name of RNA-seq batch to process")
    args = p.parse_args()

    hl.init(log="/dev/null", quiet=True)

    with hl.hadoop_open(args.metadata_tsv_path) as f:
        samples_df_unmodified = pd.read_table(f).set_index("sample_id",
                                                           drop=False)

    batch_label = f"FRASER"
    if args.with_gtex:
        batch_label += " (with GTEx)"
    batch_label += ": "
    batch_label += ','.join(args.batch_name)
    with batch_utils.run_batch(args, batch_label) as batch:

        for batch_name in args.batch_name:
            samples_df = samples_df_unmodified
            batch_dict = ANALYSIS_BATCHES[batch_name]
            batch_tissue = batch_dict['tissue']
            batch_sex = batch_dict['sex']

            sample_ids = list(batch_dict['samples'])
            if args.with_gtex:
                batch_name += "_with_GTEX"
                samples_df_filter = (samples_df.tissue == batch_tissue)
                samples_df_filter &= samples_df.sample_id.str.startswith(
                    "GTEX")
                if batch_sex == "M" or batch_sex == "F":
                    samples_df_filter &= (samples_df.sex == batch_sex)
                sample_ids += list(samples_df[samples_df_filter].sample_id)
            else:
                batch_name += "_without_GTEX"

            samples_df = samples_df.loc[sample_ids]
            byte_string = ", ".join(sorted(samples_df.sample_id)).encode()
            h = hashlib.md5(byte_string).hexdigest().upper()
            sample_set_label = f"{batch_name}__{len(samples_df.sample_id)}_samples_{h[:10]}"

            logger.info(
                f"Processing {sample_set_label}: {len(samples_df)} sample ids: {', '.join(samples_df.sample_id[:20])}"
            )

            split_reads_samples = []

            split_reads_output_files = []
            split_reads_jobs = {}

            non_split_reads_output_files = []
            non_split_reads_jobs = {}

            j_extract_splice_junctions = None
            j_calculate_psi_values = None
            j_calculate_best_q = None

            # based on docs @ https://bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf
            # step 1: count spliced reads
            # step 2: count non-spliced reads at acceptors & donors of splice junctions detected in step 1
            for step in 1, 2:
                for sample_id in samples_df.sample_id:
                    metadata_row = samples_df.loc[sample_id]

                    # set job inputs & outputs
                    input_bam, input_bai = metadata_row[
                        'bam_path'], metadata_row['bai_path']
                    if "GTEX" in sample_id:
                        output_dir_for_sample_specific_data = "gs://macarthurlab-rnaseq/gtex_v8/fraser_count_rna/"
                    else:
                        output_dir_for_sample_specific_data = f"gs://macarthurlab-rnaseq/{metadata_row['batch']}/fraser_count_rna/"

                    output_dir_for_batch_specific_data = f"gs://macarthurlab-rnaseq/gagneur/fraser/results/{sample_set_label}"

                    output_file_path_splice_junctions_RDS = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"spliceJunctions_{sample_set_label}.RDS")
                    output_file_path_calculated_psi_values_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"calculatedPSIValues_{sample_set_label}.tar.gz")
                    output_file_path_calculated_best_q_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"calculatedBestQ_{sample_set_label}.tar.gz")
                    output_file_path_fraser_analysis_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"fraserAnalysis_using_PCA_{sample_set_label}.tar.gz")
                    output_file_path_fraser_analysis_results_only_tar_gz = os.path.join(
                        output_dir_for_batch_specific_data,
                        f"fraserAnalysis_using_PCA_{sample_set_label}_results_only.tar.gz"
                    )

                    print("Input bam: ", input_bam)
                    if step == 1:
                        output_file_path = os.path.join(
                            output_dir_for_sample_specific_data,
                            f"fraser_count_split_reads_{sample_id}.tar.gz")
                        memory = args.memory_step1
                    elif step == 2:
                        output_file_path = os.path.join(
                            output_dir_for_batch_specific_data,
                            f"fraser_count_non_split_reads_{sample_id}__{sample_set_label}.tar.gz"
                        )
                        memory = args.memory_step2

                    if step == 1:
                        split_reads_samples.append(sample_id)
                        split_reads_output_files.append(output_file_path)
                    elif step == 2:
                        non_split_reads_output_files.append(output_file_path)

                    if (step == 1
                            and args.skip_step1) or (step == 2
                                                     and args.skip_step2):
                        continue

                    # check if output file already exists
                    if not args.force and hl.hadoop_is_file(output_file_path):
                        logger.info(
                            f"{sample_id} output file already exists: {output_file_path}. Skipping..."
                        )
                        continue

                    if not args.local:
                        file_stats = hl.hadoop_stat(metadata_row['bam_path'])
                        bam_size = int(round(file_stats['size_bytes'] /
                                             10.**9))
                        disk_size = bam_size * 2
                    else:
                        disk_size = None

                    job_label = f"Count {'split' if step == 1 else 'non-split'} reads"
                    j = batch_utils.init_job(batch,
                                             f"{job_label}: {sample_id}",
                                             cpu=args.cpu,
                                             memory=memory,
                                             disk_size=disk_size,
                                             image=DOCKER_IMAGE)
                    batch_utils.switch_gcloud_auth_to_user_account(
                        j, GCLOUD_CREDENTIALS_LOCATION, GCLOUD_USER_ACCOUNT,
                        GCLOUD_PROJECT)

                    j.command(
                        f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bam} {sample_id}.bam"
                    )
                    j.command(
                        f"gsutil -u {GCLOUD_PROJECT} -m cp {input_bai} {sample_id}.bam.bai"
                    )
                    j.command(f"touch {sample_id}.bam.bai")
                    bam_path = f"{sample_id}.bam"

                    j.command(f"pwd && ls -lh && date")

                    if step == 1:
                        # count split reads
                        j.command(f"""time xvfb-run Rscript -e '
library(FRASER)
library(data.table)

sampleTable = data.table(sampleID=c("{sample_id}"), bamFile=c("{bam_path}"))
print(sampleTable)
fds = FraserDataSet(colData=sampleTable, workingDir=".", bamParam=ScanBamParam(mapqFilter=0), strandSpecific=0L)

getSplitReadCountsForAllSamples(fds)  # saves results to cache/
'""")
                    elif step == 2:
                        if sample_id in split_reads_jobs:
                            j.depends_on(split_reads_jobs[sample_id])
                        if j_extract_splice_junctions:
                            j.depends_on(j_extract_splice_junctions)

                        j.command(
                            f"gsutil -m cp {output_file_path_splice_junctions_RDS} ."
                        )

                        # count non-split reads
                        j.command(f"""time xvfb-run Rscript -e '
library(FRASER)
library(data.table)

spliceJunctions = readRDS("{os.path.basename(output_file_path_splice_junctions_RDS)}")

sampleTable = data.table(sampleID=c("{sample_id}"), bamFile=c("{bam_path}"))
print(sampleTable)

fds = FraserDataSet(colData=sampleTable, workingDir=".", bamParam=ScanBamParam(mapqFilter=0), strandSpecific=0L)

getNonSplitReadCountsForAllSamples(fds, spliceJunctions)  # saves results to cache/
'""")
                    j.command(f"ls -lh .")
                    j.command(
                        f"tar czf {os.path.basename(output_file_path)} cache")
                    j.command(
                        f"gsutil -m cp {os.path.basename(output_file_path)} {output_file_path}"
                    )

                    j.command(f"echo Done: {output_file_path}")
                    j.command(f"date")

                    print("Output file path: ", output_file_path)

                    if step == 1:
                        split_reads_jobs[sample_id] = j
                    elif step == 2:
                        non_split_reads_jobs[sample_id] = j

                if len(split_reads_output_files) == 0:
                    break

                if step == 1 and not args.skip_step1:
                    if hl.hadoop_is_file(output_file_path_splice_junctions_RDS
                                         ) and not args.force:
                        logger.info(
                            f"{output_file_path_splice_junctions_RDS} file already exists. Skipping extractSpliceJunctions step..."
                        )
                        continue

                    j_extract_splice_junctions = batch_utils.init_job(
                        batch,
                        f"{sample_set_label}: Extract splice-junctions",
                        disk_size=30,
                        memory=60,
                        image=DOCKER_IMAGE)
                    for j in split_reads_jobs.values():
                        j_extract_splice_junctions.depends_on(j)

                    extract_splice_junctions(
                        j_extract_splice_junctions, split_reads_output_files,
                        args.cpu, output_file_path_splice_junctions_RDS)

                elif step == 2 and not args.skip_step2:
                    if hl.hadoop_is_file(
                            output_file_path_calculated_psi_values_tar_gz
                    ) and not args.force:
                        logger.info(
                            f"{output_file_path_calculated_psi_values_tar_gz} file already exists. Skipping calculatePSIValues step..."
                        )
                        continue

                    num_cpu = 4 if args.local else 16
                    memory = 60
                    j_calculate_psi_values = batch_utils.init_job(
                        batch,
                        f"{sample_set_label}: Calculate PSI values",
                        disk_size=50,
                        cpu=num_cpu,
                        memory=memory,
                        image=DOCKER_IMAGE)
                    if j_extract_splice_junctions:
                        j_calculate_psi_values.depends_on(
                            j_extract_splice_junctions)
                    for j in non_split_reads_jobs.values():
                        j_calculate_psi_values.depends_on(j)

                    calculate_psi_values(
                        j_calculate_psi_values, sample_set_label,
                        split_reads_output_files, non_split_reads_output_files,
                        output_file_path_splice_junctions_RDS,
                        args.metadata_tsv_path, num_cpu,
                        output_file_path_calculated_psi_values_tar_gz)

            # compute Best Q
            if args.skip_step3:
                logger.info(f"Skipping calculatedBestQ step...")
            elif hl.hadoop_is_file(output_file_path_calculated_best_q_tar_gz
                                   ) and not args.force:
                logger.info(
                    f"{output_file_path_calculated_best_q_tar_gz} file already exists. Skipping calculatedBestQ step..."
                )
            else:
                num_cpu = 4 if args.local else 16
                memory = 3.75 * num_cpu

                j_calculate_best_q = batch_utils.init_job(
                    batch,
                    f"{sample_set_label}: Calculate Best Q",
                    disk_size=50,
                    cpu=num_cpu,
                    memory=memory,
                    image=DOCKER_IMAGE)

                if j_calculate_psi_values:
                    j_calculate_best_q.depends_on(j_calculate_psi_values)

                calculate_best_q(
                    j_calculate_best_q, sample_set_label, 4,
                    output_file_path_calculated_psi_values_tar_gz,
                    output_file_path_calculated_best_q_tar_gz)

            # output_file_path_fraser_analysis_tar_gz
            if hl.hadoop_is_file(
                    output_file_path_fraser_analysis_results_only_tar_gz
            ) and not args.force:
                logger.info(
                    f"{output_file_path_fraser_analysis_results_only_tar_gz} file already exists. Skipping run_fraser_analysis step..."
                )
            else:
                num_cpu = 4 if args.local else 16
                memory = 3.75 * num_cpu

                j_fraser_analysis = batch_utils.init_job(
                    batch,
                    f"{sample_set_label}: Run Fraser Analysis",
                    disk_size=50,
                    cpu=num_cpu,
                    memory=memory,
                    image=DOCKER_IMAGE)
                if j_calculate_best_q:
                    j_fraser_analysis.depends_on(j_calculate_best_q)

                run_fraser_analysis(
                    j_fraser_analysis, sample_set_label, 4,
                    output_file_path_calculated_best_q_tar_gz,
                    output_file_path_fraser_analysis_tar_gz,
                    output_file_path_fraser_analysis_results_only_tar_gz)
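
One detail in the FRASER example worth isolating is how its batch-level output paths stay stable across reruns: sample_set_label hashes the sorted sample IDs, so the same sample set always maps to the same bucket prefix and the hadoop_is_file guards can skip stages that already ran. A minimal sketch of that labeling, with an illustrative helper name:

import hashlib

def make_sample_set_label(batch_name, sample_ids):
    # A stable digest of the sorted sample IDs keeps output paths
    # deterministic for a given set of samples.
    digest = hashlib.md5(", ".join(sorted(sample_ids)).encode()).hexdigest().upper()
    return f"{batch_name}__{len(sample_ids)}_samples_{digest[:10]}"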