def main():
    """Draw the Seq2C plot for one sample, optionally restricted to key genes.

    Reads the Seq2C results path (``--seq2c-results``) and an optional list
    of key gene names (``--key-genes``, one name per line, blank lines
    ignored), then delegates the drawing to ``draw_seq2c_plot``.  The path of
    the saved plot is logged when the drawing function returns one.
    """
    cnf = read_opts_and_cnfs(
        description='Plotting Seq2C results.',
        extra_opts=[
            (['--seq2c-results'], dict(dest='seq2c_tsv_fpath')),
            (['--key-genes'], dict(dest='key_genes_fpath')),
        ],
        required_keys=['seq2c_tsv_fpath', 'output_dir'],
        # Entries here have to be option dests; 'key_genes' is not one, so the
        # key genes path would never be picked up as a file key.
        # NOTE(review): presumably file keys get path normalisation/validation
        # inside read_opts_and_cnfs - confirm against that helper.
        file_keys=['seq2c_tsv_fpath', 'key_genes_fpath'],
        key_for_sample_name=None,
    )

    check_system_resources(cnf)
    check_genome_resources(cnf)

    key_gene_names = None
    if cnf.key_genes_fpath:
        with open(cnf.key_genes_fpath) as genes_file:
            # One gene name per line; whitespace-only lines are skipped.
            key_gene_names = set(
                line.strip() for line in genes_file if line.strip())

    plot_fpath = draw_seq2c_plot(
        cnf, cnf.seq2c_tsv_fpath, cnf.sample, cnf.output_dir, key_gene_names)
    if plot_fpath:
        info('Saved plot to ' + plot_fpath)
def main(args):
    """Entry point of the variant annotation step.

    Parses the command line, checks the required tools and genome resources,
    runs the per-sample processing via ``run_one`` and finally removes the
    work directory unless ``keep_intermediate`` is set.

    ``args`` is accepted for interface compatibility; it is not read here.
    """
    annotation_opts = [
        (['--vcf', '--var'], dict(
            dest='vcf',
            help='variants to annotate')),
        (['--bam'], dict(
            dest='bam',
            help='(outdated) used to generate some annotations by GATK')),
        (['--match-normal-sample-name'], dict(
            dest='match_normal_normal_name')),
        (['--clinical-reporting'], dict(
            dest='clinical_reporting',
            help='used to generate some annotations by GATK',
            action='store_true',
            default=None)),
        # --qc / --no-qc toggle the same destination; QC is on by default.
        (['--qc'], dict(
            dest='qc',
            action='store_true',
            default=True,
            help=SUPPRESS_HELP)),
        (['--no-qc'], dict(
            dest='qc',
            action='store_false',
            help=SUPPRESS_HELP)),
    ]

    cnf = read_opts_and_cnfs(
        extra_opts=annotation_opts,
        required_keys=['vcf'],
        file_keys=['bam', 'vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varannotate_name)

    check_system_resources(
        cnf,
        required=['java', 'perl', 'snpeff'],
        optional=['transcripts_fpath'])
    check_genome_resources(cnf)

    run_one(cnf, process_one, finalize_one)

    # Intermediate files are only kept on explicit request.
    if cnf['keep_intermediate']:
        return
    shutil.rmtree(cnf['work_dir'])
def main():
    """Build the clinical report for a single sample.

    Collects the paths of the upstream results from the command line, checks
    genome and system resources, assembles the clinical sample info from the
    configuration and renders the HTML report.  The work directory is removed
    afterwards unless ``keep_intermediate`` is set.
    """
    report_opts = [
        (['--targqc-dir'], dict(dest='targqc_dirpath')),
        (['--mutations'], dict(dest='mutations_fpath')),
        (['--sv'], dict(dest='sv_fpath')),
        (['--sv-vcf'], dict(dest='sv_vcf_fpath')),
        (['--varqc'], dict(dest='varqc_json_fpath')),
        (['--varqc-after'], dict(dest='varqc_after_json_fpath')),
        (['--target-type'], dict(dest='target_type', default='panel')),
        (['--bed'], dict(dest='bed_fpath')),
        (['--seq2c'], dict(dest='seq2c_tsv_fpath')),
        (['--project-level-report'], dict(dest='project_report_path')),
        (['--targqc-html'], dict(dest='targqc_report_path')),
        (['--match'], dict(dest='match_sample_name')),
        (['--jira'], dict(dest='jira_url')),
    ]

    # 'project_report_path' is deliberately NOT listed: the project level
    # report might not exist yet at the time this script runs.
    # NOTE(review): an older comment also warned against checking
    # mutations_fpath (it could be any of vardict.PASS.txt,
    # vardict-java.PASS.txt, vardict.single.PASS.txt, vardict.paired.PASS.txt),
    # yet the key is listed here - confirm which one is intended.
    checked_file_keys = [
        'mutations_fpath',
        'varqc_json_fpath',
        'varqc_after_json_fpath',
        'bed_fpath',
        'seq2c_tsv_fpath',
        'sv_fpath',
        'sv_vcf_fpath',
    ]

    cnf = read_opts_and_cnfs(
        extra_opts=report_opts,
        key_for_sample_name=None,
        required_keys=[],
        file_keys=checked_file_keys,
        dir_keys=['targqc_dirpath'],
    )

    check_genome_resources(cnf)
    check_system_resources(cnf, required=['bedtools'], optional=[])

    clin_info = clinical_sample_info_from_cnf(cnf)
    target_html = clin_info.sample.clinical_html
    html_fpath = make_clinical_report(cnf, clin_info, target_html)
    info('Clinical report: ' + html_fpath)

    if cnf['keep_intermediate']:
        return
    shutil.rmtree(cnf['work_dir'])
def main():
    """Run ``qualimap bamqc`` on a BAM file and log the report location.

    The BAM is indexed first.  When a BED file is given it is passed to
    Qualimap through ``-gff``.  The Java heap size is taken from
    ``get_qualimap_max_mem`` and passed as megabytes.
    """
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--bam'], dict(
                dest='bam',
                help='path to the BAM file')),
            (['--bed', '--capture', '--amplicons'], dict(
                dest='bed',
                help='capture panel/amplicons')),
            # NOTE(review): this flag is parsed but never read below, and
            # --skip-duplicated is always passed - confirm that is intended.
            (['--pcr'], dict(
                dest='pcr',
                action='store_true',
                help='deduplication was not perfomed, thus do not try to dedup')),
        ],
        required_keys=['bam'],
        file_keys=['bam', 'bed'],
        key_for_sample_name='bam',
        proc_name=BCBioStructure.qualimap_name)

    index_bam(cnf, cnf.bam)
    info('Using alignment ' + cnf.bam)

    if cnf.bed:
        bed_opt = ' -gff ' + cnf.bed + ' '
        info('Using amplicons/capture panel ' + cnf.bed)
    else:
        bed_opt = ''

    qualimap = get_system_path(cnf, 'qualimap', is_critical=True)
    if not qualimap:
        critical('Cannot find qualimap')
    info()

    max_mem_mb = int(get_qualimap_max_mem(cnf.bam))
    mem_opt = ' --java-mem-size=' + str(max_mem_mb) + 'M'

    cmdline_template = (
        '{qualimap} bamqc --skip-duplicated -nt {threads}{mem_opt} -nr 5000 '
        '-bam {bam} -outdir {output_dir} {bed_opt} -c -gd HUMAN')
    cmdline = cmdline_template.format(
        qualimap=qualimap,
        threads=cnf.threads,
        mem_opt=mem_opt,
        bam=cnf.bam,
        output_dir=cnf.output_dir,
        bed_opt=bed_opt)

    report_fpath = join(cnf.output_dir, 'qualimapReport.html')
    call(cnf, cmdline,
         output_fpath=report_fpath,
         stdout_to_outputfile=False,
         env_vars=dict(DISPLAY=None))
    info('Qualimap report: ' + str(report_fpath))
def proc_args(argv):
    """Parse the command line and return the resulting configuration.

    A BAM file (``--bam``) and a genome are mandatory; ``critical`` is called
    with an explanatory message when either is missing.

    ``argv`` is accepted for interface compatibility; it is not read here.
    """
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--bam'], dict(dest='bam')),
        ],
        required_keys=['bam'],
        file_keys=['bam'],
    )

    check_genome_resources(cnf)

    mandatory = (
        ('bam', 'No bam file provided to input'),
        ('genome', 'Please, specify the --genome option (e.g. --genome hg19)'),
    )
    for key, complaint in mandatory:
        if not getattr(cnf, key):
            critical(complaint)

    return cnf
def main(args):
    """Entry point of the variant QC step.

    Parses the command line, checks system and genome resources, runs the
    per-sample processing via ``run_one`` and removes the work directory
    unless ``keep_intermediate`` is set.

    ``args`` is accepted for interface compatibility; it is not read here.
    """
    qc_opts = [
        (['--var', '--vcf'], dict(
            dest='vcf',
            help='variants to evaluate')),
    ]

    cnf = read_opts_and_cnfs(
        extra_opts=qc_opts,
        required_keys=['vcf'],
        file_keys=['vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varqc_name,
    )

    check_system_resources(cnf)
    check_genome_resources(cnf)

    info('Using variants ' + cnf['vcf'])

    run_one(cnf, process_one, finalize_one)

    if cnf['keep_intermediate']:
        return
    shutil.rmtree(cnf['work_dir'])
def main(args):
    """Entry point of the target coverage step for a single BAM file.

    Parses the command line, resolves the features BED (command line value
    first, genome default otherwise), verifies the inputs, runs
    ``process_one`` and logs where the resulting reports were written.  The
    work directory is removed afterwards unless ``keep_intermediate`` is set.

    ``args`` is accepted for interface compatibility; it is not read here.
    """
    padding_help = (
        'integer indicating the number of bases to extend each target region up and down-stream. '
        'Default is ' + str(defaults['coverage_reports']['padding']))

    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--bam'], dict(
                dest='bam',
                help='a path to the BAM file to study')),
            (['-1'], dict(dest='l_fpath')),
            (['-2'], dict(dest='r_fpath')),
            (['--bed', '--capture', '--amplicons'], dict(
                dest='bed',
                help='a BED file for capture panel or amplicons')),
            (['--exons', '--exome', '--features'], dict(
                dest='features',
                help='a BED file with real CDS/Exon/Gene/Transcript regions with annotations (default "features" is in system_config)')),
            (['--exons-no-genes', '--features-no-genes'], dict(
                dest='features_no_genes',
                help='a BED file with real CDS/Exon regions with annotations, w/o Gene/Transcript records (default "features" is in system_config)')),
            (['--original-bed'], dict(
                dest='original_target_bed',
                help=SUPPRESS_HELP)),
            (['--original-exons', '--original-features'], dict(
                dest='original_features_bed',
                help='original features genes bed file path (just for reporting)')),
            (['--reannotate'], dict(
                dest='reannotate',
                help='re-annotate BED file with gene names',
                action='store_true',
                default=False)),
            (['--no-prep-bed'], dict(
                dest='prep_bed',
                help='do not fix input beds and exons',
                action='store_false',
                default=True)),
            (['-e', '--extended'], dict(
                dest='extended',
                help='extended - flagged regions and missed variants',
                action='store_true',
                default=False)),
            (['--genes'], dict(
                dest='genes',
                help='custom list of genes')),
            (['--padding'], dict(
                dest='padding',
                help=padding_help,
                type='int')),
            (['--no-dedup'], dict(
                dest='no_dedup',
                action='store_true',
                help=SUPPRESS_HELP)),
            (['--downsample-to'], dict(
                dest='downsample_to',
                type='int',
                help=SUPPRESS_HELP)),
            (['--downsampled'], dict(
                dest='downsampled',
                action='store_true',
                help=SUPPRESS_HELP)),
            (['--fastqc-dirpath'], dict(
                dest='fastqc_dirpath',
                help=SUPPRESS_HELP)),
        ],
        file_keys=['bam', 'l_fpath', 'r_fpath', 'bed'],
        key_for_sample_name='bam')

    # Compare against None rather than truthiness so that an explicit
    # "--padding 0" overrides the configured default instead of being ignored.
    if cnf.padding is not None:
        cnf.coverage_reports.padding = cnf.padding

    check_system_resources(cnf, required=['bedtools'], optional=[])
    check_genome_resources(cnf)

    # The command line value wins; otherwise fall back to the genome default.
    features_bed = adjust_path(cnf.features or cnf.genome.features)
    if features_bed:
        info('Features: ' + features_bed)
        features_bed = verify_file(features_bed)
    else:
        info('No features BED found')

    if cnf.bed:
        cnf.bed = verify_file(cnf.bed, is_critical=True)
        info('Using amplicons/capture panel ' + cnf.bed)
    elif features_bed:
        info('WGS, taking CDS as target')

    cnf.bam = verify_bam(cnf.bam, is_critical=True)

    reports = process_one(
        cnf, cnf.output_dir, cnf.bam,
        features_bed=features_bed,
        features_no_genes_bed=cnf.features_no_genes)
    summary_report, gene_report = reports[:2]

    info('')
    info('*' * 70)
    if summary_report.txt_fpath:
        info('Summary report: ' + summary_report.txt_fpath)

    if gene_report and gene_report.txt_fpath:
        info('All regions: ' + gene_report.txt_fpath +
             ' (' + str(len(gene_report.rows)) + ' regions)')

    # A third report, when present, holds the flagged regions.
    if len(reports) > 2:
        selected_regions_report = reports[2]
        if selected_regions_report.txt_fpath:
            info('Flagged regions: ' + selected_regions_report.txt_fpath +
                 ' (' + str(len(selected_regions_report.rows)) + ' regions)')

    # Sanity-check every result that is given as a plain path (or a list of
    # them); entries of any other type are not inspected.
    for fpaths in reports:
        if not fpaths:
            continue
        ok = True
        info('Checking expected results...')
        if not isinstance(fpaths, list):
            fpaths = [fpaths]
        for fpath in fpaths:
            if isinstance(fpath, basestring) and not verify_file(fpath):
                ok = False
        if ok:
            info('The results are good.')

    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])
def main():
    """Annotate a BED file using the reference features annotation.

    Usage: ``<script> Input_BED_file -g hg19 -o Annotated_BED_file``.

    The input regions are annotated feature type by feature type, in the
    order CDS, Exon, Transcript, Gene.  After each pass only the regions that
    were left unannotated (the off-targets) are carried over to the next
    pass.  Regions still unannotated at the end are appended as they are, and
    everything is written to ``cnf.output_file`` sorted by ``get_key()``.
    """
    # The input BED is a mandatory first positional argument.  Check the
    # argument count itself: len(sys.argv[1]) can never be negative and
    # indexing raises IndexError when the argument is missing.
    if len(sys.argv) < 2:
        critical('Usage: ' + __file__ + ' Input_BED_file -g hg19 -o Annotated_BED_file')
    input_bed_fpath = verify_bed(
        sys.argv[1], is_critical=True,
        description='Input BED file for ' + __file__)

    cnf = read_opts_and_cnfs(
        description='Annotating BED file based on reference features annotations.',
        extra_opts=[
            (['--reference'], dict(dest='reference')),
        ],
        required_keys=['output_file'],
        file_keys=['reference'],
        key_for_sample_name=None,
        fpath_for_sample_name=input_bed_fpath,
        main_output_is_file=True)

    check_system_resources(cnf)
    check_genome_resources(cnf)

    chr_order = get_chrom_order(cnf)

    features_fpath = adjust_path(cnf.genome.bed_annotation_features)
    if not verify_bed(features_fpath, 'Annotated reference BED file'):
        critical('Annotated reference is required')

    # Only the coordinates of the input regions are used for the annotation.
    bed = BedTool(input_bed_fpath).cut([0, 1, 2])

    info()

    annotated = None
    off_targets = None

    for feature in ['CDS', 'Exon', 'Transcript', 'Gene']:
        if not bed:
            continue

        info('Extracting ' + feature + ' features from ' + features_fpath)
        # Column 7 (index 6) of the reference holds the feature type.
        features_bed = BedTool(features_fpath).filter(
            lambda x: x[6] == feature)

        info('Annotating based on ' + feature)
        new_annotated, off_targets = _annotate(
            cnf, bed, features_bed, chr_order)
        if not annotated:
            annotated = new_annotated
            for a in annotated:
                a.feature = feature
        else:
            annotated.extend(new_annotated)

        # The next feature type only sees what is still unannotated.
        if off_targets:
            bed = BedTool([(r.chrom, r.start, r.end) for r in off_targets])
            info()

    # Keep whatever stayed unannotated after the last pass.
    if annotated is not None and off_targets is not None:
        annotated.extend(off_targets)

    info()
    info('Saving annotated regions to ' + str(cnf.output_file))
    with open(cnf.output_file, 'w') as out:
        for region in sorted(annotated, key=lambda r: r.get_key()):
            out.write(str(region))

    info('Done.')
def main(args):
    """Entry point of the post-annotation variant filtering step.

    Steps, in order:

    1. Obtain the vcf2txt table: either run vcf2txt on the input VCF, or,
       when ``--vcf2txt`` is given, copy the header and the rows of the
       current sample out of an existing vcf2txt file.
    2. Run vardict2mut on that table to get the passed mutations.
    3. On success, write and index the filtered VCFs and, unless QC is
       disabled, build and save the QC report.
    4. Remove the work directory unless ``keep_intermediate`` is set.

    ``args`` is accepted for interface compatibility; it is not read here.
    """
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--vcf', '--var'], dict(
                dest='vcf',
                help='variants to filter')),
            (['--vcf2txt'], dict(
                dest='vcf2txt',
                help='variants in vcf2txt to filter')),
            (['--cohort-freqs'], dict(
                dest='cohort_freqs_fpath',
                help='frequencies of variants in a cohort')),
            (['--qc'], dict(
                dest='qc',
                action='store_true',
                default=True,
                help=SUPPRESS_HELP)),
            (['--no-qc'], dict(
                dest='qc',
                action='store_false',
                help=SUPPRESS_HELP)),
            (['--no-tsv'], dict(
                dest='tsv',
                action='store_false',
                default=True,
                help=SUPPRESS_HELP)),
        ],
        required_keys=['vcf'],
        file_keys=['vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varfilter_name + '_post')

    check_system_resources(cnf, required=['perl'])
    check_genome_resources(cnf)

    if not cnf.output_file:
        cnf.output_file = join(cnf.output_dir, (cnf.caller or 'variants') + '.txt')

    safe_mkdir(dirname(cnf.output_file))
    safe_mkdir(cnf.output_dir)

    if cnf.vcf.endswith(('.vcf.gz', '.vcf')):
        verify_vcf(cnf.vcf, is_critical=True)

    if not cnf.vcf2txt:
        vcf2txt_res_fpath = run_vcf2txt(cnf, {cnf.sample: cnf.vcf}, cnf.output_file)
        if not vcf2txt_res_fpath:
            critical('vcf2txt run returned non-0')
        info('Saved vcf2txt output to ' + vcf2txt_res_fpath)
    else:
        cnf.vcf2txt = verify_file(cnf.vcf2txt, is_critical=True)
        info('Input is vcf2txt output, grepping by sample name ' + cnf.sample)
        vcf2txt_res_fpath = cnf.output_file
        with file_transaction(cnf.work_dir, vcf2txt_res_fpath) as tx:
            with open(cnf.vcf2txt) as inp, open(tx, 'w') as out:
                for i, line in enumerate(inp):
                    if not line.strip():
                        continue
                    # Keep the header (first line) and this sample's rows;
                    # the sample name is the first tab-separated column.
                    if i == 0 or line.split('\t')[0] == cnf.sample:
                        out.write(line)
    info('Using vcf2txt from ' + vcf2txt_res_fpath)

    mut_fpath = run_vardict2mut(
        cnf, vcf2txt_res_fpath,
        add_suffix(vcf2txt_res_fpath, variant_filtering.mut_pass_suffix))

    # Stays None when vardict2mut fails, so that the closing log line below
    # does not reference a sample object that was never created.
    var_s = None
    if not mut_fpath:
        err('vardict2mut failed')
    else:
        info('Saved passed mutations to ' + mut_fpath)

        var_s = source.VarSample(cnf.sample, cnf.output_dir)
        var_s.anno_vcf_fpath = cnf.vcf
        var_s.varfilter_dirpath = var_s.dirpath

        # File names are derived from the uncompressed name of the input VCF.
        if var_s.anno_vcf_fpath.endswith('.gz'):
            ungz_anno_vcf_fpath = splitext(var_s.anno_vcf_fpath)[0]
        else:
            ungz_anno_vcf_fpath = var_s.anno_vcf_fpath
        ungz_filt_vcf_fpath = join(
            cnf.output_dir, add_suffix(basename(ungz_anno_vcf_fpath), 'filt'))
        var_s.filt_vcf_fpath = ungz_filt_vcf_fpath + '.gz'

        var_s.variants_fpath = vcf2txt_res_fpath
        var_s.variants_pass_fpath = add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix)

        ungz_pass_filt_vcf_fpath = add_suffix(ungz_filt_vcf_fpath, 'pass')
        var_s.pass_filt_vcf_fpath = add_suffix(var_s.filt_vcf_fpath, 'pass')

        filt_vcf = write_vcf(
            cnf, var_s, cnf.output_dir, cnf.caller, vcf2txt_res_fpath, mut_fpath)
        index_vcf(cnf, var_s.name, filt_vcf, cnf.caller)
        index_vcf(cnf, var_s.name, ungz_pass_filt_vcf_fpath, cnf.caller)

        if cnf.qc:
            report = qc.make_report(cnf, var_s.pass_filt_vcf_fpath, var_s)
            qc_dirpath = join(cnf.output_dir, 'qc')
            safe_mkdir(qc_dirpath)
            qc.save_report(
                cnf, report, var_s, cnf.caller, qc_dirpath, source.varqc_after_name)
            info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
            info('-' * 70)
            info()

    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])

    info()
    info('*' * 70)
    if var_s is not None:
        info('Done filtering ' + var_s.name)