def check_genome_resources(cnf):
    if cnf.genome is None:
        critical('Please, specify genome build (one of available in ' + cnf.sys_cnf +
                 ') using the --genome option (e.g., --genome hg38).')
    if not cnf.genomes:
        critical('"genomes" section is not specified in system config ' + cnf.sys_cnf)
    info('Genome: ' + str(cnf.genome.name))

    for key in cnf.genome.keys():
        if key != 'name' and isinstance(cnf.genome[key], basestring):
            cnf.genome[key] = adjust_system_path(cnf.genome[key])
            if not verify_obj_by_path(cnf.genome[key], key, silent=True):
                # Fall back to a bgzipped version of the resource if one exists
                if not cnf.genome[key].endswith('.gz'):
                    gz_fpath = cnf.genome[key] + '.gz'
                    if verify_file(gz_fpath, silent=True):
                        cnf.genome[key] = gz_fpath

    if not cnf.genome.features or not cnf.genome.bed_annotation_features or not cnf.genome.cds:
        warn('Warning: "features", "bed_annotation_features" and "cds" must be specified '
             'in the system config (' + cnf.sys_cnf + ').')

    if not cnf.transcripts_fpath:
        cnf.transcripts_fpath = get_canonical_transcripts(cnf.genome.name, ensembl=True)

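# For illustration, a minimal (hypothetical) "genomes" section of the system
# config that this check expects -- every key except "name" is treated as a
# path and resolved with a .gz fallback:
#
#   genomes:
#     hg38:
#       seq:                     /path/to/hg38.fa
#       features:                /path/to/hg38.features.bed
#       bed_annotation_features: /path/to/hg38.ann_features.bed
#       cds:                     /path/to/hg38.cds.bed
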
def main():
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(usage='Usage: ' + basename(__file__) +
                          ' --chr chr --vcf VCF_file --samples Sample1,Sample2 '
                          '--bams BAM_file1,BAM_file2 -o Output_directory '
                          '--features BED_file')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir')
    parser.add_option('--samples', dest='sample_names')
    parser.add_option('--bams', dest='bams')
    parser.add_option('--vcf', dest='vcf_fpath')
    parser.add_option('--chr', dest='chrom')
    parser.add_option('--features', dest='features',
                      help='BED file with real CDS/Exon/Gene/Transcript regions with '
                           'annotations (default "features" is in system_config)')
    (opts, args) = parser.parse_args(sys.argv[1:])

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})
    cnf.verbose = False

    if not cnf.output_dir or not cnf.vcf_fpath or not cnf.chrom:
        critical(parser.usage)

    cnf.features = cnf.features or cnf.genome.features

    samples = [BaseSample(sample_name, None, bam=bam)
               for (sample_name, bam) in zip(cnf.sample_names.split(','), cnf.bams.split(','))]

    split_bams(cnf, samples, cnf.vcf_fpath)
    info('Done.')

def main():
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(
        usage='Usage: ' + basename(__file__) +
              ' --bed BED_file --bam BAM_file -g hg19 -o Output_BEDGRAPH_file '
              '--work-dir work_directory --chr chromosome')
    parser.add_option('-o', dest='output_dir')
    parser.add_option('--samples', dest='sample_names')
    parser.add_option('--bams', dest='bams')
    parser.add_option('--vcf', dest='vcf_fpath')
    parser.add_option('--chr', dest='chrom')
    parser.add_option('--bed', dest='bed', help='BED file.')
    parser.add_option('-g', '--genome', dest='chr_len_fpath', help='File with chromosomes lengths.')
    parser.add_option('--work-dir', dest='work_dir', help='Work directory.')
    (opts, args) = parser.parse_args(sys.argv[1:])

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    if not cnf.output_dir or not cnf.bams:
        critical(parser.usage)

    # Build samples only after the required options are verified;
    # splitting None would raise an AttributeError
    samples = [BaseSample(sample_name, None, bam=bam)
               for (sample_name, bam) in zip(cnf.sample_names.split(','), cnf.bams.split(','))]

    safe_mkdir(cnf.output_dir)
    safe_mkdir(cnf.work_dir)
    get_regions_coverage(cnf, samples)
    info('Done.')

def proc_fastq(cnf, sample, l_fpath, r_fpath):
    if cnf.downsample_to:
        info('Downsampling the reads to ' + str(cnf.downsample_to))
        l_fpath, r_fpath = downsample(cnf, sample.name, l_fpath, r_fpath,
                                      cnf.downsample_to, output_dir=cnf.work_dir, suffix='subset')

    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    bwa = get_system_path(cnf, 'bwa')
    bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
    if not (sambamba and bwa and bammarkduplicates):
        critical('sambamba, BWA, and bammarkduplicates are required to align BAM')

    info()
    info('Aligning reads to the reference')
    bam_fpath = align(cnf, sample, l_fpath, r_fpath, sambamba, bwa,
                      bammarkduplicates, cnf.genome.bwa, cnf.is_pcr)
    bam_fpath = verify_bam(bam_fpath)
    if not bam_fpath:
        critical('Sample ' + sample.name + ' was not aligned successfully.')
    return bam_fpath

def _get_gene_transcripts_id(cnf):
    genes_dict = dict()
    transcripts_dict = dict()

    if not cnf.genome.all_transcripts:
        critical('File with transcripts and genes IDs for ' + cnf.genome.name +
                 ' is not specified! Heatmaps cannot be created.')
    if not verify_file(cnf.genome.all_transcripts):
        critical('File with transcripts and genes IDs for ' + cnf.genome.name + ' at ' +
                 cnf.genome.all_transcripts + ' was not found! Heatmaps cannot be created.')

    info('Getting transcripts IDs and genes IDs from ' + cnf.genome.all_transcripts)

    with open_gzipsafe(cnf.genome.all_transcripts) as f:
        for i, l in enumerate(f):
            if l.startswith('#'):
                continue
            chrom, _, feature, start, end, _, strand, _, props_line = l.replace('\n', '').split('\t')
            if feature != 'transcript':
                continue
            try:
                _prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                                  for t in props_line.split(';') if t.strip())
            except ValueError:
                sys.stderr.write(format_exc())
                sys.stderr.write(l)
                continue  # skip malformed attribute lines instead of failing on the lookups below

            gene_symbol = _rm_quotes(_prop_dict['gene_name'])
            gene_id = _rm_quotes(_prop_dict['gene_id'])
            transcript_id = _rm_quotes(_prop_dict['transcript_id'])
            #gene = Gene(gene_symbol, chrom=chrom, gene_id=gene_id, transcript_id=transcript_id)
            genes_dict[gene_id] = gene_symbol
            transcripts_dict[transcript_id] = gene_symbol

    return genes_dict, transcripts_dict

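# The attribute parsing above assumes GTF-style 'transcript' records; a
# hypothetical example line (abbreviated):
#
#   chr1  HAVANA  transcript  11869  14409  .  +  .  gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1";
#
# from which gene_id, transcript_id and gene_name are pulled out of column 9.
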
def proc_args(argv):
    group1_name = 'Resistant'
    group2_name = 'Sensitive'

    description = 'This script finds genes with mutations present in (almost) all samples in one group ' \
                  'and (almost) absent in the other group ' \
                  '(default group names: Resistant vs Sensitive). Input is PASS.txt files from bcbio-postproc.'
    parser = OptionParser(description=description)
    parser.add_option('-n', '--num-samples-limit', dest='ns', default=1, type=int,
                      help='For each reported gene: max number of samples WITHOUT the gene in group1, '
                           'max number of samples WITH the gene in group2')
    (opts, args) = parser.parse_args(argv)

    if len(args) == 0:
        critical('No PASS.txt files provided to input.')
    variants_fpaths = [fpath for fpath in args if file_exists(fpath)]
    return opts, [group1_name, group2_name], variants_fpaths

def get_chr_lengths_from_seq(seq_fpath):
    chr_lengths = []

    if seq_fpath.endswith('.fai'):
        seq_fpath = splitext(seq_fpath)[0]

    if verify_file(seq_fpath + '.fai', silent=True):
        info('Reading genome index file (.fai) to get chromosome lengths')
        with open(adjust_path(seq_fpath + '.fai'), 'r') as handle:
            for line in handle:
                line = line.strip()
                if line:
                    # keep lengths as ints, consistent with the FASTA branch below
                    chrom, length = line.split()[0], int(line.split()[1])
                    chr_lengths.append((chrom, length))

    elif verify_file(seq_fpath, silent=True):
        info('Reading genome sequence (.fa) to get chromosome lengths')
        with open(adjust_path(seq_fpath), 'r') as handle:
            from Bio import SeqIO
            reference_records = SeqIO.parse(handle, 'fasta')
            for record in reference_records:
                chrom = record.id
                chr_lengths.append((chrom, len(record.seq)))

    else:
        critical('Can\'t find ' + seq_fpath + ' and ' + seq_fpath + '.fai')

    return chr_lengths

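# Usage sketch (hypothetical path): the function accepts either the FASTA or
# its .fai index and returns (chrom, length) pairs:
#
#   for chrom, length in get_chr_lengths_from_seq('/refs/hg38.fa'):
#       sys.stdout.write(chrom + '\t' + str(length) + '\n')
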
def annotate_target(cnf, target_bed):
    output_fpath = intermediate_fname(cnf, target_bed, 'ann')
    if not cnf.genome.bed_annotation_features:
        return output_fpath
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    features_bed = verify_bed(cnf.genome.bed_annotation_features, is_critical=True,
                              description='bed_annotation_features in system config')

    # annotate_bed_py = get_system_path(cnf, 'python', join('tools', 'bed_processing', 'annotate_bed.py'))
    # bedtools = get_system_path(cnf, 'bedtools')

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical('Error: annotate_bed.py not found in PATH, please install TargQC.')

    cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} -g {cnf.genome.name} ' \
              '-o {output_fpath} --canonical'.format(**locals())
    # cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} --reference {features_bed} ' \
    #           '--genome {cnf.genome.name} --sys-cnf {cnf.sys_cnf} --run-cnf {cnf.run_cnf} ' \
    #           '-o {output_fpath}'.format(**locals())
    call(cnf, cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = remove_comments(cnf, output_fpath)
    return output_fpath

def find_fastq_pairs_by_sample_names(fastq_fpaths, sample_names):
    fastq_by_sn = OrderedDict()

    for sn in sample_names:
        sn_fastq_fpaths = sorted([f for f in fastq_fpaths if basename(f).startswith(sn + '_R')])
        if len(sn_fastq_fpaths) == 0:
            err('Error: no fastq found for ' + sn)
            fastq_by_sn[sn] = None
        elif len(sn_fastq_fpaths) > 2:
            critical('Error: more than 2 fastq files starting with ' + sn + '_R: ' +
                     ', '.join(sn_fastq_fpaths))
        elif len(sn_fastq_fpaths) == 1:
            warn('Warning: only single fastq file is found for ' + sn + '. Treating as single reads.')
            fastq_by_sn[sn] = [verify_file(sn_fastq_fpaths[0],
                                           description='sn_fastq_fpaths[0] for ' + str(sn)), None]
        else:
            fastq_by_sn[sn] = [verify_file(fpath, description='fpath from sn_fastq_fpaths for ' + str(sn))
                               for fpath in sn_fastq_fpaths]

    return fastq_by_sn

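# The pairing relies on the common "<sample>_R1"/"<sample>_R2" naming
# convention. A hypothetical example:
#
#   fastq_by_sn = find_fastq_pairs_by_sample_names(
#       ['/data/S1_R1.fq.gz', '/data/S1_R2.fq.gz', '/data/S2_R1.fq.gz'],
#       ['S1', 'S2'])
#   # -> {'S1': [<S1_R1>, <S1_R2>], 'S2': [<S2_R1>, None]}  (S2 treated as single reads)
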
def run_vcf2txt_vardict2mut_for_samples(cnf, var_samples, output_dirpath, vcf2txt_out_fpath,
                                        caller_name=None, threads_num=1):
    threads_num = min(len(var_samples), cnf.threads)
    info('Number of threads for filtering: ' + str(threads_num))

    safe_mkdir(output_dirpath)

    vcf_fpath_by_sample = {s.name: s.anno_vcf_fpath for s in var_samples}
    res = run_vcf2txt(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath)
    if not res:
        err('vcf2txt run returned non-0')
        return None

    # vardict2mut_py = get_script_cmdline(cnf, 'python', join('scripts', 'post', 'vardict2mut.py'))
    # if not vardict2mut_py:
    #     critical('vardict2mut_py not found')

    info('Running vardict2mut')
    res = run_vardict2mut(cnf, vcf2txt_out_fpath,
                          add_suffix(vcf2txt_out_fpath, variant_filtering.mut_pass_suffix))
    if not res:
        critical('vardict2mut.py run returned non-0')
    mut_fpath = res
    mut_fpath = convert_gpfs_path_to_url(mut_fpath)
    info()
    info('Done filtering with vcf2txt/vardict2mut, saved to ' + str(mut_fpath))
    return mut_fpath

def read_samples(sample2bam_fpath):
    bam_fpaths = []
    sample_names = []
    bad_bam_fpaths = []

    info('Reading sample info from ' + sample2bam_fpath)
    with open(sample2bam_fpath) as f:
        for l in f:
            if l.startswith('#'):
                continue
            l = l.replace('\n', '')
            if not l:
                continue
            if len(l.split('\t')) == 2:
                sample_name, bam_fpath = l.split('\t')
            else:
                sample_name, bam_fpath = None, l

            # Collect all bad BAMs first so they are reported together,
            # instead of aborting on the first bad one
            if not verify_bam(bam_fpath):
                bad_bam_fpaths.append(bam_fpath)
                continue
            bam_fpaths.append(bam_fpath)

            if sample_name is None:
                sample_name = basename(splitext(bam_fpath)[0])
                if sample_name.endswith('-ready'):
                    sample_name = sample_name.split('-ready')[0]
            sample_names.append(sample_name)
            info(sample_name + ': ' + bam_fpath)

    if bad_bam_fpaths:
        critical('BAM files cannot be found, empty or not BAMs: ' + ', '.join(bad_bam_fpaths))

    return sample_names, bam_fpaths

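# Expected input format (hypothetical sample2bam.tsv): either a bare BAM path
# per line, or a tab-separated "<sample_name>\t<bam_path>" pair; '#'-prefixed
# lines are skipped, and names are derived from filenames when omitted:
#
#   #sample    bam
#   sample1    /data/sample1-ready.bam
#   /data/sample2.bam
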
from contextlib import contextmanager

@contextmanager
def tx_tmpdir(base_dir, rollback_dirpath):
    """Context manager to create and remove a transactional temporary directory.
    """
    # tmp_dir_base = join(base_dir, 'tx', str(uuid.uuid4()))
    # unique_attempts = 0
    # while os.path.exists(tmp_dir_base):
    #     if unique_attempts > 5:
    #         break
    #     tmp_dir_base = join(base_dir, 'tx', str(uuid.uuid4()))
    #     time.sleep(1)
    #     unique_attempts += 1

    # if base_dir is not None:
    #     tmp_dir_base = os.path.join(base_dir, "tx")
    # else:
    #     tmp_dir_base = os.path.join(os.getcwd(), "tx")

    if exists(rollback_dirpath):
        critical(rollback_dirpath + ' already exists')

    tmp_dir = tempfile.mkdtemp(dir=base_dir)  # mkdtemp already creates the directory
    try:
        yield tmp_dir
    finally:
        if tmp_dir and exists(tmp_dir):
            os.rename(tmp_dir, rollback_dirpath)

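# Usage sketch (hypothetical paths and worker): results are written into a
# scratch directory that is only renamed to the final path when the block
# exits, so a half-written directory never appears under the final name:
#
#   with tx_tmpdir('/work', '/work/coverage') as tx_dir:
#       write_coverage_reports(tx_dir)   # hypothetical worker function
#   # after the block, the temporary dir has been moved to /work/coverage
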
def proc_args(argv):
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls',
                      help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        parser.print_usage()
        sys.exit(1)

    if len(args) == 1 and not args[0].endswith('.bam'):
        sample_names, bam_fpaths = read_samples(
            verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict()
        for s, b in zip(sample_names, bam_fpaths):
            bam_by_sample[s] = b
    else:
        bam_by_sample = find_bams(args)

    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
        for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info('  ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided in the sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir

def get_system_path(cnf, interpreter_or_name, name=None, extra_warning='',
                    suppress_warn=False, is_critical=False):
    """ "name" can be:
        - a key in the system config (system_info.yaml)
        - a relative path in the project (e.g. external/...)
        - anything in the system PATH
    """
    interpreter = interpreter_or_name
    if name is None:
        name = interpreter_or_name
        interpreter = None

    if interpreter:
        if interpreter == 'java':
            return get_java_tool_cmdline(cnf, name, extra_warning, suppress_warn,
                                         is_critical=is_critical)
        return get_script_cmdline(cnf, interpreter, name, extra_warning=extra_warning,
                                  suppress_warn=suppress_warn, is_critical=is_critical)

    # IN SYSTEM CONFIG?
    if cnf and (cnf.resources is not None
                and name.lower() in cnf.resources
                and 'path' in cnf.resources[name.lower()]):
        tool_path = cnf.resources[name.lower()]['path']
        tool_path = adjust_system_path(tool_path)
        return verify_obj_by_path(tool_path, name, is_critical=is_critical)

    # IN PROJECT ROOT DIR? IN EXTERNAL?
    for dirpath in [code_base_path]:
        tool_path = join(dirpath, name)
        if exists(tool_path):
            return verify_obj_by_path(tool_path, name, is_critical=is_critical)

    # IN PATH?
    tool_path = which(name)
    if tool_path and exists(tool_path):
        return verify_obj_by_path(tool_path, name, is_critical=is_critical)

    msg = (name + ' was not found. You may either specify its path in the system '
           'config, or add it to your PATH environment variable. ' + extra_warning)
    if not suppress_warn:
        err(msg)
    if is_critical:
        critical(msg)
    return None

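# Resolution order illustrated (hypothetical tool names): a plain name is
# looked up in the system config, then the project tree, then PATH; passing
# an interpreter first delegates to an interpreter-prefixed command line:
#
#   samtools = get_system_path(cnf, 'samtools')                          # config / project / PATH
#   script = get_system_path(cnf, 'perl', join('ext_tools', 'x.pl'))     # '<perl> .../x.pl' cmdline
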
def _proc_path(path):
    starts = {'/mnt/Datasets': '/ngs/oncology/datasets',
              '/mnt/HiSeq': '/ngs/oncology/datasets/HiSeq/',
              '/mnt/MiSeq': '/ngs/oncology/datasets/MiSeq/'}
    if not any(path.startswith(s) for s in starts.keys()):
        critical('Error: path ' + path + ' has to start with something from ' + str(starts.keys()))
    for k, v in starts.iteritems():
        path = path.replace(k, v)
    return path

def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        dic = load_yaml(open(fpath), Loader=Loader)
    except:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic

def read_sample_names_from_vcf(vcf_fpath):
    f = open_gzipsafe(vcf_fpath)
    basic_fields = next(
        (l.strip()[1:].split() for l in f if l.strip().startswith('#CHROM')), None)
    f.close()
    if not basic_fields:
        critical('Error: no VCF header in ' + vcf_fpath)
    if len(basic_fields) < 9:
        return []
    return basic_fields[9:]

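# The sample names are the 10th and later columns of the #CHROM header line,
# e.g. (hypothetical):
#
#   #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  sample1  sample2
#
# would yield ['sample1', 'sample2'].
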
def main():
    if len(sys.argv) < 2:  # the script takes exactly one positional argument
        critical('Usage: ' + __file__ + ' path_to.fa')
    seq_fpath = sys.argv[1]
    seq_fpath = verify_file(seq_fpath, is_critical=True)

    chr_lengths = get_chr_lengths_from_seq(seq_fpath)
    for c, l in chr_lengths:
        sys.stdout.write(c + '\t' + str(l) + '\n')

def vcf_one_per_line(cnf, vcf_fpath):
    info('Converting VCF to one-effect-per-line...')
    oneperline_vcf_fpath = intermediate_fname(cnf, vcf_fpath, 'opl')
    vcfoneperline_cmline = get_script_cmdline(cnf, 'perl', join('ext_tools', 'vcfOnePerLine.pl'))
    call(cnf, vcfoneperline_cmline, oneperline_vcf_fpath, stdin_fpath=vcf_fpath, exit_on_error=False)
    info()

    if not verify_file(oneperline_vcf_fpath):
        critical('Error: vcf_one_per_line didn\'t generate output file.')
    return oneperline_vcf_fpath

def main(args):
    if len(args) < 2:
        critical('Usage: ' + __file__ + ' InputRootDirectory OutputRootDirectory [Build=hg38]')
        sys.exit(1)

    inp_root = adjust_path(args[0])
    out_root = adjust_path(args[1])

    build = 'hg38'
    if len(args) >= 3:
        build = args[2]
    chain_fpath = chains[build.lower()]

    for inp_dirpath, subdirs, files in os.walk(inp_root):
        for fname in files:
            if fname.endswith('.bed'):
                inp_fpath = adjust_path(join(inp_dirpath, fname))
                print inp_fpath + ': ' + str(count_bed_cols(inp_fpath)) + ' columns'

                out_dirpath = adjust_path(join(out_root, relpath(inp_dirpath, inp_root)))
                safe_mkdir(out_dirpath)
                out_fpath = adjust_path(join(out_dirpath, fname))
                unlifted_fpath = adjust_path(join(out_dirpath, fname + '.unlifted'))

                # If columns 7-8 are not integers, cut the BED down to the first
                # four columns before lifting over; otherwise lift the file as-is
                lift_input = inp_fpath
                cmdline = ''
                with open(inp_fpath) as f:
                    fs = f.readline().split('\t')
                try:
                    int(fs[6])
                    int(fs[7])
                except:
                    info('Cutting ' + inp_fpath)
                    cmdline += 'cut -f1,2,3,4 "{inp_fpath}" > __cut; '
                    lift_input = '__cut'

                cmdline += liftover_fpath + ' "{lift_input}" {chain_fpath} "{out_fpath}" "{unlifted_fpath}"'
                cmdline = cmdline.format(**locals())
                info(cmdline)
                os.system(cmdline)

                verify_file(out_fpath)
                if isfile(unlifted_fpath):
                    if getsize(unlifted_fpath) <= 0:
                        os.remove(unlifted_fpath)
                    else:
                        err('Some records were unlifted and saved to ' + unlifted_fpath)

def vcf_merge(cnf, vcf_fpaths, combined_vcf_fpath):
    vcf_merge_cmdline = get_system_path(cnf, join('ext_tools', 'vcftools', 'scripts', 'vcf-merge'))
    if vcf_merge_cmdline is None:
        critical('No vcf-merge in path')

    cmdline = vcf_merge_cmdline + ' ' + ' '.join(vcf_fpaths)

    perl_module_dirpath = abspath(join(dirname(__file__), pardir, pardir, 'ext_modules', 'perl_modules'))
    os.environ['PERL5LIB'] = perl_module_dirpath

    res = call(cnf, cmdline, combined_vcf_fpath, exit_on_error=False)
    if not res:
        return None
    return combined_vcf_fpath

def count_mutations_freq(cnf, samples, vcf2txt_fpaths, suffix=variant_filtering.mut_pass_suffix):
    count_in_cohort_by_vark = defaultdict(int)
    total_varks = 0
    total_duplicated_count = 0
    total_records_count = 0

    for sample_i, (sample, vcf2txt_fpath) in enumerate(zip(samples, vcf2txt_fpaths)):
        met_in_this_sample = set()
        processed_fpath = add_suffix(vcf2txt_fpath, suffix)
        if not isfile(processed_fpath):
            critical(processed_fpath + ' does not exist; please, rerun VarFilter.')
        with open(processed_fpath) as f:
            for line_i, l in enumerate(f):
                if line_i > 0:  # skip the header line
                    fs = l.replace('\n', '').split()
                    if not fs:
                        continue
                    chrom, pos, db_id, ref, alt = fs[1:6]
                    vark = ':'.join([chrom, pos, ref, alt])
                    if vark in met_in_this_sample:
                        if suffix == variant_filtering.mut_pass_suffix:
                            total_duplicated_count += 1
                    else:
                        count_in_cohort_by_vark[vark] += 1
                        if suffix == variant_filtering.mut_pass_suffix:
                            met_in_this_sample.add(vark)
                            total_varks += 1
                    total_records_count += 1

    if suffix == variant_filtering.mut_pass_suffix:
        info('Counted ' + str(len(count_in_cohort_by_vark)) + ' different variants ' +
             'in ' + str(len(samples)) + ' samples with total ' + str(total_varks) + ' records')
        info('Duplicated varks for this sample: ' + str(total_duplicated_count) +
             ' out of total ' + str(total_records_count) + ' records. '
             'Duplicates were not counted into cohort frequencies.')

    freq_in_cohort_by_vark = dict()
    max_freq = 0
    for vark, count in count_in_cohort_by_vark.items():
        f = float(count) / len(samples)
        freq_in_cohort_by_vark[vark] = f
        if f > max_freq:
            max_freq = f
    if suffix == variant_filtering.mut_pass_suffix:
        info('Maximum frequency in cohort is ' + str(max_freq))

    return freq_in_cohort_by_vark, count_in_cohort_by_vark

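# A "vark" is a variant key of the form 'chrom:pos:ref:alt', e.g.
# 'chr7:140453136:A:T' (hypothetical), so the cohort frequency of a variant is
# (number of samples carrying that exact variant) / (number of samples).
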
def get_exac_dir(cnf):
    if cnf.genome.name.startswith('hg19'):
        cnf.genome.name = 'hg19'
    elif cnf.genome.name.startswith('hg38'):
        cnf.genome.name = 'hg38'
    else:
        critical('Genome ' + str(cnf.genome.name) + ' is not supported. '
                 'Supported genomes: hg19, hg19-noalt, hg38, hg38-noalt.')
    exac_dir = join(exac_data_dir, cnf.genome.name)  # temporary dir
    return exac_dir

def find_raw_fastq(self, get_regexp, suf='R1'):
    fastq_fpaths = [
        join(self.source_fastq_dirpath, fname)
        for fname in os.listdir(self.source_fastq_dirpath)
        if re.match(get_regexp(self, suf), fname)]
    fastq_fpaths = sorted(fastq_fpaths)
    if not fastq_fpaths:
        critical('Error: no fastq files for the sample ' + self.name +
                 ' were found inside ' + self.source_fastq_dirpath)
    info(self.name + ': found raw fastq files ' + ', '.join(fastq_fpaths))
    return fastq_fpaths

def bgzip_and_tabix(cnf, vcf_fpath, tabix_parameters='', **kwargs):
    gzipped_fpath = vcf_fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if cnf.reuse_intermediate and \
            file_exists(gzipped_fpath) and \
            file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed VCF and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing VCF file, writing ' + gzipped_fpath + '(.tbi)')

    bgzip = get_system_path(cnf, 'bgzip')
    tabix = get_system_path(cnf, 'tabix')
    if not bgzip:
        err('Cannot index VCF because bgzip is not found in PATH or ' + cnf.sys_cnf)
    if not tabix:
        err('Cannot index VCF because tabix is not found in PATH or ' + cnf.sys_cnf)
    if not bgzip or not tabix:  # both tools are needed to compress and index
        return vcf_fpath

    retrying = False
    while True:
        if isfile(tbi_fpath):
            os.remove(tbi_fpath)
        if isfile(vcf_fpath):
            if isfile(gzipped_fpath):
                os.remove(gzipped_fpath)
            info('BGzipping VCF')
            cmdline = '{bgzip} {vcf_fpath}'.format(**locals())
            call(cnf, cmdline, None, **kwargs)
        else:
            if not verify_file(gzipped_fpath):
                err('Neither uncompressed ' + vcf_fpath + ' nor ' + gzipped_fpath + ' exist')
                return None

        info('Tabixing VCF')
        cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())
        kwargs['exit_on_error'] = retrying  # fail hard only on the second attempt
        call(cnf, cmdline, **kwargs)

        if isfile(gzipped_fpath + '.tbi'):
            break
        if retrying:
            critical('Cannot tabix ' + vcf_fpath)
        if not isfile(vcf_fpath):
            call(cnf, 'gunzip ' + gzipped_fpath, None)
        retrying = True

    return gzipped_fpath

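# Usage sketch (hypothetical path): compress and index in one step; the
# original VCF is replaced by its .gz + .tbi pair:
#
#   gz_fpath = bgzip_and_tabix(cnf, '/work/sample.anno.vcf', tabix_parameters='-p vcf')
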
def main():
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--bam'], dict(dest='bam', help='path to the BAM file')),
            (['--bed', '--capture', '--amplicons'], dict(dest='bed', help='capture panel/amplicons')),
            (['--pcr'], dict(dest='pcr', action='store_true',
                             help='deduplication was not performed, thus do not try to dedup')),
        ],
        required_keys=['bam'],
        file_keys=['bam', 'bed'],
        key_for_sample_name='bam',
        proc_name=BCBioStructure.qualimap_name)

    index_bam(cnf, cnf.bam)

    info('Using alignment ' + cnf.bam)
    bed = ''
    if cnf.bed:
        bed = ' -gff ' + cnf.bed + ' '
        info('Using amplicons/capture panel ' + cnf.bed)

    qualimap = get_system_path(cnf, 'qualimap', is_critical=True)
    if not qualimap:
        critical('Cannot find qualimap')

    info()

    mem_m = get_qualimap_max_mem(cnf.bam)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = ' --java-mem-size=' + mem

    cmdline = ('{qualimap} bamqc --skip-duplicated -nt ' + str(cnf.threads) + mem_cmdl +
               ' -nr 5000 -bam {cnf.bam} -outdir {cnf.output_dir} {bed} -c -gd HUMAN').format(**locals())

    report_fpath = join(cnf.output_dir, 'qualimapReport.html')

    call(cnf, cmdline, output_fpath=report_fpath, stdout_to_outputfile=False,
         env_vars=dict(DISPLAY=None))
    info('Qualimap report: ' + str(report_fpath))

def proc_opts():
    parser = OptionParser(description='')
    # --debug must be defined, otherwise accessing opts.debug below raises an AttributeError
    parser.add_option('--debug', dest='debug', action='store_true', default=False)
    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) < 1:
        critical('First argument should be a root datasets dir')
    # if len(args) < 2:
    #     info('No dataset path specified, assuming it is the current working directory')
    #     dataset_dirpath = adjust_path(os.getcwd())

    # jira_url = args[0]

    root_dirpath = verify_dir(args[0], is_critical=True, description='Dataset directory')
    # e.g. /ngs/oncology/datasets/hiseq/150521_D00443_0159_AHK2KTADXX

    info(' '.join(sys.argv))
    return root_dirpath

def process_one(cnf, output_dir, bam_fpath, features_bed, features_no_genes_bed):
    sample = TargQC_Sample(cnf.sample, output_dir, bed=cnf.bed, bam=cnf.bam)
    sample.l_fpath = cnf.l_fpath
    sample.r_fpath = cnf.r_fpath

    # if not sample.bam and sample.l_fpath and sample.r_fpath:
    #     sample.bam = proc_fastq(cnf, sample, verify_file(cnf.l_fpath), verify_file(cnf.r_fpath))

    if not bam_fpath:
        critical(sample.name + ': BAM file is required.')
    info('Using alignment ' + sample.bam)

    target_bed = verify_file(cnf.bed, is_critical=True) if cnf.bed else None
    bam_fpath = verify_file(sample.bam, is_critical=True)
    index_bam(cnf, bam_fpath)

    gene_keys_list = None
    if cnf.prep_bed is not False:
        info('Preparing the BED file.')
        features_bed, features_no_genes_bed, target_bed, seq2c_bed = prepare_beds(
            cnf, features_bed, target_bed)
        gene_keys_set, gene_keys_list, target_bed, features_bed, features_no_genes_bed = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)
    else:
        info('The BED file is ready, skipping preparing.')
        gene_keys_set, gene_keys_list, _, _, _ = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)

    picard_ins_size_hist(cnf, sample, bam_fpath, output_dir)

    avg_depth, gene_by_name_and_chrom, reports = make_targqc_reports(
        cnf, output_dir, sample, bam_fpath, features_bed, features_no_genes_bed,
        target_bed, gene_keys_list)

    # if cnf.extended:
    #     try:
    #         info('Generating flagged regions report...')
    #         flagged_report = generate_flagged_regions_report(
    #             cnf, cnf.output_dir, sample, avg_depth, gene_by_name_and_chrom)
    #         if not flagged_report:
    #             err('Flagged regions report was not generated')
    #             err()
    #     except:
    #         err(format_exc())

    return reports

def proc_args(argv):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--bam'], dict(dest='bam')),
        ],
        required_keys=['bam'],
        file_keys=['bam'],
    )

    # Report missing inputs before validating the genome resources
    if not cnf.bam:
        critical('No bam file provided to input')
    if not cnf.genome:
        critical('Please, specify the --genome option (e.g. --genome hg19)')

    check_genome_resources(cnf)
    return cnf

def _preprocess(cnf, bed_fpath, work_dirpath, chrom_order):
    bed_params = BedParams()
    output_fpath = __intermediate_fname(work_dirpath, bed_fpath, 'prep')

    info('preprocessing: ' + bed_fpath + ' --> ' + output_fpath)
    with open(bed_fpath, 'r') as in_f:
        with open(output_fpath, 'w') as out_f:
            for line in in_f:
                if line.startswith('#') or line.startswith('track') or line.startswith('browser'):  # header
                    bed_params.header.append(line if line.startswith('#') else '#' + line)
                else:
                    cur_ncn = BedParams.calc_n_cols_needed(line)
                    if bed_params.n_cols_needed is not None and cur_ncn != bed_params.n_cols_needed:
                        critical('number and type of columns should be the same on all lines!')
                    bed_params.n_cols_needed = cur_ncn

                    if line.startswith('chr'):
                        if bed_params.GRCh_names is not None and bed_params.GRCh_names:
                            critical('mixing of GRCh and hg chromosome names!')
                        bed_params.GRCh_names = False
                        if line.startswith('chrMT'):  # common misprint, correcting chrMT --> chrM
                            processed_line = '\t'.join(['chrM'] + line.split('\t')[1:])
                        else:
                            processed_line = line
                    elif line.split('\t')[0] in BedParams.GRCh_to_hg:  # GRCh chr names
                        if bed_params.GRCh_names is not None and not bed_params.GRCh_names:
                            critical('mixing of GRCh and hg chromosome names!')
                        bed_params.GRCh_names = True
                        processed_line = '\t'.join([BedParams.GRCh_to_hg[line.split('\t')[0]]] +
                                                   line.split('\t')[1:])
                    else:
                        critical('incorrect chromosome name!')

                    entries = processed_line.strip().split('\t')
                    chrom = entries[0]
                    start = int(entries[1])
                    end = int(entries[2])
                    r = Region(chrom, chrom_order.get(chrom), start, end)
                    if r.is_control():
                        r.set_symbol(entries[3] if len(entries) > 3
                                     else '{0}:{1}-{2}'.format(chrom, start, end))
                        r.rest = entries[4:] if len(entries) > 4 else None
                        bed_params.controls.append(r)
                    else:
                        out_f.write(processed_line)

    return output_fpath, bed_params

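# Chromosome-name normalization illustrated (hypothetical lines): 'chrMT' is
# corrected to 'chrM', and GRCh-style names are mapped through
# BedParams.GRCh_to_hg (e.g. '1' -> 'chr1', 'MT' -> 'chrM'); mixing the two
# naming styles within one BED file is rejected.
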