def local_symlink(src, dst):
    if os.path.exists(dst):
        try:
            os.unlink(dst)
        except Exception, e:
            err('Cannot remove link ' + dst + ': ' + str(e))
            return None
    # Presumably the link should be (re)created once the stale one is removed;
    # the creation step was missing here.
    os.symlink(src, dst)
def main():
    cnf, samples, bed_fpath, output_dir = proc_args(sys.argv)
    info('Processing ' + str(len(samples)) + ' samples')

    if cnf.prep_bed is not False:
        if not bed_fpath:
            info('No input BED is specified, using CDS instead from ' + str(cnf.genome.cds))
            bed_fpath = verify_bed(cnf.genome.cds, 'CDS bed file for ' + cnf.genome.name)

        seq2c_bed_fname = basename(bed_fpath)

        bed_cols = count_bed_cols(bed_fpath)
        if bed_cols < 4:
            check_genome_resources(cnf)
            _, _, _, bed_fpath = prepare_beds(cnf, None, None, bed_fpath)

        try:
            copyfile(bed_fpath, join(output_dir, seq2c_bed_fname))
        except OSError:
            err(format_exc())
            info()
        else:
            info('Seq2C bed file is saved in ' + join(output_dir, seq2c_bed_fname))

        bed_fpath = verify_bed(bed_fpath, is_critical=True, description='Input BED file')

    info('Using target ' + bed_fpath)

    run_seq2c(cnf, output_dir, samples, bed_fpath, cnf.is_wgs)
def leave_main_sample(cnf, vcf_fpath, samplename):
    index = get_sample_column_index(vcf_fpath, samplename)
    if index is None:
        return vcf_fpath

    # def _f1(rec):
    #     rec.samples = [sample_name]
    #     return rec
    # info('Keeping SAMPLE only for the first sample (' + samplename + ')')
    # vcf_fpath = iterate_vcf(cnf, vcf_fpath, _f1, suffix=sample_name)
    # out_fpath = extract_sample(cnf, vcf_fpath, sample_name)
    # info()

    def _f(line, i):
        if line and (line.startswith('#CHROM') or line[0] != '#'):
            ts = line.split('\t')
            return '\t'.join(ts[:9] + [ts[9 + index]])
        return line

    vcf_fpath = iterate_file(cnf, vcf_fpath, _f, suffix='1sm')
    if not verify_file(vcf_fpath):
        err('Error: leave_main_sample did not generate an output file.')
        return None
    return vcf_fpath
def _parse_picard_dup_report(dup_report_fpath):
    with open(dup_report_fpath) as f:
        for l in f:
            if l.startswith('## METRICS CLASS'):
                l_NEXT = None
                ind = None
                try:
                    l_LIBRARY = next(f)
                    if l_LIBRARY.startswith('LIBRARY'):
                        ind = l_LIBRARY.strip().split().index('PERCENT_DUPLICATION')
                        l_NEXT = next(f)
                        while l_NEXT.startswith(' ') or l_NEXT.startswith('\t'):
                            l_NEXT = next(f)
                except StopIteration:
                    pass
                else:
                    # ind may legitimately be 0, so compare against None explicitly
                    if l_NEXT and ind is not None:
                        fields = l_NEXT.split()
                        if fields[0] == 'Unknown':
                            ind += 1
                        if len(fields) > ind:
                            dup_rate = 1.0 * float(fields[ind])
                            return dup_rate
    err('Error: cannot read duplication rate from ' + dup_report_fpath)
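# Hypothetical illustration (not part of the pipeline): a minimal stub of a Picard
# MarkDuplicates metrics file showing the lines the parser above looks for. The header
# and value columns are a truncated sketch of the real report; names and numbers are made up.
def _example_parse_dup_report():
    import tempfile
    content = ('## METRICS CLASS\tpicard.sam.DuplicationMetrics\n'
               'LIBRARY\tREAD_PAIRS_EXAMINED\tPERCENT_DUPLICATION\tESTIMATED_LIBRARY_SIZE\n'
               'lib1\t100000\t0.0123\t2000000\n')
    with tempfile.NamedTemporaryFile('w', suffix='.metrics', delete=False) as tmp:
        tmp.write(content)
    return _parse_picard_dup_report(tmp.name)  # expected to return 0.0123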
def do_handle_oserror(cmdl, out_fpath=None, stderr_dump=None, max_number_of_tries=20):
    res_ = None
    counter = 0
    slept = 0
    timeout = 30
    limit = 60 * 10
    while True:
        try:
            res_ = do(cmdl, out_fpath, stderr_dump=stderr_dump)
            break
        except OSError, e:
            counter += 1
            if counter >= max_number_of_tries:
                break
            if not silent:  # `silent` is assumed to be the module-level verbosity flag
                err('OSError: ' + str(e))
                err()
            if 'Cannot allocate memory' not in str(e):
                break
            else:
                if slept >= limit:
                    return None
                else:
                    if not silent:
                        err('Waiting ' + str(timeout) + ' seconds...')
                    time.sleep(timeout)
                    slept += timeout
                    if not silent:
                        err('Retrying...')
                        err()
    # Presumably the result of the successful call should be returned;
    # an explicit return was missing here.
    return res_
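# Hedged usage sketch: retrying a memory-hungry external command through the wrapper
# above. The command line and paths are placeholders, not real pipeline calls.
def _example_retry_sort(bed_fpath='/path/to/regions.bed', out_fpath='/path/to/sorted.bed'):
    cmdl = 'sort -k1,1 -k2,2n ' + bed_fpath
    return do_handle_oserror(cmdl, out_fpath, max_number_of_tries=5)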
def extract_graphs(samples):  # Sample(name, fastq_fpath)
    parsed_data = OrderedDict((h, list()) for h in _header)
    for s in samples:
        if verify_file(s.fastqc_html_fpath, 's.fastqc_html_fpath for ' + s.name):
            with open(s.fastqc_html_fpath) as source_file_obj:
                html = source_file_obj.read()
            parts = [p.split('</div>')[0] for p in html.split('<div class="module">')[1:]]
            # <h2><img/></h2><table></table></div> OR <h2><img/></h2><p><img/></p></div>
            for i, part in enumerate(parts):
                # info('Parsing ' + _header[i])
                # info(str(part))
                table, graph = '', ''
                ok_img = '<img ' + part.split('"><img')[1].split('>')[0] + '>'
                if '<table>' in part:
                    table = '<table>' + part.split('<table>')[1]
                if '<p><img ' in part:
                    graph = '<img ' + part.split('<p><img')[1].split('>')[0] + '>'
                parsed_data[_header[i]].append([s.name, ok_img, graph, table])
            # module_divs = soup.find_all("div", class_="module")
            # _sort_graph_by_type(parsed_data, module_divs, s.name)
            # soup.decompose()
        else:
            err('Could not find fastqc html fpath for sample ' + s.name + ': ' + str(s.fastqc_html_fpath))
    return parsed_data
def __parse_id(url):
    t = url.split('NGSG-')
    if len(t) == 1:
        err('Incorrect JIRA URL ' + url)
        return None
    case_id = t[1].split('?')[0]
    return case_id
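# Illustration only: the case id is whatever follows "NGSG-" in the JIRA URL, up to
# the query string. The URL below is made up.
def _example_parse_jira_id():
    return __parse_id('https://jira.example.com/browse/NGSG-1234?filter=reports')  # -> '1234'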
def index_vcf(cnf, sample_name, filt_vcf_fpath, caller_name=None):
    if cnf is None:
        global glob_cnf
        cnf = glob_cnf

    info()
    info(sample_name + ((', ' + caller_name) if caller_name else '') + ': indexing')

    # for fpath in [pass_vcf_fpath, filt_vcf_fpath]:
    #     if not cnf.reuse_intermediate and not verify_file(fpath, silent=True):
    #         err(fpath + ' does not exist - cannot IGV index')
    #     else:
    #         if cnf.reuse_intermediate and verify_file(fpath + '.idx', silent=True):
    #             info('Reusing existing ' + fpath + '.idx')
    #         else:
    #             igvtools_index(cnf, fpath)

    if not cnf.reuse_intermediate and not verify_file(filt_vcf_fpath, silent=True):
        err(filt_vcf_fpath + ' does not exist - cannot gzip and tabix')
    else:
        if cnf.reuse_intermediate and verify_file(filt_vcf_fpath + '.gz', silent=True) \
                and verify_file(filt_vcf_fpath + '.gz.tbi', silent=True):
            info(filt_vcf_fpath + '.gz and .gz.tbi exist; reusing')
        else:
            bgzip_and_tabix(cnf, filt_vcf_fpath)
def main():
    args = sys.argv[1:]
    if len(args) < 2:
        sys.exit('Usage: ' + __file__ + ' bam sambamba cmdline')

    bam = args[0]
    sambamba = args[1]
    args = args[2:]
    args = [a.replace('__QUOTE__', '"').replace('""', '"') for a in args]
    err(str(args))

    index_bam(bam, sambamba)
    err()

    args = [sambamba] + args
    cmdl = ' '.join((('"' + a + '"') if ' ' in a and not a[0] == '"' else a) for a in args)
    err(cmdl)
    ret_code = subprocess.call(cmdl, shell=True)
    if ret_code != 0:
        err()
        err('Ret code = ' + str(ret_code) + ', retrying...')
        indexed_bam = bam + '.bai'
        if isfile(indexed_bam):
            os.remove(indexed_bam)
        index_bam(bam, sambamba)
        subprocess.call(cmdl, shell=True)
def launch_bedcoverage_hist(work_dir, bed, bam, chr_lengths_fpath,
                            bedcov_output_fpath=None, bedtools='bedtools'):
    if not bedcov_output_fpath:
        bedcov_output_fpath = join(
            work_dir,
            splitext_plus(basename(bed))[0] + '__' +
            splitext_plus(basename(bam))[0] + '_bedcov_output.txt')

    if bam.endswith('bam'):
        bam = bam_to_bed_nocnf(bam, bedtools)
    verify_file(bam, is_critical=True, description='BAM to BED conversion result')

    v = bedtools_version(bedtools)
    if v and v >= 24:
        cmdline = '{bedtools} coverage -sorted -g {chr_lengths_fpath} -a {bed} -b {bam} -hist'.format(**locals())
    else:
        cmdline = '{bedtools} coverage -a {bam} -b {bed} -hist'.format(**locals())
    cmdline += ' > ' + bedcov_output_fpath
    info(cmdline)
    os.system(cmdline)
    res = verify_file(bedcov_output_fpath)
    if res:
        info('Done, saved to ' + bedcov_output_fpath)
    else:
        err('Error, result is non-existent or empty')
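# Hedged usage sketch with placeholder paths: chr_lengths_fpath is the bedtools
# "genome" file (chromosome<TAB>length) that newer bedtools versions need for -sorted mode.
def _example_bedcov_hist(work_dir='/path/to/work'):
    launch_bedcoverage_hist(work_dir,
                            bed='/path/to/target.bed',
                            bam='/path/to/sample.bam',
                            chr_lengths_fpath='/path/to/hg19.genome')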
def find_fastq_pairs_by_sample_names(fastq_fpaths, sample_names):
    fastq_by_sn = OrderedDict()

    for sn in sample_names:
        sn_fastq_fpaths = sorted([f for f in fastq_fpaths if basename(f).startswith(sn + '_R')])
        if len(sn_fastq_fpaths) == 0:
            err('Error: no fastq found for ' + sn)
            fastq_by_sn[sn] = None
        elif len(sn_fastq_fpaths) > 2:
            critical('Error: more than 2 fastq files starting with ' + sn + '_R: ' + ', '.join(sn_fastq_fpaths))
        elif len(sn_fastq_fpaths) == 1:
            warn('Warning: only a single fastq file was found for ' + sn + '. Treating as single reads.')
            fastq_by_sn[sn] = [
                verify_file(sn_fastq_fpaths[0], description='sn_fastq_fpaths[0] for ' + str(sn)),
                None]
        else:
            fastq_by_sn[sn] = [
                verify_file(fpath, description='fpath from sn_fastq_fpaths for ' + str(sn))
                for fpath in sn_fastq_fpaths]

    return fastq_by_sn
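# Illustration with made-up file names: pairs are recognized purely by the
# "<sample>_R1"/"<sample>_R2" prefix convention; the files must exist for verify_file() to pass.
def _example_find_fastq_pairs():
    fastqs = ['/data/S1_R1.fastq.gz', '/data/S1_R2.fastq.gz', '/data/S2_R1.fastq.gz']
    return find_fastq_pairs_by_sample_names(fastqs, ['S1', 'S2'])
    # roughly: {'S1': [S1_R1, S1_R2], 'S2': [S2_R1, None]}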
def run_vcf2txt_vardict2mut_for_samples(cnf, var_samples, output_dirpath, vcf2txt_out_fpath,
                                        caller_name=None, threads_num=1):
    threads_num = min(len(var_samples), cnf.threads)
    info('Number of threads for filtering: ' + str(threads_num))

    safe_mkdir(output_dirpath)

    vcf_fpath_by_sample = {s.name: s.anno_vcf_fpath for s in var_samples}
    res = run_vcf2txt(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath)
    if not res:
        err('vcf2txt run returned non-0')
        return None

    # vardict2mut_py = get_script_cmdline(cnf, 'python', join('scripts', 'post', 'vardict2mut.py'))
    # if not vardict2mut_py:
    #     critical('vardict2mut_py not found')

    info('Running vardict2mut')
    res = run_vardict2mut(cnf, vcf2txt_out_fpath,
                          add_suffix(vcf2txt_out_fpath, variant_filtering.mut_pass_suffix))
    if not res:
        critical('vardict2mut.py run returned non-0')
    mut_fpath = res
    mut_fpath = convert_gpfs_path_to_url(mut_fpath)
    info()

    info('Done filtering with vcf2txt/vardict2mut, saved to ' + str(mut_fpath))
    return mut_fpath
def _tracks(cnf, track_fpath, input_fpath):
    if not verify_file(track_fpath):
        return None

    field_name = splitext_plus(basename(track_fpath))[0]

    step_greetings('Intersecting with ' + field_name)

    output_fpath = intermediate_fname(cnf, input_fpath, field_name)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    toolpath = get_system_path(cnf, 'vcfannotate')
    if not toolpath:
        err('WARNING: Skipping annotation with tracks: vcfannotate '
            'executable not found; you probably need to specify the path in the system config, '
            'or load bcbio: . /group/ngs/bin/bcbio-prod.sh')
        return None

    # self.all_fields.append(field_name)

    cmdline = '{toolpath} -b {track_fpath} -k {field_name} {input_fpath}'.format(**locals())

    assert input_fpath
    output_fpath = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                   stdout_to_outputfile=True, overwrite=True)
    if not verify_vcf(output_fpath):
        err('Error: tracks resulted ' + str(output_fpath) + ' for ' + track_fpath)
        return output_fpath

    # Set TRUE or FALSE for tracks
    def proc_line(line, i):
        if field_name in line:
            if not line.startswith('#'):
                fields = line.split('\t')
                info_line = fields[7]
                info_pairs = [attr.split('=') for attr in info_line.split(';')]
                info_pairs = [[pair[0], ('TRUE' if pair[1] else 'FALSE')]
                              if pair[0] == field_name and len(pair) > 1
                              else pair for pair in info_pairs]
                info_line = ';'.join('='.join(pair) if len(pair) == 2 else pair[0]
                                     for pair in info_pairs)
                fields = fields[:7] + [info_line] + fields[8:]
                return '\t'.join(fields)
        return line

    assert output_fpath
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='trk')
    return verify_vcf(output_fpath, is_critical=True)
def index_bam(bam_fpath, sambamba):
    indexed_bam = bam_fpath + '.bai'
    if isfile(indexed_bam):
        return
    # if not isfile(indexed_bam) or getctime(indexed_bam) < getctime(bam_fpath):
    err('Indexing BAM, writing ' + indexed_bam + '...')
    cmdline = '{sambamba} index {bam_fpath}'.format(**locals())
    subprocess.call(cmdline, shell=True)
def count_bed_cols(bed_fpath):
    with open(bed_fpath) as f:
        for l in f:
            if l and l.strip() and not l.startswith('#'):
                return len(l.split('\t'))
    # return len(next(dropwhile(lambda x: x.strip().startswith('#'), open(bed_fpath))).split('\t'))
    err('Empty bed file: ' + bed_fpath)
    return None
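# Hypothetical usage sketch: deciding whether a BED file already carries a gene-name
# column (4th column) before annotation. The path is a placeholder.
def _example_bed_needs_annotation(bed_fpath='/path/to/regions.bed'):
    ncols = count_bed_cols(bed_fpath)
    if ncols is None:
        return False  # empty BED, nothing to annotate
    return ncols < 4  # fewer than 4 columns: gene names are missing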
def get_system_path(cnf, interpreter_or_name, name=None, extra_warning='',
                    suppress_warn=False, is_critical=False):
    """ "name" can be:
        - a key in system_info.yaml
        - a relative path in the project (e.g. external/...)
        - anything in the system PATH
    """
    interpreter = interpreter_or_name
    if name is None:
        name = interpreter_or_name
        interpreter = None

    if interpreter:
        if interpreter == 'java':
            return get_java_tool_cmdline(cnf, name, extra_warning, suppress_warn, is_critical=is_critical)
        return get_script_cmdline(cnf, interpreter, name, extra_warning=extra_warning,
                                  suppress_warn=suppress_warn, is_critical=is_critical)

    # IN SYSTEM CONFIG?
    if cnf and (cnf.resources is not None and name.lower() in cnf.resources
                and 'path' in cnf.resources[name.lower()]):
        tool_path = cnf.resources[name.lower()]['path']
        tool_path = adjust_system_path(tool_path)
        return verify_obj_by_path(tool_path, name, is_critical=is_critical)

    # IN PROJECT ROOT DIR? IN EXTERNAL?
    for dirpath in [code_base_path]:
        tool_path = join(dirpath, name)
        if exists(tool_path):
            return verify_obj_by_path(tool_path, name, is_critical=is_critical)

    # IN PATH?
    tool_path = which(name)
    if tool_path and exists(tool_path):
        return verify_obj_by_path(tool_path, name, is_critical=is_critical)

    msg = (name + ' was not found. You may either specify the path in the system '
           'config, or add it to your PATH environment variable. ' + extra_warning)
    if not suppress_warn:
        err(msg)
    if is_critical:
        critical(msg)
    return None
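# Hedged usage sketches (tool and script names are examples, not guaranteed entries of the
# system config): a plain executable resolved from config/project dir/PATH, and a script
# resolved through the interpreter branch above.
def _example_resolve_tools(cnf):
    bedtools = get_system_path(cnf, 'bedtools')
    some_script = get_system_path(cnf, 'python', join('scripts', 'some_script.py'))
    return bedtools, some_script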
def calculate_coverage_use_grid(cnf, samples, output_dirpath):
    assert len(samples) > 0

    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    chr_len_fpath = get_chr_len_fpath(cnf)

    jobs_to_wait = []

    for sample in samples:
        sample_output_dirpath = join(output_dirpath, sample.name)
        safe_mkdir(sample_output_dirpath)

    for chrom in chromosomes:
        info('Processing chromosome ' + chrom)
        avg_cov_output_fpath = join(output_dirpath, chrom + '.txt.gz')
        sample_output_fpaths = [join(output_dirpath, sample.name, chrom + '.txt.gz') for sample in samples]

        sample_names = ','.join(sample.name for sample in samples)
        chrom_bams = []
        for sample in samples:
            if not verify_file(sample.bam):
                err('BAM for ' + sample.name + ' does not exist!')
                continue
            output_bam_fpath = join(cnf.work_dir, basename(sample.name) + '_' + str(chrom) + '.bam')
            cmdline = '{sambamba} slice {sample.bam} {chrom}'.format(**locals())
            call(cnf, cmdline, output_fpath=output_bam_fpath)
            if verify_file(output_bam_fpath):
                chrom_bams.append(output_bam_fpath)

        bam_fpaths = ','.join(chrom_bams)

        if cnf.reuse_intermediate and verify_file(avg_cov_output_fpath, silent=True) and \
                all(verify_file(output_fpath, silent=True) for output_fpath in sample_output_fpaths):
            info(avg_cov_output_fpath + ' exists, reusing')
        else:
            j = _submit_region_cov(cnf, cnf.work_dir, chrom, bam_fpaths, sample_names,
                                   output_dirpath, chr_len_fpath)
            if j and not j.is_done:
                jobs_to_wait.append(j)
            info()

        if len(jobs_to_wait) >= cnf.threads:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
            jobs_to_wait = []
        elif not jobs_to_wait:
            info('No jobs to submit.')

    if jobs_to_wait:
        wait_for_jobs(cnf, jobs_to_wait)
def finialize_annotate_file(cnf, vcf_fpath, sample, callername=None):
    # vcf_fpath = leave_first_sample(cnf, vcf_fpath)

    # if not cnf.no_check:
    #     vcf_fpath = _filter_malformed_fields(cnf, vcf_fpath)

    if not cnf.no_check and callername and 'vardict' not in callername:
        info()
        info('Adding SAMPLE=' + sample.name + ' annotation...')
        vcf_fpath = add_annotation(cnf, vcf_fpath, 'SAMPLE', sample.name,
                                   number='1', type_='String', description='Sample name')

    final_vcf_fpath = join(cnf.output_dir,
                           sample.name + (('-' + callername) if callername else '') + '.anno.vcf')
    if cnf.output_file:
        final_vcf_fpath = cnf.output_file
    if not vcf_fpath.endswith('.gz') and final_vcf_fpath.endswith('.gz'):
        final_vcf_fpath = splitext(final_vcf_fpath)[0]
    if vcf_fpath.endswith('.gz') and not final_vcf_fpath.endswith('.gz'):
        final_vcf_fpath = final_vcf_fpath + '.gz'
    info('Moving final VCF ' + vcf_fpath + ' to ' + final_vcf_fpath)
    if isfile(final_vcf_fpath):
        os.remove(final_vcf_fpath)
    shutil.copy(vcf_fpath, final_vcf_fpath)

    if cnf.qc:
        report = qc.make_report(cnf, final_vcf_fpath, sample)
        qc_dirpath = join(cnf.output_dir, 'qc')
        safe_mkdir(qc_dirpath)
        report = qc.save_report(cnf, report, sample, callername, qc_dirpath, source.varqc_name)
        info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
        info('-' * 70)
        info()

    if final_vcf_fpath.endswith('.gz'):
        if not is_gz(final_vcf_fpath):
            err(final_vcf_fpath + ' is in incorrect gzip format')
            anno_vcf_fpath_ungz = splitext(final_vcf_fpath)[0]
            anno_vcf_fpath_gz = final_vcf_fpath
            os.rename(anno_vcf_fpath_gz, anno_vcf_fpath_ungz)
        else:
            info(final_vcf_fpath + ' is a good gzipped file.')
            return [final_vcf_fpath]
    else:
        info('Compressing and indexing with bgzip+tabix ' + final_vcf_fpath)
        final_vcf_fpath = bgzip_and_tabix(cnf, final_vcf_fpath)
        info('Saved VCF again to ' + final_vcf_fpath)
        return [final_vcf_fpath]
def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        dic = load_yaml(open(fpath), Loader=Loader)
    except:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
def merge_patients(patients):
    gender = None
    genders = set(p.gender for p in patients if p.gender)
    if genders:
        if len(genders) > 1:
            err('Different genders detected for the same sample: ' + str(genders))
        gender = next(iter(genders))  # a set is not an iterator, so wrap it with iter()
    return Patient(gender)
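# Illustration only: two records for the same patient, one with the gender filled in;
# the merge keeps the single known value. Patient is assumed to accept gender positionally,
# as in the call above.
def _example_merge_patients():
    return merge_patients([Patient('F'), Patient(None)])  # -> Patient with gender 'F'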
def main():
    cnf = proc_args(sys.argv)

    bigwig_fpath = process_bam(cnf, cnf.bam)
    if isfile(bigwig_fpath) and cnf.project_name and cnf.sample:
        create_jbrowse_symlink(cnf.genome.name, cnf.project_name, cnf.sample, bigwig_fpath)
        info('BAM was successfully converted.')
    elif not isfile(bigwig_fpath):
        err('BAM was not converted to BigWig.')
def get_db_path(cnf, dbconf, dbname):
    db_path = cnf['genome'].get(dbname)
    if not db_path:
        db_path = dbconf.get('path')
        if not db_path:
            err('Please provide a path to "' + dbname + '" in the "genomes" section of the '
                'system config. The config is: ' + str(cnf['genome']))
            return None
    return verify_file(db_path, is_critical=True)
def calc_bases_within_threshs(self, depth_thresholds):
    if self.bases_within_threshs is not None:
        return self.bases_within_threshs

    if self.bases_by_depth is None:
        err('Error: self.bases_by_depth is None for ' + str(self))

    self.bases_within_threshs, self.rates_within_threshs = calc_bases_within_threshs(
        self.bases_by_depth, self.get_size(), depth_thresholds)
    return self.bases_within_threshs
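# A minimal standalone sketch (an assumption about the helper's semantics, not the module's
# implementation): for each depth threshold, count the bases covered at least that deep
# and the corresponding fraction of the target size.
def _example_bases_within_threshs():
    bases_by_depth = {0: 10, 5: 40, 30: 50}  # depth -> number of targeted bases at that depth
    total_size = 100
    depth_thresholds = (1, 5, 30)
    bases_within, rates_within = dict(), dict()
    for t in depth_thresholds:
        n = sum(bases for depth, bases in bases_by_depth.items() if depth >= t)
        bases_within[t] = n
        rates_within[t] = float(n) / total_size if total_size else 0.0
    return bases_within, rates_within  # 90 bases (0.9) at >=1x and >=5x, 50 (0.5) at >=30x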
def __init__(self, dirpath, az_prjname_by_subprj=None, samplesheet=None):
    info('Parsing the HiSeq project structure')
    self.kind = 'hiseq'
    DatasetStructure.__init__(self, dirpath, az_prjname_by_subprj, samplesheet=samplesheet)
    verify_dir(self.unaligned_dirpath, is_critical=True)

    self.basecall_stat_html_reports = self.__get_basecall_stats_reports()

    for pname, project in self.project_by_name.items():
        proj_dirpath = join(self.unaligned_dirpath,
                            'Project_' + pname.replace(' ', '-'))  # .replace('-', '_').replace('.', '_'))

        az_proj_name = az_prjname_by_subprj.get(pname) \
            if not isinstance(az_prjname_by_subprj, basestring) else az_prjname_by_subprj
        if az_proj_name is None:
            if len(self.project_by_name) > 1:
                warn('Warn: cannot match subproject ' + pname + ' to a project name and JIRA case. '
                     'Please follow the SOP for multiple-project runs: '
                     'http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Pre+Processing+QC+Reporting')
                continue
            az_proj_name = az_prjname_by_subprj.values()[0]

        project.set_dirpath(proj_dirpath, az_proj_name)
        for sname, sample in project.sample_by_name.items():
            sample.source_fastq_dirpath = join(project.dirpath,
                                               'Sample_' + sname.replace(' ', '-'))  # .replace('-', '_').replace('.', '_'))
            sample.set_up_out_dirs(project.fastq_dirpath, project.fastqc_dirpath,
                                   project.downsample_targqc_dirpath)

        basecalls_symlink = join(project.dirpath, 'BaseCallsReports')
        if not exists(basecalls_symlink):
            info('Creating BaseCalls symlink ' + self.basecalls_dirpath + ' -> ' + basecalls_symlink)
            try:
                os.symlink(self.basecalls_dirpath, basecalls_symlink)
            except OSError:
                err('Cannot create symlink')
                traceback.print_exc()
            else:
                info('Created')
        if exists(basecalls_symlink):
            self.basecalls_dirpath = basecalls_symlink

    self.get_fastq_regexp_fn = get_hiseq_regexp
def parse_gene_counts(counts_fpath, key_gene_names, report_name, keep_gene_names):
    gene_counts = defaultdict(list)
    info('Preparing ' + report_name + ' stats for expression heatmaps')

    info('Checking ' + counts_fpath)
    if not verify_file(counts_fpath):
        err('Cannot find ' + report_name + ' fpath')
        # return an empty mapping and sample list so callers can unpack the result uniformly
        return gene_counts, []

    info('Reading ' + report_name + ' from ' + counts_fpath)
    samples_cols = dict()
    samples = []
    gene_col = None
    with open(counts_fpath) as f:
        for i, l in enumerate(f):
            if i == 0:
                header = l.strip().split('\t')
                gene_col = header.index('HUGO')
                samples = header[1:gene_col]
                samples_cols = {sample: col + 1 for col, sample in enumerate(samples)}
                continue
            fs = l.replace('\n', '').split('\t')
            gene_name = fs[gene_col]
            if key_gene_names and gene_name not in key_gene_names:
                continue
            gene_expression_dict = {sample: int(float(fs[col])) if float(fs[col]).is_integer() else float(fs[col])
                                    for sample, col in samples_cols.iteritems()}
            if all(v < HEATMAPS_MIN_COUNT for v in gene_expression_dict.values()):
                continue
            is_hidden_row = False
            name = gene_name
            if ':' in fs[0]:  # exon number
                is_hidden_row = True
                exon_number = fs[0].split(':')[1]
                name += ':' + exon_number
            if keep_gene_names:
                is_hidden_row = True
                name = fs[0]  # use id
            gene = Counts(name, gene_name=gene_name, counts=gene_expression_dict,
                          is_hidden_row=is_hidden_row)
            gene_counts[gene_name].append(gene)
    return gene_counts, samples
def main(args):
    if len(args) < 2:
        critical('Usage: ' + __file__ + ' InputRootDirectory OutputRootDirectory [Build=hg38]')
        sys.exit(1)

    inp_root = adjust_path(args[0])
    out_root = adjust_path(args[1])

    build = 'hg38'
    if len(args) >= 3:
        build = args[2]
    chain_fpath = chains[build.lower()]

    for inp_dirpath, subdirs, files in os.walk(inp_root):
        for fname in files:
            if fname == 'sample1-cn_mops.bed':
                pass
            if fname.endswith('.bed'):
                inp_fpath = adjust_path(join(inp_dirpath, fname))
                print inp_fpath + ': ' + str(count_bed_cols(inp_fpath)) + ' columns'

                out_dirpath = adjust_path(join(out_root, relpath(inp_dirpath, inp_root)))
                safe_mkdir(out_dirpath)
                out_fpath = adjust_path(join(out_dirpath, fname))
                unlifted_fpath = adjust_path(join(out_dirpath, fname + '.unlifted'))

                cmdline = ''
                with open(inp_fpath) as f:
                    fs = f.readline().split('\t')
                try:
                    int(fs[6])
                    int(fs[7])
                except:
                    info('Cutting ' + inp_fpath)
                    cmdline += 'cut -f1,2,3,4 "{inp_fpath}" > __cut; '

                cmdline += liftover_fpath + ' __cut {chain_fpath} "{out_fpath}" "{unlifted_fpath}"'
                cmdline = cmdline.format(**locals())
                info(cmdline)
                os.system(cmdline)

                verify_file(out_fpath)
                if isfile(unlifted_fpath):
                    if getsize(unlifted_fpath) <= 0:
                        os.remove(unlifted_fpath)
                    else:
                        err('Some records were unlifted and saved to ' + unlifted_fpath)
def run_seq2c(cnf, output_dirpath, samples, seq2c_bed, is_wgs):
    step_greetings('Running Seq2C')

    bams_by_sample = dict()
    for s in samples:
        if not s.bam:
            err('No BAM file for ' + s.name)
            continue
        bams_by_sample[s.name] = s.bam

    #     cnf.work_dir = join(ori_work_dir, source.targqc_name + '_' + s.name)
    #     safe_mkdir(cnf.work_dir)
    #     s.dedup_bam = intermediate_fname(cnf, s.bam, source.dedup_bam)
    #     dedupped_bam_by_sample[s.name] = s.dedup_bam
    #     if verify_bam(s.dedup_bam, silent=True):
    #         info(s.dedup_bam + ' exists')
    #     else:
    #         info('Deduplicating bam file ' + s.dedup_bam)
    #         dedup_jobs.append(remove_dups(cnf, s.bam, s.dedup_bam, use_grid=True))
    #     cnf.work_dir = ori_work_dir
    # wait_for_jobs(cnf, dedup_jobs)
    #
    # ok = True
    # for s in samples:
    #     if not dedupped_bam_by_sample.get(s.name) or not verify_bam(dedupped_bam_by_sample[s.name]):
    #         err('No BAM file for ' + s.name)
    #         ok = False
    # if not ok:
    #     err('No BAM files found for any sample, cannot run Seq2C.')
    #     return None

    info('Getting reads and cov stats')
    mapped_read_fpath = join(output_dirpath, 'mapped_reads_by_sample.tsv')
    mapped_read_fpath, samples = __get_mapped_reads(cnf, samples, bams_by_sample, mapped_read_fpath)
    info()
    if not mapped_read_fpath:
        return None

    combined_gene_depths_fpath = join(output_dirpath, 'cov.tsv')
    combined_gene_depths_fpath = __seq2c_coverage(cnf, samples, bams_by_sample, seq2c_bed,
                                                  is_wgs, combined_gene_depths_fpath)
    info()
    if not combined_gene_depths_fpath:
        return None

    seq2c_report_fpath = join(output_dirpath, source.seq2c_name + '.tsv')
    seq2c_report_fpath = __final_seq2c_scripts(cnf, mapped_read_fpath, combined_gene_depths_fpath,
                                               seq2c_report_fpath)
    if not seq2c_report_fpath:
        return None

    info('Done. The result is ' + seq2c_report_fpath)
    return seq2c_report_fpath
def main():
    root_dirpath = proc_opts()

    info('*' * 60)
    info()

    all_issues = []

    info('Iterating over ' + root_dirpath)
    info('-' * 60)
    info()
    for fname in os.listdir(root_dirpath):
        if fname.startswith('.'):
            continue
        info(fname)

        project_dirpath = join(root_dirpath, fname)
        if isdir(project_dirpath) \
                and isfile(join(project_dirpath, 'SampleSheet.csv')) \
                and isdir(join(project_dirpath, 'Unalign')):
            info('Unalign and SampleSheet.csv found')
            ds = DatasetStructure.create(project_dirpath, '')
            issues = []
            if not ds.project_by_name:
                err('No projects found')
            else:
                info('Projects: ' + ', '.join([p.name + ' (' + ', '.join(p.sample_by_name) + ')'
                                               for p in ds.project_by_name.values()]))
            for project in ds.project_by_name.values():
                if not project.sample_by_name:
                    err('No samples for project ' + project.name + ' found')
                else:
                    for i, s1 in enumerate(project.sample_by_name.values()):
                        for s2 in project.sample_by_name.values()[i + 1:]:
                            if s2.name.startswith(s1.name):
                                issues.append(' issued samples: ' + s1.name + ' and ' + s2.name +
                                              ' from ' + project.name)
            if issues:
                all_issues.append(fname + ' created: %s, last modified: %s' %
                                  (time.ctime(os.path.getctime(project_dirpath)),
                                   time.ctime(os.path.getmtime(project_dirpath))))
                all_issues.extend(issues)
                all_issues.append('')

        info()
    info('-' * 60)
    info()
    info('Failed projects: ')
    for msg in all_issues:
        info(msg)
def verify_vcf(vcf_fpath, silent=False, is_critical=False):
    if not verify_file(vcf_fpath, silent=silent, is_critical=is_critical):
        return None
    debug('File ' + vcf_fpath + ' exists and not empty')
    vcf = open_gzipsafe(vcf_fpath)
    debug('File ' + vcf_fpath + ' opened')
    l = next(vcf, None)
    if l is None:
        (critical if is_critical else err)('Error: cannot read the VCF file ' + vcf_fpath)
        return None
    if not l.startswith('##fileformat=VCF'):
        (critical if is_critical else err)('Error: VCF must start with ##fileformat=VCF ' + vcf_fpath)
        return None

    try:
        reader = vcf_parser.Reader(vcf)
    except:
        err('Error: cannot open the VCF file ' + vcf_fpath)
        if is_critical:
            raise
    else:
        debug('File ' + vcf_fpath + ' opened as VCF')
        try:
            rec = next(reader)
        except IndexError:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('IndexError parsing VCF file ' + vcf_fpath)
            if is_critical:
                raise
        except ValueError:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('ValueError parsing VCF file ' + vcf_fpath)
            if is_critical:
                raise
        except StopIteration:
            debug('No records in the VCF file ' + vcf_fpath)
            if not silent:
                warn('VCF file ' + vcf_fpath + ' has no records.')
            return vcf_fpath
        except:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('Other error parsing VCF file ' + vcf_fpath)
            if is_critical:
                raise
        else:
            debug('A record was read from the VCF file ' + vcf_fpath)
            return vcf_fpath

        # f = open_gzipsafe(output_fpath)
        # l = f.readline()
        # if 'Cannot allocate memory' in l:
        #     f.close()
        #     f = open_gzipsafe(output_fpath)
        #     contents = f.read()
        #     if not silent:
        #         if is_critical:
        #             critical('SnpSift failed with memory issue:\n' + contents)
        #         else:
        #             err('SnpSift failed with memory issue:\n' + contents)
        #         return None
        #     f.close()
        #     return None
        # return output_fpath
    finally:
        vcf.close()
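# Hedged usage sketch: the path is a placeholder. A failed check returns None,
# so the return value doubles as a validity flag.
def _example_check_vcf(vcf_fpath='/path/to/sample.anno.vcf.gz'):
    if verify_vcf(vcf_fpath, silent=True):
        info('VCF looks parseable: ' + vcf_fpath)
    else:
        err('VCF failed validation: ' + str(vcf_fpath))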
def run_sambamba_use_grid(cnf, infos_by_key, mut_bed_fpath):
    sambamba_output_by_experiment = dict()

    not_submitted_experiments = infos_by_key.values()
    while not_submitted_experiments:
        jobs_to_wait = []
        submitted_experiments = []
        reused_experiments = []

        for (group, uniq_key), e in infos_by_key.iteritems():
            if e not in not_submitted_experiments:
                continue
            sambamba_output_fpath = join(cnf.work_dir, uniq_key + '__mutations.bed')
            sambamba_output_by_experiment[e] = sambamba_output_fpath

            if cnf.reuse_intermediate and verify_file(sambamba_output_fpath, silent=True):
                info(sambamba_output_fpath + ' exists, reusing')
                reused_experiments.append(e)
                continue
            else:
                if not e.sample.bam:
                    err('Sample ' + e.sample.name + ' in ' + str(group) + ', ' + str(uniq_key) + ' has no BAM')
                    continue
                j = sambamba_depth(cnf, mut_bed_fpath, e.sample.bam,
                                   output_fpath=sambamba_output_fpath, only_depth=True,
                                   silent=True, use_grid=True)
                submitted_experiments.append(e)
                if not j.is_done:
                    jobs_to_wait.append(j)
                if len(jobs_to_wait) >= cnf.threads:
                    break

        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')

        not_submitted_experiments = [e for e in not_submitted_experiments
                                     if e not in submitted_experiments and e not in reused_experiments]

    return sambamba_output_by_experiment