def get_vcf_readers(mutations_by_experiment, cur_group_num):
    vcf_readers, filt_vcf_readers = dict(), dict()
    for e, muts in mutations_by_experiment.items():
        if cur_group_num and get_group_num(e.key) != cur_group_num:
            continue
        variant_caller = 'vardict' if 'vardict' in e.sample.variantcallers else 'vardict-java'
        vcf_fpath = e.sample.vcf_by_callername.get(variant_caller)
        if vcf_fpath:
            vcf_readers[e] = vcf.Reader(open_gzipsafe(vcf_fpath, 'r'))
            filt_vcf_fpath = e.sample.find_filt_vcf_by_callername(variant_caller)
            if filt_vcf_fpath:
                filt_vcf_readers[e] = vcf.Reader(open_gzipsafe(filt_vcf_fpath, 'r'))
    return vcf_readers, filt_vcf_readers
def get_transcripts_with_exons_from_features(features_file, cur_chrom=None):
    transcripts = defaultdict(list)
    with open_gzipsafe(adjust_path(features_file)) as in_f:
        for line in in_f:
            if line.startswith('#'):
                continue
            fields = line.strip('\n').split('\t')
            chrom = fields[0]
            if cur_chrom and chrom != cur_chrom:
                continue
            feature_type = fields[6]
            if feature_type not in ['Exon', 'CDS', 'UTR']:
                continue
            start = int(fields[1])
            stop = int(fields[2])
            transcript_id = fields[8]
            exon = {'transcript_id': transcript_id, 'chrom': chrom, 'start': start, 'stop': stop}
            transcripts[(transcript_id, chrom)].append(exon)
    return transcripts
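# A minimal sketch (not from the original codebase) illustrating the column layout
# that get_transcripts_with_exons_from_features() above assumes: tab-separated rows
# with chrom in column 0, start/stop in columns 1-2, the feature type in column 6,
# and the transcript ID in column 8. The sample line below is fabricated.
def _demo_parse_feature_line():
    line = 'chr1\t11868\t12227\t.\t.\t+\tExon\t.\tENST00000456328'
    fields = line.strip('\n').split('\t')
    exon = {'transcript_id': fields[8], 'chrom': fields[0],
            'start': int(fields[1]), 'stop': int(fields[2])}
    assert exon['stop'] - exon['start'] == 359
    return exon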
def _get_gene_transcripts_id(cnf):
    genes_dict = dict()
    transcripts_dict = dict()

    if not cnf.genome.all_transcripts:
        critical('File with transcript and gene IDs for ' + cnf.genome.name +
                 ' was not found! Heatmaps cannot be created.')
    if not verify_file(cnf.genome.all_transcripts):
        critical('File with transcript and gene IDs for ' + cnf.genome.name + ' at ' +
                 cnf.genome.all_transcripts + ' was not found! Heatmaps cannot be created.')

    info('Getting transcript and gene IDs from ' + cnf.genome.all_transcripts)
    with open_gzipsafe(cnf.genome.all_transcripts) as f:
        for i, l in enumerate(f):
            if l.startswith('#'):
                continue
            chrom, _, feature, start, end, _, strand, _, props_line = l.replace('\n', '').split('\t')
            if feature != 'transcript':
                continue
            try:
                _prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                                  for t in props_line.split(';') if t.strip())
            except ValueError:
                sys.stderr.write(format_exc())
                sys.stderr.write(l)
                continue  # skip malformed attribute lines; _prop_dict is undefined here

            gene_symbol = _rm_quotes(_prop_dict['gene_name'])
            gene_id = _rm_quotes(_prop_dict['gene_id'])
            transcript_id = _rm_quotes(_prop_dict['transcript_id'])
            # gene = Gene(gene_symbol, chrom=chrom, gene_id=gene_id, transcript_id=transcript_id)
            genes_dict[gene_id] = gene_symbol
            transcripts_dict[transcript_id] = gene_symbol
    return genes_dict, transcripts_dict
def main():
    dbsnp_fpath, out_fpath = get_args()
    info('-' * 70)
    info('Reading ' + dbsnp_fpath + ', writing to ' + out_fpath)
    with open_gzipsafe(dbsnp_fpath) as dbsnp, open(out_fpath, 'w') as out:
        for l in dbsnp:
            if l.startswith('#'):
                continue
            fs = l.replace('\n', '').split('\t')
            assert len(fs) == 8, str(fs)
            chrom, pos, rsid, ref, alt, _, _, inf = fs
            alts = alt.split(',')
            if len(alts) > 1:
                caf = next((kv.split('=')[1] for kv in inf.split(';')
                            if kv.split('=')[0] == 'CAF'), None)
                if caf:
                    # the first CAF value is the REF allele frequency; the rest align with the ALTs
                    cafs = caf.replace('[', '').replace(']', '').split(',')[1:]
                    assert len(cafs) == len(alts), l
                    for alt, caf in zip(alts, cafs):
                        if caf != '.':
                            l = '\t'.join([chrom, pos, rsid, ref, alt, caf]) + '\n'
                            out.write(l)
    info()
    info('Saved to ' + out_fpath)
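# A hedged, standalone illustration of the CAF handling in main() above: in dbSNP VCFs
# the CAF INFO field lists the reference allele frequency first, followed by one
# frequency per ALT allele, so dropping the first value aligns the rest with ALT.
# The INFO string below is fabricated for demonstration.
def _demo_caf_split():
    inf = 'RS=123;CAF=[0.9,0.08,.]'
    alts = ['A', 'T']
    caf = next((kv.split('=')[1] for kv in inf.split(';') if kv.split('=')[0] == 'CAF'), None)
    cafs = caf.replace('[', '').replace(']', '').split(',')[1:]
    # alt 'A' gets frequency '0.08'; alt 'T' has no estimate ('.') and would be skipped
    return list(zip(alts, cafs))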
def verify_vcf(vcf_fpath, silent=False, is_critical=False):
    if not verify_file(vcf_fpath, silent=silent, is_critical=is_critical):
        return None
    debug('File ' + vcf_fpath + ' exists and is not empty')
    vcf = open_gzipsafe(vcf_fpath)
    debug('File ' + vcf_fpath + ' opened')
    try:
        l = next(vcf, None)
        if l is None:
            (critical if is_critical else err)('Error: cannot read the VCF file ' + vcf_fpath)
            return None
        if not l.startswith('##fileformat=VCF'):
            (critical if is_critical else err)('Error: VCF must start with ##fileformat=VCF ' + vcf_fpath)
            return None

        try:
            reader = vcf_parser.Reader(vcf)
        except:
            err('Error: cannot open the VCF file ' + vcf_fpath)
            if is_critical:
                raise
            return None  # without this, the code below would hit an undefined `reader`
        debug('File ' + vcf_fpath + ' opened as VCF')

        try:
            rec = next(reader)
        except IndexError:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('IndexError parsing VCF file ' + vcf_fpath)
            if is_critical:
                raise
        except ValueError:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('ValueError parsing VCF file ' + vcf_fpath)
            if is_critical:
                raise
        except StopIteration:
            debug('No records in the VCF file ' + vcf_fpath)
            if not silent:
                warn('VCF file ' + vcf_fpath + ' has no records.')
            return vcf_fpath
        except:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('Other error parsing VCF file ' + vcf_fpath)
            if is_critical:
                raise
        else:
            debug('A record was read from the VCF file ' + vcf_fpath)
            return vcf_fpath

        # f = open_gzipsafe(output_fpath)
        # l = f.readline()
        # if 'Cannot allocate memory' in l:
        #     f.close()
        #     f = open_gzipsafe(output_fpath)
        #     contents = f.read()
        #     if not silent:
        #         if is_critical:
        #             critical('SnpSift failed with memory issue:\n' + contents)
        #         else:
        #             err('SnpSift failed with memory issue:\n' + contents)
        #         return None
        #     f.close()
        #     return None
        # return output_fpath
    finally:
        vcf.close()
def vcf_is_empty(cnf, vcf_fpath):
    vcf = open_gzipsafe(vcf_fpath)
    reader = vcf_parser.Reader(vcf)
    result = True
    for rec in reader:
        result = False
        break  # one record is enough; no need to scan the whole file
    vcf.close()
    return result
def read_sample_names_from_vcf(vcf_fpath):
    f = open_gzipsafe(vcf_fpath)
    basic_fields = next(
        (l.strip()[1:].split() for l in f if l.strip().startswith('#CHROM')),
        None)
    f.close()
    if not basic_fields:
        critical('Error: no VCF header in ' + vcf_fpath)
    if len(basic_fields) < 9:
        return []
    return basic_fields[9:]
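# Sketch of the header convention read_sample_names_from_vcf() relies on: the #CHROM
# line holds 9 fixed columns (CHROM..FORMAT) followed by one column per sample.
# The header below is a made-up example.
def _demo_sample_names():
    header = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsampleA\tsampleB'
    basic_fields = header.strip()[1:].split()
    return basic_fields[9:]  # -> ['sampleA', 'sampleB']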
def run_fastq(cnf, sample_name, l_r_fpath, r_r_fpath, output_dirpath, downsample_to=1e7):
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)

    if downsample_to:
        info('Downsampling to ' + str(downsample_to))
        l_r_fpath, r_r_fpath = downsample(cnf, sample_name, l_r_fpath, r_r_fpath,
                                          downsample_to, output_dir=cnf.work_dir)

    # Joining fastq files to run FastQC on the combination
    fastqc_fpath = join(cnf.work_dir, sample_name + '.fq')
    info('Combining fastqs, writing to ' + fastqc_fpath)
    with open(fastqc_fpath, 'w') as out:
        out.write(open_gzipsafe(l_r_fpath).read())
        out.write(open_gzipsafe(r_r_fpath).read())

    # Running FastQC
    info('Running FastQC')
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + sample_name + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline = '{fastqc} --dir {tmp_dirpath} --extract -o {output_dirpath} -f fastq -j {java} {fastqc_fpath}'.format(**locals())
    call(cnf, cmdline)

    # Cleaning up and checking the report
    sample_fastqc_dirpath = join(output_dirpath, sample_name + '.fq_fastqc')
    if isfile(sample_fastqc_dirpath + '.zip'):
        os.remove(sample_fastqc_dirpath + '.zip')
    fastqc_html_fpath = join(sample_fastqc_dirpath, 'fastqc_report.html')
    verify_file(fastqc_html_fpath, is_critical=True)
    return sample_fastqc_dirpath
def check_file_changed(cnf, new, in_work):
    if not file_exists(in_work):
        cnf['reuse_intermediate'] = False

    if cnf.get('reuse_intermediate'):
        if (basename(in_work) != basename(new) or
                md5_for_file(open(in_work, 'rb')) != md5_for_file(open_gzipsafe(new, 'rb'))):
            info('Input file %s changed, setting "reuse_intermediate" to False.' % str(new))
            cnf['reuse_intermediate'] = False
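# md5_for_file() is defined elsewhere in the codebase; below is a minimal sketch of what
# such a helper typically looks like (chunked hashing so large inputs are never loaded
# into memory at once). This is an assumption about its behavior, not the original code.
def _md5_for_file_sketch(f, block_size=2 ** 20):
    import hashlib
    md5 = hashlib.md5()
    while True:
        data = f.read(block_size)
        if not data:
            break
        md5.update(data)
    return md5.hexdigest()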
def _get_qual_threshold(input_fpath):
    qual_threshold = None
    q_filter_regex = re.compile(r'##FILTER=<ID=q(\d+),Description="Mean Base Quality Below \d+">')
    with open_gzipsafe(input_fpath) as f:
        for l in f:
            if not l.startswith('##'):
                break
            m = q_filter_regex.match(l)
            if m:
                qual_threshold = int(m.group(1))
                break
    return qual_threshold
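# Quick check of the FILTER-header regex used by _get_qual_threshold() above,
# against a fabricated header line of the expected shape.
def _demo_qual_threshold():
    import re
    regex = re.compile(r'##FILTER=<ID=q(\d+),Description="Mean Base Quality Below \d+">')
    m = regex.match('##FILTER=<ID=q10,Description="Mean Base Quality Below 10">')
    return int(m.group(1))  # -> 10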
def _get_subs_and_indel_stats(vcf_fpath, chr_lengths, plot_scale):
    reader = vcf.Reader(open_gzipsafe(vcf_fpath, 'r'))

    variants_distribution = dict()
    for chr_name, chr_length in chr_lengths:
        variants_distribution[chr_name] = [0] * max(1, chr_length // plot_scale)
    variants_distribution['OTHER'] = 0

    substitutions = OrderedDict()
    nucleotides = ['A', 'C', 'G', 'T']

    def _add_nuc(nuc):
        substitutions[nuc] = OrderedDict()
        for nuc2 in nucleotides:
            if nuc != nuc2:
                substitutions[nuc][nuc2] = 0

    for nuc in nucleotides:
        _add_nuc(nuc)

    indel_lengths = []

    for rec in reader:
        # for the variants distribution plot
        if rec.CHROM not in variants_distribution:
            variants_distribution['OTHER'] += 1
        else:
            region_id = min((rec.POS - 1) // plot_scale, len(variants_distribution[rec.CHROM]) - 1)
            variants_distribution[rec.CHROM][region_id] += 1

        # for the substitution and indel plots
        for alt in rec.ALT:
            if rec.is_snp:
                if rec.REF not in substitutions:
                    nucleotides.append(rec.REF)
                    _add_nuc(rec.REF)
                if alt.sequence not in substitutions:
                    nucleotides.append(alt.sequence)
                    _add_nuc(alt.sequence)
                substitutions[rec.REF][str(alt)] += 1
            elif rec.is_indel:
                if alt is None:
                    indel_lengths.append(-1)
                else:
                    indel_lengths.append(len(alt) - len(rec.REF))

    # the last region in each chromosome is shorter than plot_scale, so rescale its count
    for chr_name, chr_length in chr_lengths:
        last_region_length = chr_length % plot_scale + (0 if chr_length < plot_scale else plot_scale)
        variants_distribution[chr_name][-1] = int(variants_distribution[chr_name][-1] *
                                                  plot_scale / float(last_region_length))

    return variants_distribution, substitutions, indel_lengths
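# The binning used above, isolated as a standalone sketch with made-up numbers:
# positions map to fixed-width bins of plot_scale bases, and anything past the last
# full bin is clamped into it.
def _demo_position_bin(pos=25001, chrom_len=30000, plot_scale=10000):
    n_bins = max(1, chrom_len // plot_scale)          # -> 3 bins
    return min((pos - 1) // plot_scale, n_bins - 1)   # -> bin 2 (the last one)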
def parse_variants(vcf_fpath, only_pass=True):
    variants_by_chrom = defaultdict(list)
    ann_field_names = []
    with open_gzipsafe(vcf_fpath) as vcf_f:
        for line in vcf_f:
            line = line.strip('\n')
            if line.startswith('##INFO=<ID=ANN'):
                ann_field_names = line.split('Format: ')[-1].strip('">').split('|')
                ann_field_names = [f.strip() for f in ann_field_names]
                ann_field_names[0] = ann_field_names[0].split('\'')[1]
            if line.startswith('#'):
                continue
            if only_pass and 'PASS' not in line:
                continue
            fields = line.split('\t')
            info_field = dict([(x.split('=', 1)) if '=' in x else (x, x)
                               for x in re.split(';(?=\w)', fields[7])])
            annotation_array = info_field['ANN'].split(',') if 'ANN' in info_field else []
            all_annotations = [dict(zip(ann_field_names, x.split('|'))) for x in annotation_array
                               if len(ann_field_names) == len(x.split('|'))]
            coding_annotations = [ann for ann in all_annotations if ann['Feature_ID'].startswith('NM')]

            alt_alleles = fields[4].split(',')
            # a separate variant for each alt allele: build a fresh dict per allele,
            # otherwise all alleles of a line would share and overwrite one object
            for i, alt_allele in enumerate(alt_alleles):
                annotations = [ann for ann in coding_annotations if ann['Allele'] == alt_allele]
                variant = dict()
                variant['chrom'] = fields[0]
                variant['pos'], variant['ref'], variant['alt'] = \
                    get_minimal_representation(fields[1], fields[3], alt_allele)
                variant['transcripts'] = set()
                for annotation in annotations:
                    transcript = annotation['Feature_ID'].split('.')[0]
                    variant['transcripts'].add(transcript)
                variant['transcripts'] = list(variant['transcripts'])
                variants_by_chrom[variant['chrom']].append(variant)
    return variants_by_chrom
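# get_minimal_representation() comes from elsewhere in the codebase. A sketch of the
# usual algorithm (as popularized by the ExAC browser code): right-trim shared suffix
# bases, then left-trim shared prefix bases while advancing pos. This is an assumption
# about its behavior, not the original implementation.
def _minimal_representation_sketch(pos, ref, alt):
    pos = int(pos)
    while len(ref) > 1 and len(alt) > 1 and ref[-1] == alt[-1]:
        ref, alt = ref[:-1], alt[:-1]
    while len(ref) > 1 and len(alt) > 1 and ref[0] == alt[0]:
        ref, alt = ref[1:], alt[1:]
        pos += 1
    return pos, ref, alt

# e.g. _minimal_representation_sketch(1001, 'CTCC', 'CCC') -> (1001, 'CT', 'C')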
def main():
    if len(sys.argv) < 4:
        info('The script writes all CDS, stop codon, and ncRNA exon regions for all known Ensembl genes, with associated gene symbols.')
        # info('When the gene name is found in HGNC, it gets replaced with an approved name.')
        # info('If the gene is not characterized (like LOC729737), this symbol is kept as is.')
        info('')
        info('Usage:')
        info('    ' + __file__ + ' hg19 db.gtf output.bed [HGNC_gene_synonyms.txt=' + us_syn_path + '] [additional_feature_list]')
        info('')
        info('    where HGNC_gene_synonyms.txt (from http://www.genenames.org/cgi-bin/download) is:')
        info('    #Approved Symbol  Previous Symbols                    Synonyms                          Chromosome  Ensembl Gene ID  UCSC ID(supplied by UCSC)')
        info('    OR7E26P           OR7E67P, OR7E69P, OR7E70P, OR7E68P  OR1-51, OR1-72, OR1-73, OR912-95  19q13.43    ENSG00000121410  uc002qsg.3')
        info('    ...')
        info('')
        info('    or DB is Ensembl GTF ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz')
        info('    1  pseudogene            gene        11869  14412  .  +  .  gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";')
        info('    1  processed_transcript  transcript  11869  14409  .  +  .  gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";')
        info('    ...')
        info('')
        info('    or DB is RefSeq GFF3 ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/H_sapiens/GFF/ref_GRCh38.p2_top_level.gff3.gz')
        info('    NC_000001.10  RefSeq      region      1      249250621  .  +  .  ID=id0;Name=1;Dbxref=taxon:9606;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA')
        info('    NC_000001.10  BestRefSeq  gene        11874  14409      .  +  .  ID=gene0;Name=DDX11L1;Dbxref=GeneID:100287102,HGNC:37102;description=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;gbkey=Gene;gene=DDX11L1;part=1%2F1;pseudo=true')
        info('    NC_000001.10  BestRefSeq  transcript  11874  14409      .  +  .  ID=rna0;Name=NR_046018.2;Parent=gene0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2')
        info('    NC_000001.10  BestRefSeq  exon        11874  12227      .  +  .  ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2')
        info('    ...')
        info('')
        info('    or either RefSeq_knownGene.txt or UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables) is:')
        info('    #hg19.knownGene.name  hg19.knownGene.chrom  hg19.knownGene.strand  hg19.knownGene.txStart  hg19.knownGene.txEnd  hg19.knownGene.exonCount  hg19.knownGene.exonStarts  hg19.knownGene.exonEnds  hg19.kgXref.geneSymbol')
        info('    uc001aaa.3            chr1                  +                      11873                   14409                 3                         11873,12612,13220,         12227,12721,14409,       DDX11L1')
        info('    ...')
        info('')
        info('    Writes to Exons.bed')
        info('')
        info('See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols')
        sys.exit(1)

    genome_name = sys.argv[1]
    seq_fpath = hg19_seq_fpath if genome_name == 'hg19' else hg38_seq_fpath
    canonical_transcripts_fpath = canonical_hg19_transcripts_fpath if genome_name == 'hg19' else canonical_hg38_transcripts_fpath
    chr_lengths = get_chr_lengths_from_seq(seq_fpath)
    chr_order = {c: i for i, (c, l) in enumerate(chr_lengths)}

    input_fpath = verify_file(sys.argv[2])
    output_fpath = adjust_path(sys.argv[3])

    synonyms_fpath = None
    if len(sys.argv) > 4:
        synonyms_fpath = verify_file(sys.argv[4])
        info('Synonyms file provided ' + synonyms_fpath)
    else:
        info('No synonyms file provided, skipping approving')

    not_approved_fpath = None
    if len(sys.argv) > 5:
        not_approved_fpath = adjust_path(sys.argv[5])

    with open(verify_file(canonical_transcripts_fpath)) as f:
        canonical_transcripts_ids = set(l.strip().split('.')[0] for l in f)

    info('Reading the features...')
    with open_gzipsafe(input_fpath) as inp:
        l = inp.readline()
        # the input file extension determines which parser to use
        if input_fpath.endswith('.gtf') or input_fpath.endswith('.gtf.gz'):
            gene_by_name_and_chrom = _proc_ensembl_gtf(inp, output_fpath, chr_order)
        elif input_fpath.endswith('.gff3') or input_fpath.endswith('.gff3.gz'):
            gene_by_name_and_chrom = _proc_refseq_gff3(inp, output_fpath, chr_order)
        else:
            gene_by_name_and_chrom = _proc_ucsc(inp, output_fpath, chr_order)

    if synonyms_fpath and synonyms_fpath != "''":
        gene_by_name_and_chrom, not_approved_gene_names = _approve(gene_by_name_and_chrom, synonyms_fpath)

        info('')
        info('Not approved by HGNC - ' + str(len(not_approved_gene_names)) + ' genes.')
        if not_approved_fpath:
            with open(not_approved_fpath, 'w') as f:
                f.write('#Searched as\tStatus\n')
                f.writelines((l + '\n' for l in not_approved_gene_names))
            info('Saved not approved to ' + not_approved_fpath)

    # with open('serialized_genes.txt', 'w') as f:
    #     for g in gene_by_name.values():
    #         f.write(str(g) + '\t' + str(g.db_id) + '\n')
    #         for e in g.exons:
    #             f.write('\t' + str(e) + '\n')

    info('Found:')
    info('  ' + str(len(gene_by_name_and_chrom)) + ' genes')

    genes = gene_by_name_and_chrom.values()

    coding_and_mirna_genes = [g for g in genes
                              if any(t.biotype in ['protein_coding', 'miRNA'] for t in g.transcripts)]
    coding_genes = [g for g in coding_and_mirna_genes
                    if any(t.biotype == 'protein_coding' for t in g.transcripts)]
    coding_transcripts = [t for g in coding_and_mirna_genes for t in g.transcripts
                          if t.biotype == 'protein_coding']
    mirna_genes = [g for g in coding_and_mirna_genes
                   if any(t.biotype == 'miRNA' for t in g.transcripts)]
    mirna_transcripts = [t for g in coding_and_mirna_genes for t in g.transcripts
                         if t.biotype == 'miRNA']
    codingmiRNA_genes = [g for g in coding_and_mirna_genes
                         if any(t.biotype == 'miRNA' for t in g.transcripts)
                         and any(t.biotype == 'protein_coding' for t in g.transcripts)]

    info('  ' + str(len(coding_genes)) + ' coding genes')
    info('  ' + str(len(coding_transcripts)) + ' coding transcripts')
    info('  ' + str(len(mirna_genes)) + ' miRNA genes')
    info('  ' + str(len(mirna_transcripts)) + ' miRNA transcripts')
    info('  ' + str(len(codingmiRNA_genes)) + ' genes with both coding and miRNA transcripts')
    info()

    # info('Choosing genes with exons...')
    # genes = [g for g in genes if any(tx.exons for tx in g.transcripts)]

    info('Choosing canonical...')
    canon_genes = choose_canonical(genes, canonical_transcripts_ids)
    info()

    info('Sorting and printing all regions...')
    print_genes(genes, output_fpath, canon_only=False)
    info()

    info('Sorting and printing canonical regions...')
    canon_output_fpath = add_suffix(output_fpath, 'canon')
    print_genes(canon_genes, canon_output_fpath, canon_only=True)
    info()

    info('Saved all regions to\n   ' + output_fpath + '\n   ' + canon_output_fpath)
def postprocess_vcf(cnf, work_dir, var_sample, caller_name, variants, mutations, vcf2txt_res_fpath):
    if cnf is None:
        global glob_cnf
        cnf = glob_cnf

    info(var_sample.name + ((', ' + caller_name) if caller_name else '') + ': writing filtered VCFs')

    filter_values = set(variants.values())

    # Saving .anno.filt.vcf.gz and .anno.filt.pass.vcf
    ungz, gz = None, None
    if var_sample.filt_vcf_fpath.endswith('.gz'):
        ungz = splitext(var_sample.filt_vcf_fpath)[0]
        gz = var_sample.filt_vcf_fpath
    else:
        ungz = var_sample.filt_vcf_fpath
        gz = var_sample.filt_vcf_fpath + '.gz'
    if not var_sample.filt_tsv_fpath:
        var_sample.filt_tsv_fpath = splitext(ungz)[0] + '.tsv'

    if cnf.reuse_intermediate \
            and verify_file(var_sample.filt_vcf_fpath, silent=True) \
            and verify_file(var_sample.pass_filt_vcf_fpath, silent=True) \
            and verify_file(var_sample.filt_tsv_fpath, silent=True):
        info(var_sample.filt_vcf_fpath + ' and ' + var_sample.pass_filt_vcf_fpath + ' exist; reusing.')
    else:
        safe_mkdir(dirname(var_sample.filt_vcf_fpath))
        safe_mkdir(dirname(var_sample.pass_filt_vcf_fpath))

        with open_gzipsafe(var_sample.anno_vcf_fpath) as vcf_f, \
                file_transaction(work_dir, ungz) as filt_tx, \
                file_transaction(work_dir, var_sample.pass_filt_vcf_fpath) as pass_tx:
            with open(filt_tx, 'w') as filt_f, open(pass_tx, 'w') as pass_f:
                info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
                     ': opened ' + var_sample.anno_vcf_fpath + ', writing to ' +
                     ungz + ' and ' + var_sample.pass_filt_vcf_fpath)

                for l in vcf_f:
                    if l.startswith('#'):
                        if l.startswith('#CHROM'):
                            filt_f.write('##FILTER=<ID=vcf2txt,Description="Hard-filtered by vcf2txt.pl">\n')
                            filt_f.write('##FILTER=<ID=vardict2mut,Description="Hard-filtered by vardict2mut.pl">\n')
                            for filt_val in filter_values:
                                if filt_val != 'PASS':
                                    filt_f.write('##FILTER=<ID=' + filt_val + ',Description="">\n')
                        filt_f.write(l)
                        pass_f.write(l)
                    else:
                        ts = l.split('\t')
                        chrom, pos, alt = ts[0], ts[1], ts[4]
                        if (chrom, pos, alt) in mutations:
                            ts[6] = 'PASS'
                            filt_f.write('\t'.join(ts))
                            pass_f.write('\t'.join(ts))
                        else:
                            if ts[6] in ['', '.', 'PASS']:
                                ts[6] = ''
                                filter_value = variants.get((chrom, pos, alt))
                                if filter_value is None:
                                    ts[6] += 'vcf2txt'
                                elif filter_value == 'TRUE':
                                    ts[6] += 'vardict2mut'
                                else:
                                    ts[6] += filter_value
                            filt_f.write('\t'.join(ts))

        info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
             ': saved filtered VCFs to ' + ungz + ' and ' + var_sample.pass_filt_vcf_fpath)

    if False:  # TSV conversion is currently disabled
        info()
        info(var_sample.name + ((', ' + caller_name) if caller_name else '') + ': writing filtered TSVs')
        # Converting to TSV - saving .anno.filt.tsv
        if 'tsv_fields' in cnf.annotation and cnf.tsv:
            tmp_tsv_fpath = make_tsv(cnf, ungz, var_sample.name)
            if not tmp_tsv_fpath:
                err('TSV conversion didn\'t work')
            else:
                if isfile(var_sample.filt_tsv_fpath):
                    os.remove(var_sample.filt_tsv_fpath)
                shutil.copy(tmp_tsv_fpath, var_sample.filt_tsv_fpath)

        info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
             ': saved filtered TSV to ' + var_sample.filt_tsv_fpath)

    info('Done postprocessing filtered VCF.')
    return ungz
def downsample(cnf, sample_name, fastq_L_fpath, fastq_R_fpath, N, output_dir, suffix=None, quick=False):
    """ Get N random headers from a fastq file without reading the whole thing into memory.
        Modified from: http://www.biostars.org/p/6544/
        quick=True will just grab the first N reads rather than do a true downsampling.
    """
    sample_name = sample_name or splitext(''.join(
        lc if lc == rc else '' for lc, rc in izip(fastq_L_fpath, fastq_R_fpath)))[0]

    l_out_fpath = join(output_dir, add_suffix(basename(fastq_L_fpath), suffix or 'subset'))
    r_out_fpath = join(output_dir, add_suffix(basename(fastq_R_fpath), suffix or 'subset'))
    if cnf.reuse_intermediate and verify_file(l_out_fpath, silent=True) and verify_file(r_out_fpath, silent=True):
        info(l_out_fpath + ' and ' + r_out_fpath + ' exist, reusing.')
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    N = int(N)
    records_num = N
    if quick:
        rand_records = range(N)
    else:
        info(sample_name + ': getting number of reads in fastq...')
        records_num = sum(1 for _ in open_gzipsafe(fastq_L_fpath)) // 4
        if records_num > LIMIT:
            info(sample_name + ' the number of reads is higher than ' + str(LIMIT) +
                 ', sampling from only first ' + str(LIMIT))
            records_num = LIMIT
        info(sample_name + ': ' + str(records_num) + ' reads')
        if records_num < N:
            info(sample_name + ': and it is less than ' + str(N) + ', so no downsampling.')
            return fastq_L_fpath, fastq_R_fpath
        else:
            info(sample_name + ': downsampling to ' + str(N))
            rand_records = sorted(random.sample(xrange(records_num), N))

    info('Opening ' + fastq_L_fpath)
    fh1 = open_gzipsafe(fastq_L_fpath)
    info('Opening ' + fastq_R_fpath)
    fh2 = open_gzipsafe(fastq_R_fpath) if fastq_R_fpath else None

    out_files = (l_out_fpath, r_out_fpath) if r_out_fpath else l_out_fpath  # a bare path when single-end

    written_records = 0
    with file_transaction(cnf.work_dir, out_files) as tx_out_files:
        if isinstance(tx_out_files, basestring):
            tx_out_f1, tx_out_f2 = tx_out_files, None
        else:
            tx_out_f1, tx_out_f2 = tx_out_files
        info('Opening ' + str(tx_out_f1) + ' to write')
        sub1 = open_gzipsafe(tx_out_f1, "w")
        sub2 = None
        if tx_out_f2:
            info('Opening ' + str(tx_out_f2) + ' to write')
            sub2 = open_gzipsafe(tx_out_f2, "w")

        rec_no = -1
        for rr in rand_records:
            while rec_no < rr:
                rec_no += 1
                for i in range(4):
                    fh1.readline()
                if fh2:
                    for i in range(4):
                        fh2.readline()
            for i in range(4):
                sub1.write(fh1.readline())
                if sub2:
                    sub2.write(fh2.readline())
            written_records += 1
            rec_no += 1
            if written_records % 10000 == 0:
                info(sample_name + ': written ' + str(written_records) + ', rec_no ' + str(rec_no))
            if rec_no > records_num:
                info(sample_name + ' reached the limit of ' + str(records_num) + ' read lines, stopping.')
                break
        info(sample_name + ': done, written ' + str(written_records) + ', rec_no ' + str(rec_no))
        fh1.close()
        sub1.close()
        if fastq_R_fpath:
            fh2.close()
            sub2.close()

    info(sample_name + ': done downsampling, saved to ' + l_out_fpath + ' and ' + r_out_fpath +
         ', total ' + str(written_records) + ' paired reads written')
    return l_out_fpath, r_out_fpath
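# The core idea of downsample() above, as a tiny standalone sketch: pick N sorted
# record indices up front, then stream the FASTQ once, skipping 4-line records until
# the next chosen index. Only the index list is held in memory, never the reads.
# The file name and parameters are hypothetical.
def _downsample_sketch(fastq_fpath, records_num, n):
    import random
    rand_records = sorted(random.sample(range(records_num), n))
    kept = []
    with open(fastq_fpath) as fh:
        rec_no = -1  # index of the last record consumed from the stream
        for rr in rand_records:
            while rec_no < rr - 1:  # discard unwanted records
                rec_no += 1
                for _ in range(4):
                    fh.readline()
            kept.append([fh.readline() for _ in range(4)])  # keep record rr
            rec_no += 1
    return kept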
def read_samples_info_and_split(common_cnf, options, inputs):
    # TODO: _set_up_dirs(cnf) for each sample

    info('')
    info('Processing input details...')

    details = None
    for key in inputs:
        if options.get(key):
            common_cnf[key] = adjust_path(options[key])
            info('Using ' + common_cnf[key])
            details = [common_cnf]
    if not details:
        details = common_cnf.get('details')
    if not details:
        critical('Please provide input ' + ', '.join(inputs) + ' in the command line or in the run info yaml config.')

    all_samples = OrderedDict()

    for one_item_cnf in details:
        if 'vcf' not in one_item_cnf:
            critical('ERROR: A section in details does not contain the field "vcf".')
        one_item_cnf['vcf'] = adjust_path(one_item_cnf['vcf'])
        verify_file(one_item_cnf['vcf'], 'Input file', is_critical=True)

        join_parent_conf(one_item_cnf, common_cnf)

        work_vcf = join(one_item_cnf['work_dir'], basename(one_item_cnf['vcf']))
        check_file_changed(one_item_cnf, one_item_cnf['vcf'], work_vcf)
        if not one_item_cnf.get('reuse_intermediate'):
            with open_gzipsafe(one_item_cnf['vcf']) as inp, open_gzipsafe(work_vcf, 'w') as out:
                out.write(inp.read())
        one_item_cnf['vcf'] = work_vcf

        vcf_header_samples = read_sample_names_from_vcf(one_item_cnf['vcf'])

        # MULTIPLE SAMPLES
        if ('samples' in one_item_cnf or one_item_cnf.get('split_samples')) and len(vcf_header_samples) > 0:
            sample_cnfs = _verify_sample_info(one_item_cnf, vcf_header_samples)

            for header_sample_name in vcf_header_samples:
                if header_sample_name not in sample_cnfs:
                    sample_cnfs[header_sample_name] = one_item_cnf.copy()

                if header_sample_name in all_samples:
                    critical('ERROR: duplicated sample name: ' + header_sample_name)

                cnf = all_samples[header_sample_name] = sample_cnfs[header_sample_name]
                cnf['name'] = header_sample_name
                if cnf.get('keep_intermediate'):
                    cnf['log'] = join(cnf['work_dir'], cnf['name'] + '.log')

                # cnf['vcf'] = extract_sample(cnf, one_item_cnf['vcf'], cnf['name'])
                info()

        # SINGLE SAMPLE
        else:
            cnf = one_item_cnf

            if 'bam' in cnf:
                cnf['bam'] = adjust_path(cnf['bam'])
                verify_bam(cnf['bam'], is_critical=True)

            cnf['name'] = splitext_plus(basename(cnf['vcf']))[0]

            if cnf.get('keep_intermediate'):
                cnf['log'] = join(cnf['work_dir'], cnf['name'] + '.log')

            cnf['vcf'] = work_vcf
            all_samples[cnf['name']] = cnf

    if not all_samples:
        info('No samples.')
    else:
        info('Using samples: ' + ', '.join(all_samples) + '.')

    return all_samples
def make_report(cnf, vcf_fpath, sample):
    set_db_versions(cnf)

    step_greetings('Quality control reports')

    total_with_rejected = 0
    total = 0
    snps = 0
    inss = 0
    dels = 0
    dbsnps = 0
    cosmics = 0
    novels = 0
    hets = 0
    homs = 0
    transitions = 0
    transversions = 0

    with open_gzipsafe(vcf_fpath) as f:
        reader = vcf_parser.Reader(f)
        for rec in (vcf_processing.Record(rec, vcf_fpath, i) for i, rec in enumerate(reader)):
            total_with_rejected += 1

            if not rec.FILTER or rec.FILTER == 'PASS':
                if rec.FILTER:
                    warn('Warn: ' + rec.get_variant() + ' FILTER=' + str(rec.FILTER))
                total += 1

                if rec.is_snp:
                    snps += 1
                    if rec.is_transition:
                        transitions += 1
                    elif len(rec.ALT) == 1:
                        transversions += 1
                elif rec.is_indel:
                    if rec.is_deletion:
                        dels += 1
                    elif len(rec.ALT) == 1:
                        inss += 1

                if not rec.ID:
                    novels += 1
                else:
                    ids = rec.ID
                    if isinstance(ids, basestring):
                        ids = [ids]
                    if any(id.startswith('COS') for id in ids):
                        cosmics += 1
                    if any(id.startswith('rs') for id in ids):
                        dbsnps += 1

                call = rec.samples[0]
                if call.called:
                    if call.gt_type == 1:
                        hets += 1
                    elif call.gt_type == 2:
                        homs += 1

    report = SampleReport(sample, metric_storage=metric_storage)
    report.add_record('Total variants', total)
    report.add_record('SNPs', snps)
    report.add_record('Insertions', inss)
    report.add_record('Deletions', dels)
    report.add_record('Novel', novels)
    report.add_record('Novel, %', 1.0 * novels / total if total else None)
    report.add_record('In dbSNP', dbsnps)
    report.add_record('In dbSNP, %', 1.0 * dbsnps / total if total else None)
    report.add_record('In Cosmic', cosmics)
    report.add_record('In Cosmic, %', 1.0 * cosmics / total if total else None)
    report.add_record('Het/hom', float(hets) / homs if homs != 0 else None)
    report.add_record('Ti/tv', float(transitions) / transversions if transversions != 0 else None)
    report.add_record('Total with rejected', total_with_rejected)

    return report
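# For reference, the transition/transversion distinction counted above: transitions are
# purine<->purine or pyrimidine<->pyrimidine substitutions (A<->G, C<->T); everything
# else is a transversion. Minimal standalone sketch, not part of the pipeline.
def _is_transition_sketch(ref, alt):
    purines, pyrimidines = {'A', 'G'}, {'C', 'T'}
    return ({ref, alt} <= purines) or ({ref, alt} <= pyrimidines)

# _is_transition_sketch('A', 'G') -> True; _is_transition_sketch('A', 'C') -> False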
def convert_vardict_txts_to_bcbio_vcfs(cnf, bs, sample, output_dir=None, pass_only=False):
    info('')
    info('Preparing data for ' + sample.name)
    anno_filt_vcf_fpath = sample.find_filt_vcf_by_callername(cnf.caller_name)
    if not anno_filt_vcf_fpath:
        return None, None

    if not output_dir:
        output_dir = cnf.output_dir or os.path.dirname(anno_filt_vcf_fpath)
    output_vcf_fpath = join(output_dir, sample.name + '-' + cnf.caller_name + filt_vcf_ending)
    pass_output_vcf_fpath = add_suffix(output_vcf_fpath, 'pass')
    if cnf.reuse_intermediate and verify_vcf(output_vcf_fpath + '.gz') \
            and verify_vcf(pass_output_vcf_fpath + '.gz'):
        info(output_vcf_fpath + '.gz and ' + pass_output_vcf_fpath + '.gz exist, reusing')
        return output_vcf_fpath + '.gz', pass_output_vcf_fpath + '.gz'

    info('Parsing PASS and REJECT mutations...')
    pass_mut_dict, reject_mut_dict, filter_values = get_mutation_dicts(cnf, bs, sample, pass_only=pass_only)
    sorted_mut_dict = combine_mutations(pass_mut_dict, reject_mut_dict)

    info('')
    info('Writing VCFs')
    vcf_reader = vcf.Reader(open_gzipsafe(anno_filt_vcf_fpath, 'r'))
    vcf_reader = add_keys_to_header(vcf_reader, filter_values)
    with file_transaction(cnf.work_dir, output_vcf_fpath) as filt_tx, \
            file_transaction(cnf.work_dir, pass_output_vcf_fpath) as pass_tx:
        vcf_writer = None
        if not pass_only:
            vcf_writer = vcf.Writer(open(filt_tx, 'w'), template=vcf_reader)
        vcf_pass_writer = vcf.Writer(open(pass_tx, 'w'), template=vcf_reader)

        for key, mut in sorted_mut_dict.items():
            record = get_record_from_vcf(vcf_reader, mut)
            if record:
                if key in pass_mut_dict:
                    record.FILTER = ['PASS']
                    if mut.reason:
                        record.INFO['Reason'] = mut.reason.replace(' ', '_')
                elif pass_only:
                    continue
                elif key in reject_mut_dict:
                    if not mut.reason:
                        continue
                    reject_reason_ids = [filter_descriptions_dict[reason]
                                         if reason in filter_descriptions_dict else reason
                                         for reason in mut.reason.split(' and ')]
                    record.FILTER = [';'.join(reject_reason_ids)]
                if mut.signif:
                    record.INFO['Signif'] = mut.signif
                if mut.status:
                    record.INFO['Status'] = mut.status
                if vcf_writer:
                    vcf_writer.write_record(record)
                if key in pass_mut_dict:
                    vcf_pass_writer.write_record(record)
            else:
                warn('No record was found in ' + anno_filt_vcf_fpath + ' for mutation ' + str(mut))

    output_gzipped_vcf_fpath = None
    if vcf_writer:
        vcf_writer.close()
        output_gzipped_vcf_fpath = bgzip_and_tabix(cnf, output_vcf_fpath)
        info('VCF file for vardict.txt is saved to ' + output_gzipped_vcf_fpath)
    vcf_pass_writer.close()
    output_gzipped_pass_vcf_fpath = bgzip_and_tabix(cnf, pass_output_vcf_fpath)
    info('VCF file for vardict.PASS.txt is saved to ' + output_gzipped_pass_vcf_fpath)
    return output_gzipped_vcf_fpath, output_gzipped_pass_vcf_fpath