def parse_vcf(vcffile, logfh):
    """Parse sample VCF file."""
    logfh.write("Parsing vcffile: {} -".format(vcffile))
    fh = open_file(vcffile)
    lines = fh.readlines()
    fh.close()
    while lines:  # skip vcf header
        if not lines[0].startswith('##'):
            break
        lines.pop(0)
    fields = lines.pop(0).lstrip('#').rstrip().split("\t")
    variants = {}
    for line in lines:
        vals = line.rstrip().split("\t")
        d = dict(zip(fields, vals))
        if d['QUAL'][0].isdigit():
            pos = int(d['POS'])
            variants[pos] = d
    logfh.write(" {} variants\n".format(len(variants)))
    # special case for counting (TG)9-9T
    # if 87842 in variants and 87823 not in variants:
    #     variants[87823] = {'CHROM':'CFTR', 'POS':'87823',
    #                        'REF':'ATG', 'ALT':'A' }
    # if 87844 in variants and 87845 not in variants:
    #     variants[87845] = {'CHROM':'CFTR', 'POS':'87845',
    #                        'REF':'G', 'ALT':'T' }
    # special case for counting (TG)13-5T
    # if 87846 in variants and 87823 not in variants:
    #     variants[87823] = {'CHROM':'CFTR', 'POS':'87823',
    #                        'REF':'A', 'ALT':'ATG' }
    # if 87847 in variants and variants[87847]['REF']=='T' and\
    #    variants[87847]['ALT']=='TG':
    #     variants[87847]['ALT'] += ',G'
    return (fields, variants)
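
# Usage sketch (file name below is hypothetical): the returned dict is keyed
# by integer POS, each value a dict of the VCF columns for that row.
#
#   fields, variants = parse_vcf("sample01.vcf", sys.stderr)
#   for pos in sorted(variants):
#       v = variants[pos]
#       print(pos, v['REF'], v['ALT'], v['QUAL'])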
def separate_vcf(vcffile, ref, outdir, args):
    """Split a joint VCF into per-sample VCFs via run_select_variants."""
    sys.stderr.write("\nReading {}\n".format(vcffile))
    fh = open_file(vcffile)
    fieldlist = [l for l in fh.readlines() if l.startswith('#CHROM')]
    fh.close()
    if fieldlist:
        fields = fieldlist[0].rstrip().split("\t")
        samples = fields[9:]
    else:
        sys.stderr.write("Could not find VCF header in {}\n".format(vcffile))
        sys.exit(1)
    sys.stderr.write(" Found {} samples.\n".format(len(samples)))
    outvcfs = []
    logfh = sys.stderr
    if args.logdir:
        logfile = os.path.basename(vcffile).replace('.vcf', '') + \
                  ".separate_vcf.log"
        logfile = os.path.join(args.logdir, logfile)
        logfh = open(logfile, 'w')
    for sample in samples:
        outvcf = "{}.separated.vcf".format(sample)
        if outdir:
            outvcf = os.path.join(outdir, outvcf)
        run_select_variants(vcffile, outvcf, sample, ref, logfh, args)
    if args.logdir:
        logfh.close()
    return samples
def parse_vcf(vcffile):
    """Parse variants in joint vcf file."""
    sys.stderr.write("VCF file {} -".format(vcffile))
    fh = open_file(vcffile)
    lines = fh.readlines()
    fh.close()
    while lines:  # skip vcf header
        if not lines[0].startswith('##'):
            break
        lines.pop(0)
    fields = lines.pop(0).lstrip('#').rstrip().split("\t")
    variants = []
    for line in lines:
        vals = line.rstrip().split("\t")
        d = dict(zip(fields, vals))
        variants.append(d)
    sys.stderr.write(" {} variants\n".format(len(variants)))
    return (fields, variants)
def parse_vcf(vcffile):
    """Parse VCF file; return header lines, fields, and variants keyed by
    position (via create_pos_key)."""
    fh = open_file(vcffile)
    lines = fh.readlines()
    fh.close()
    header = []
    while lines:  # vcf header
        if lines[0].startswith('#'):
            header.append(lines.pop(0))
        else:
            break
    fields = header[-1].lstrip('#').rstrip().split("\t")
    vcfinfo = {}
    for line in lines:
        vals = line.rstrip().split("\t")
        d = dict(zip(fields, vals))
        poskey = create_pos_key(d['CHROM'], d['POS'])
        if poskey in vcfinfo:
            sys.stderr.write(" Variant pos {} duplicated\n".format(poskey))
        vcfinfo[poskey] = d
    return (header, fields, vcfinfo)
def parse_bedfile(bedfile):
    """Parse ROI from bed file."""
    sys.stderr.write("\nReading {}\n".format(bedfile))
    fh = open_file(bedfile)
    lines = fh.readlines()
    fh.close()
    roi = defaultdict(dict)
    fields = ['CHROM', 'START', 'END', 'NAME']
    numlines = 0
    for line in lines:
        vals = line.rstrip().split("\t")
        d = dict(zip(fields, vals))
        d['START'] = int(d['START']) + 1  # convert to 1-based
        d['END'] = int(d['END'])
        roi[d['CHROM']][d['START']] = d
        numlines += 1
    sys.stderr.write(" Got {} lines in {} chrom\n".format(numlines,
                                                          len(roi.keys())))
    return roi
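
# Usage sketch (hypothetical file name): parse_bedfile returns a nested dict
# keyed by chromosome, then by 1-based start position.
#
#   roi = parse_bedfile("targets.bed")
#   for chrom in roi:
#       for start in sorted(roi[chrom]):
#           d = roi[chrom][start]  # {'CHROM', 'START', 'END', 'NAME'}
#           print(chrom, start, d['END'], d['NAME'])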
def trim_primers(fqfile, alignout, max_trim_len, primerinfo, outlabel,
                 logfh, args):
    """Returns trimmed fastq file and file with list of sequence names"""
    trimmedfq = outlabel + ".trimmed.fastq"
    seqfile = outlabel + ".seqlist.txt"
    logfh.write(" Trimming fq: {}\n".format(trimmedfq))
    if have_files([trimmedfq, seqfile], args.force, stderr=logfh):
        logfh.write(" Already have {}\n".format(trimmedfq))
        return (trimmedfq, seqfile)
    aligns = parse_alignout(alignout)
    seqlist = []
    with open(trimmedfq, 'w') as outfq:
        inseq = FastQParser(fqfile)
        for seqrec in inseq:
            seqlist.append(seqrec.id)
            if seqrec.id in aligns:
                primer = aligns[seqrec.id]['primer']
                if primerinfo[primer]['overlap']:
                    primerend = aligns[seqrec.id]['end'] + \
                                aligns[seqrec.id]['left']
                    subrec = seqrec[primerend:]
                    if args.debug:
                        logfh.write("{}\tTrimming\t{}\n".format(
                            primer, seqrec.id))
                else:
                    if args.debug:
                        logfh.write("{}\tNot trimming\t{}\n".format(
                            primer, seqrec.id))
                    subrec = seqrec
            else:  # trim default max_primer_len+2
                subrec = seqrec[max_trim_len:]
            outfq.write("{}\n".format(subrec.fastq()))
    logfh.write(" Seq list: {}\n".format(seqfile))
    if have_file(seqfile, True, stderr=logfh):
        logfh.write(" Still have {}\n".format(seqfile))
        sys.exit()
    with open_file(seqfile, 'w') as ifh:
        ifh.write("\n".join(seqlist) + "\n")
    return (trimmedfq, seqfile)
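
# Call sketch: aligns (from parse_alignout) is expected to be keyed by read id
# with 'primer', 'end', and 'left' entries, and primerinfo keyed by primer name
# with an 'overlap' flag. The argument values below are hypothetical.
#
#   trimmedfq, seqfile = trim_primers("sample01.fastq", "sample01.align.txt",
#                                     32, primerinfo, "sample01",
#                                     sys.stderr, args)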