def filter_vcf(pileup, outfile=None, chr_col=0, ref_col=3, alt_col=4, sep='\t'):
    """ Removes lines where ALT==REF and chromosomes other than 1 - 22, X, Y and MT

    Lines starting with '#' are copied to the output unchanged. Any other
    line is split on ``sep`` and written out only when it has at least 8
    fields, its ALT field differs from its REF field, and
    ``fu.find_first_index(ACCEPTED_CHR, <chromosome>)`` is greater than -1.

    Parameters:
        pileup:   path of the input file to read.
        outfile:  path of the filtered output; defaults to ``pileup + '.filt'``.
                  ``fu.delete(outfile)`` is called before it is opened for writing.
        chr_col:  index of the chromosome field.
        ref_col:  index of the REF field.
        alt_col:  index of the ALT field.
        sep:      field separator.
    """
    # Both files are handled by context managers so they are flushed and
    # closed even if a line raises part-way through.
    with open(pileup, "r") as fh:
        if outfile is None:
            outfile = pileup + '.filt'
        fu.delete(outfile)
        with open(outfile, "w") as fh_out:
            for line in fh:
                line = line.strip()
                if line.startswith('#'):
                    # Header / meta line: pass through untouched.
                    fh_out.write(line + '\n')
                    continue
                fields = line.split(sep)
                if len(fields) < 8:
                    # Too few fields to be a data record: drop it.
                    continue
                chrom = fields[chr_col]
                ref = fields[ref_col]
                alt = fields[alt_col]
                # NOTE(review): find_first_index presumably returns -1 when
                # the chromosome is not in ACCEPTED_CHR - confirm in fu.
                if alt != ref and fu.find_first_index(ACCEPTED_CHR, chrom.strip()) > -1:
                    fh_out.write(line + '\n')
def filter_vcf(pileup, outfile=None, chr_col=0, ref_col=3, alt_col=4, sep='\t'):
    """ Removes lines where ALT==REF and chromosomes other than 1 - 22, X, Y and MT

    Lines starting with '#' are copied to the output unchanged. Any other
    line is split on ``sep`` and written out only when it has at least 8
    fields, its ALT field differs from its REF field, and
    ``fu.find_first_index(ACCEPTED_CHR, <chromosome>)`` is greater than -1.

    Parameters:
        pileup:   path of the input file to read.
        outfile:  path of the filtered output; defaults to ``pileup + '.filt'``.
                  ``fu.delete(outfile)`` is called before it is opened for writing.
        chr_col:  index of the chromosome field.
        ref_col:  index of the REF field.
        alt_col:  index of the ALT field.
        sep:      field separator.
    """
    # Context managers guarantee the input is released and the output is
    # flushed and closed when the function returns or raises.
    with open(pileup, "r") as fh:
        if outfile is None:
            outfile = pileup + '.filt'
        fu.delete(outfile)
        with open(outfile, "w") as fh_out:
            for line in fh:
                line = line.strip()
                if line.startswith('#'):
                    # Header / meta line: pass through untouched.
                    fh_out.write(line + '\n')
                    continue
                fields = line.split(sep)
                if len(fields) < 8:
                    # Too few fields to be a data record: drop it.
                    continue
                chrom = fields[chr_col]
                ref = fields[ref_col]
                alt = fields[alt_col]
                # NOTE(review): find_first_index presumably returns -1 when
                # the chromosome is not in ACCEPTED_CHR - confirm in fu.
                if alt != ref and fu.find_first_index(ACCEPTED_CHR, chrom.strip()) > -1:
                    fh_out.write(line + '\n')
def filter_pileup(pileup, outfile=None, chr_col=0, ref_col=2, alt_col=3, sep='\t'):
    """Filter a pileup file and write the surviving lines in VCF form.

    The output starts with ``vcfheader(pileup)``. Each input line is split
    on ``sep``; when its ALT field differs from its REF field and
    ``fu.find_first_index(ACCEPTED_CHR, <chromosome>)`` is greater than -1,
    the first nine fields are converted with ``varpileup_line2vcf_line`` and
    written out.

    Parameters:
        pileup:   path of the input pileup file.
        outfile:  path of the output; defaults to ``pileup + '.vcf'``.
                  ``fu.delete(outfile)`` is called before it is opened for writing.
        chr_col:  index of the chromosome field.
        ref_col:  index of the REF field.
        alt_col:  index of the ALT field.
        sep:      field separator.
    """
    # Context managers guarantee both files are closed (and the output
    # flushed) even when a conversion raises.
    with open(pileup, "r") as fh:
        if outfile is None:
            outfile = pileup + '.vcf'
        fu.delete(outfile)
        with open(outfile, "w") as fh_out:
            fh_out.write(vcfheader(pileup) + '\n')
            # Highest column the filter reads; shorter lines cannot be tested.
            last_needed = max(chr_col, ref_col, alt_col)
            for line in fh:
                fields = line.strip().split(sep)
                if len(fields) <= last_needed:
                    # Blank or truncated line: skip it instead of failing with
                    # IndexError, as filter_vcf does for short lines.
                    continue
                chrom = fields[chr_col]
                ref = fields[ref_col]
                alt = fields[alt_col]
                # NOTE(review): find_first_index presumably returns -1 when
                # the chromosome is not in ACCEPTED_CHR - confirm in fu.
                if alt != ref and fu.find_first_index(ACCEPTED_CHR, chrom.strip()) > -1:
                    fh_out.write(varpileup_line2vcf_line(fields[0:9]) + '\n')
def filter_vcf(pileup, outfile=None, chr_col=0, ref_col=3, alt_col=4, sep='\t'):
    """Copy the lines of ``pileup`` that pass the REF/ALT and chromosome filter.

    Lines starting with '#' are copied to the output unchanged. Any other
    line is split on ``sep`` and written out only when it has at least 8
    fields, its ALT field differs from its REF field, and
    ``fu.find_first_index(ACCEPTED_CHR, <chromosome>)`` is greater than -1.

    Parameters:
        pileup:   path of the input file to read.
        outfile:  path of the filtered output; defaults to ``pileup + '.filt'``.
                  ``fu.delete(outfile)`` is called before it is opened for writing.
        chr_col:  index of the chromosome field.
        ref_col:  index of the REF field.
        alt_col:  index of the ALT field.
        sep:      field separator.
    """
    # Context managers guarantee the input is released and the output is
    # flushed and closed when the function returns or raises.
    with open(pileup, "r") as fh:
        if outfile is None:
            outfile = pileup + '.filt'
        fu.delete(outfile)
        with open(outfile, "w") as fh_out:
            for line in fh:
                line = line.strip()
                if line.startswith('#'):
                    # Header / meta line: pass through untouched.
                    fh_out.write(line + '\n')
                    continue
                fields = line.split(sep)
                if len(fields) < 8:
                    # Too few fields to be a data record: drop it.
                    continue
                chrom = fields[chr_col]
                ref = fields[ref_col]
                alt = fields[alt_col]
                # NOTE(review): find_first_index presumably returns -1 when
                # the chromosome is not in ACCEPTED_CHR - confirm in fu.
                if alt != ref and fu.find_first_index(ACCEPTED_CHR, chrom.strip()) > -1:
                    fh_out.write(line + '\n')
### EOF
def filter_pileup(pileup, outfile=None, chr_col=0, ref_col=2, alt_col=3, sep='\t'):
    """Filter a pileup file and write the surviving lines in VCF form.

    The output starts with ``vcfheader(pileup)``. Each input line is split
    on ``sep``; when its ALT field differs from its REF field and
    ``fu.find_first_index(ACCEPTED_CHR, <chromosome>)`` is greater than -1,
    the first nine fields are converted with ``varpileup_line2vcf_line`` and
    written out.

    Parameters:
        pileup:   path of the input pileup file.
        outfile:  path of the output; defaults to ``pileup + '.vcf'``.
                  ``fu.delete(outfile)`` is called before it is opened for writing.
        chr_col:  index of the chromosome field.
        ref_col:  index of the REF field.
        alt_col:  index of the ALT field.
        sep:      field separator.
    """
    # Context managers guarantee both files are closed (and the output
    # flushed) even when a conversion raises.
    with open(pileup, "r") as fh:
        if outfile is None:
            outfile = pileup + '.vcf'
        fu.delete(outfile)
        with open(outfile, "w") as fh_out:
            fh_out.write(vcfheader(pileup) + '\n')
            # Highest column the filter reads; shorter lines cannot be tested.
            last_needed = max(chr_col, ref_col, alt_col)
            for line in fh:
                fields = line.strip().split(sep)
                if len(fields) <= last_needed:
                    # Blank or truncated line: skip it instead of failing with
                    # IndexError, as filter_vcf does for short lines.
                    continue
                chrom = fields[chr_col]
                ref = fields[ref_col]
                alt = fields[alt_col]
                # NOTE(review): find_first_index presumably returns -1 when
                # the chromosome is not in ACCEPTED_CHR - confirm in fu.
                if alt != ref and fu.find_first_index(ACCEPTED_CHR, chrom.strip()) > -1:
                    fh_out.write(varpileup_line2vcf_line(fields[0:9]) + '\n')
def run(infile, format):
    """Run the chain of ``ann`` annotation stages over ``infile``.

    Every stage reads ``infile + tmpextin`` and writes ``infile + tmpextout``.
    The first stage reads ``infile`` itself and writes ``infile + '.1'``;
    each later stage reads the previous stage's numbered output and writes
    the next number. Afterwards all intermediate numbered files are deleted
    and the last one is renamed to ``infile + '.annot'``, then to the same
    name with ``'.vcf.annot'`` replaced by ``'.annot.vcf'``.

    Parameters:
        infile: path of the file to annotate.
        format: accepted but not read here; the stages are called with
                ``format='vcf'`` (the final stage is called without it).
    """
    print("Running . . .")

    # The dbSNP stage is the only one whose input has no numeric suffix.
    ann.getSnpsFromDbSnp(vcf=infile, format='vcf', tmpextin='', tmpextout='.1')

    # (method name on ``ann``, stage-specific keyword arguments), in run order.
    stages = (
        ('getBigRefGene', {'format': 'vcf'}),
        ('getGenes', {'format': 'vcf', 'table': 'refGene', 'promoter_offset': 500}),
        ('addOverlapWithCytoband', {'format': 'vcf', 'table': 'cytoBand'}),
        ('addOverlapWithGadAll', {'format': 'vcf', 'table': 'gadAll'}),
        ('addOverlapWithGwasCatalog', {'format': 'vcf', 'table': 'gwasCatalog'}),
        ('addOverlapWithMiRNA', {'format': 'vcf', 'table': 'targetScanS'}),
        ('addOverlapWitHUGOGeneNomenclature', {'format': 'vcf', 'table': 'hugo'}),
        ('addOverlapWithCnvDatabase', {'format': 'vcf', 'table': 'dgv_Cnv'}),
        ('addOverlapWithCnvDatabase', {'format': 'vcf', 'table': 'abParts_IG_T_CelReceptors'}),
        ('addOverlapWithCnvDatabase', {'format': 'vcf', 'table': 'mcCarroll_Cnv'}),
        ('addOverlapWithCnvDatabase', {'format': 'vcf', 'table': 'conrad_Cnv'}),
        ('addOverlapWithGenomicSuperDups', {'format': 'vcf', 'table': 'genomicSuperDups'}),
        ('addOverlapWithTfbsConsSites', {'table': 'tfbsConsSites'}),
    )

    # ``latest`` is the numeric suffix of the most recently written file.
    latest = 1
    for method_name, options in stages:
        # Looked up at call time so a missing method only surfaces when
        # its stage is reached.
        stage = getattr(ann, method_name)
        stage(vcf=infile,
              tmpextin='.' + str(latest),
              tmpextout='.' + str(latest + 1),
              **options)
        latest += 1

    ## Cleanup
    # Remove every intermediate file except the final stage's output.
    for suffix in range(1, latest):
        fu.delete(infile + '.' + str(suffix))

    annotated = infile + '.annot'
    os.rename(infile + '.' + str(latest), annotated)
    finalout = annotated.replace('.vcf.annot', '.annot.vcf')
    os.rename(annotated, finalout)