def __init__(self, fileID, path, build, attributes=None):
    """Record a data file's identity and header info, and index its attributes.

    fileID     -- identifier stored unchanged on the instance
    path       -- path of the file; its lower-cased extension becomes
                  ``self.format`` and selects the header parser
    build      -- genome build, stored unchanged on the instance
    attributes -- optional list of objects exposing ``attributeID``,
                  ``hardFilter`` and ``forceCategorical``; when omitted a
                  fresh empty list is used

    Raises Exception for ".bed" and ".gff3" (not supported yet) and for any
    other extension that is not ".vcf" or ".csv".
    """
    # A literal ``[]`` default would be created once and then shared by every
    # instance constructed without ``attributes`` (it is stored on ``self``).
    if attributes is None:
        attributes = []

    self.fileID = fileID
    self.path = path
    self.format = os.path.splitext(path)[1].lower()

    if self.format == ".vcf":
        self.fileAttributes = variantFile.extractVcfFileInfo(path)
    elif self.format == ".csv":
        self.fileAttributes = variantFile.extractCsvFileInfo(path)
    elif self.format == ".bed":
        # Placeholder until a bed header parser is wired in.
        raise Exception('bed not supported yet')
    elif self.format == ".gff3":
        raise Exception('gff3 not supported yet')
    else:
        raise Exception("%s format not supported" % self.format)

    self.build = build
    self.attributes = attributes

    # Per-attribute lookups keyed by attributeID.
    self.hardFilters = {}
    self.forcedCategoricals = set()
    for a in self.attributes:
        self.hardFilters[a.attributeID] = a.hardFilter
        if a.forceCategorical:
            self.forcedCategoricals.add(a.attributeID)
def runApp(mask="",m="",vcf="",v="",out="",o="",csv="",c=""): print "loading features", params = featureLoadingParameters(build=genomeUtils.hg19, passFunction=None, rejectFunction=None, callbackArgs={}, tickFunction=tick, tickInterval=10, mask=None, attributesToInclude={}, returnFileObject=True) features = featureFile.parseBedFile(mask, params) print "" variants = {} fileAttributes = variantFile.extractVcfFileInfo(vcf) def addVariant(v): variants[v.genomePosition] = v params = variantLoadingParameters(build=genomeUtils.hg19, passFunction=addVariant, rejectFunction=None, callbackArgs={}, tickFunction=tick, tickInterval=10, individualsToInclude=None, individualAppendString="", lociToInclude=None, mask=None, #features.regions, invertMask=False, #True attributesToInclude=None, attributeAppendString="", skipGenotypeAttributes=True, # TODO: there's actually a bug here (you should turn this back off returnFileObject=False, alleleMatching=allele.UNENFORCED, attemptRepairsWhenComparing=True) print "parsing vcf file", variantFile.parseVcfFile(vcf, params) print "" # TODO: throw this bit out print "swapping attributes" for v in variants.itervalues(): newName = v.attributes.get('RSID',v.name) if v.attributes.has_key('RSID'): newName = v.attributes['RSID'] del v.attributes['RSID'] else: newName = v.name if isinstance(newName,list): newName = newName[1] #if newName.startswith('dbsnp'): # newName = newName.split(':')[1] if newName == '.': newName = v.basicName v.name = newName print "calculating frequencies", tickInterval = len(variants) / 10 i = 0 nextTick = tickInterval groupDict = { "CASES3":set(["T2DG0300147", "T2DG0300160", "T2DG0300135", "T2DG0300133", "T2DG0300143"]), "CASES6":set(["T2DG0600449", "T2DG0600426", "T2DG0600428", "T2DG0600470", "T2DG0600442", "T2DG0600431"]), "CASES20":set(["T2DG2000900", "T2DG2000901", "T2DG2000904", "T2DG2000928"]), "CASES21":set(["T2DG2100946", "T2DG2100955", "T2DG2100967", "T2DG2100966"]), "CONTROLS_IND_EXTREME":set(["T2DG0200013", 
"T2DG0200027", "T2DG0500309", "T2DG0701163", "T2DG0701156", "T2DG0800488", "T2DG0901234", "T2DG1000568", "T2DG1000570", "T2DG1101324", "T2DG1600768", "T2DG2701070", "T2DG4701118", "T2DG0200068", "T2DG0200071", "T2DG0400250", "T2DG0500349", "T2DG0500339", "T2DG0701179", "T2DG0800504", "T2DG0800564", "T2DG0901289", "T2DG0901275", "T2DG1000595", "T2DG1101348", "T2DG1600785", "T2DG1700875", "T2DG2701094", "T2DG4701124"]), "CONTROL_Normal_nonind":set(["T2DG0200098", "T2DG0200076", "T2DG0200104", "T2DG0200065", "T2DG0400287", "T2DG0400260", "T2DG0400280", "T2DG0400288", "T2DG0400267", "T2DG0400269", "T2DG0400279", "T2DG0400256", "T2DG0400273", "T2DG0500370", "T2DG0500367", "T2DG0500353", "T2DG0500383", "T2DG0500362", "T2DG0500364", "T2DG0500379", "T2DG0500351", "T2DG0500381", "T2DG0500385", "T2DG0500357", "T2DG0701225", "T2DG0701227", "T2DG0701194", "T2DG0701196", "T2DG0701195", "T2DG0701211", "T2DG0701222", "T2DG0701220", "T2DG0701204", "T2DG0701208", "T2DG0701192", "T2DG0701191", "T2DG0701198", "T2DG0701181", "T2DG0701174", "T2DG0800561", "T2DG0800542", "T2DG0800563", "T2DG0800547", "T2DG0800559", "T2DG0901306", "T2DG0901296", "T2DG0901285", "T2DG0901284", "T2DG0901269", "T2DG0901295", "T2DG0901299", "T2DG0901279", "T2DG0901307", "T2DG1000637", "T2DG1000629", "T2DG1000630", "T2DG1000614", "T2DG1000612", "T2DG1000613", "T2DG1000631", "T2DG1000591", "T2DG1000620", "T2DG1000640", "T2DG1000606", "T2DG1000636", "T2DG1000638", "T2DG1000627", "T2DG1101369", "T2DG1101381", "T2DG1101383", "T2DG1101377", "T2DG1101354", "T2DG1101356", "T2DG1600793", "T2DG1600811", "T2DG1600816", "T2DG1600810", "T2DG1700872", "T2DG1700869", "T2DG1700876", "T2DG1700867", "T2DG2701096", "T2DG2701093", "T2DG4701139", "T2DG4701129"]), "CONTROLS_PreHypertension_ind":set(["T2DG0200073", "T2DG0200031", "T2DG0200040", "T2DG0200032", "T2DG0200042", "T2DG0200047", "T2DG0200070", "T2DG0200078", "T2DG0200096", "T2DG0200063", "T2DG0200077", "T2DG0200057", "T2DG0200086", "T2DG0200041", "T2DG0400234", 
"T2DG0400243", "T2DG0400257", "T2DG0400261", "T2DG0400264", "T2DG0400241", "T2DG0400237", "T2DG0400238", "T2DG0400258", "T2DG0400295", "T2DG0400254", "T2DG0400262", "T2DG0500334", "T2DG0500346", "T2DG0500358", "T2DG0500371", "T2DG0500389", "T2DG0500332", "T2DG0500352", "T2DG0500375", "T2DG0500347", "T2DG0500388", "T2DG0500373", "T2DG0701188", "T2DG0701219", "T2DG0701203", "T2DG0701199", "T2DG0701216", "T2DG0701217", "T2DG0701214", "T2DG0800541", "T2DG0800520", "T2DG0800552", "T2DG0800529", "T2DG0800514", "T2DG0800502", "T2DG0800505", "T2DG0800509", "T2DG0901305", "T2DG0901287", "T2DG0901267", "T2DG0901271", "T2DG0901270", "T2DG0901308", "T2DG0901288", "T2DG0901298", "T2DG0901278", "T2DG0901272", "T2DG0901263", "T2DG1000599", "T2DG1000618", "T2DG1000597", "T2DG1000611", "T2DG1000616", "T2DG1000592", "T2DG1000598", "T2DG1000604", "T2DG1000642", "T2DG1000639", "T2DG1000607", "T2DG1101365", "T2DG1101366", "T2DG1101382", "T2DG1101388", "T2DG1101389", "T2DG1101372", "T2DG1101384", "T2DG1101385", "T2DG1101338", "T2DG1101341", "T2DG1101343", "T2DG1101387", "T2DG1101390", "T2DG1101344", "T2DG1101342", "T2DG1600812", "T2DG1600819", "T2DG1600805", "T2DG1600804", "T2DG1600807", "T2DG1600799", "T2DG1700861", "T2DG1700846", "T2DG1700854", "T2DG1700853", "T2DG1700868", "T2DG1700870", "T2DG2701110", "T2DG2701085", "T2DG2701088", "T2DG2701111", "T2DG2701107", "T2DG2701091", "T2DG4701130", "T2DG4701122", "T2DG4701133", "T2DG4701127", "T2DG4701128", "T2DG0200006", "T2DG0200008", "T2DG0200012", "T2DG0200009", "T2DG0200018", "T2DG0200023", "T2DG0200007", "T2DG0400219", "T2DG0500318", "T2DG0500327", "T2DG0500310", "T2DG0500312", "T2DG0500313", "T2DG0701164", "T2DG0701143", "T2DG0800490", "T2DG0800498", "T2DG0901251", "T2DG1000567", "T2DG1000582", "T2DG1000586", "T2DG1000565", "T2DG1000566", "T2DG1000569", "T2DG1101320", "T2DG1101330", "T2DG1600767", "T2DG1600773", "T2DG1600778", "T2DG1600771", "T2DG1700824", "T2DG1700836", "T2DG2701073", "T2DG2701079"]), 
"CONTROLS_ALL":set(["T2DG0200013", "T2DG0200027", "T2DG0500309", "T2DG0701163", "T2DG0701156", "T2DG0800488", "T2DG0901234", "T2DG1000568", "T2DG1000570", "T2DG1101324", "T2DG1600768", "T2DG2701070", "T2DG4701118", "T2DG0200068", "T2DG0200071", "T2DG0400250", "T2DG0500349", "T2DG0500339", "T2DG0701179", "T2DG0800504", "T2DG0800564", "T2DG0901289", "T2DG0901275", "T2DG1000595", "T2DG1101348", "T2DG1600785", "T2DG1700875", "T2DG2701094", "T2DG4701124", "T2DG0200098", "T2DG0200076", "T2DG0200104", "T2DG0200065", "T2DG0400287", "T2DG0400260", "T2DG0400280", "T2DG0400288", "T2DG0400267", "T2DG0400269", "T2DG0400279", "T2DG0400256", "T2DG0400273", "T2DG0500370", "T2DG0500367", "T2DG0500353", "T2DG0500383", "T2DG0500362", "T2DG0500364", "T2DG0500379", "T2DG0500351", "T2DG0500381", "T2DG0500385", "T2DG0500357", "T2DG0701225", "T2DG0701227", "T2DG0701194", "T2DG0701196", "T2DG0701195", "T2DG0701211", "T2DG0701222", "T2DG0701220", "T2DG0701204", "T2DG0701208", "T2DG0701192", "T2DG0701191", "T2DG0701198", "T2DG0701181", "T2DG0701174", "T2DG0800561", "T2DG0800542", "T2DG0800563", "T2DG0800547", "T2DG0800559", "T2DG0901306", "T2DG0901296", "T2DG0901285", "T2DG0901284", "T2DG0901269", "T2DG0901295", "T2DG0901299", "T2DG0901279", "T2DG0901307", "T2DG1000637", "T2DG1000629", "T2DG1000630", "T2DG1000614", "T2DG1000612", "T2DG1000613", "T2DG1000631", "T2DG1000591", "T2DG1000620", "T2DG1000640", "T2DG1000606", "T2DG1000636", "T2DG1000638", "T2DG1000627", "T2DG1101369", "T2DG1101381", "T2DG1101383", "T2DG1101377", "T2DG1101354", "T2DG1101356", "T2DG1600793", "T2DG1600811", "T2DG1600816", "T2DG1600810", "T2DG1700872", "T2DG1700869", "T2DG1700876", "T2DG1700867", "T2DG2701096", "T2DG2701093", "T2DG4701139", "T2DG4701129", "T2DG0200073", "T2DG0200031", "T2DG0200040", "T2DG0200032", "T2DG0200042", "T2DG0200047", "T2DG0200070", "T2DG0200078", "T2DG0200096", "T2DG0200063", "T2DG0200077", "T2DG0200057", "T2DG0200086", "T2DG0200041", "T2DG0400234", "T2DG0400243", "T2DG0400257", "T2DG0400261", 
"T2DG0400264", "T2DG0400241", "T2DG0400237", "T2DG0400238", "T2DG0400258", "T2DG0400295", "T2DG0400254", "T2DG0400262", "T2DG0500334", "T2DG0500346", "T2DG0500358", "T2DG0500371", "T2DG0500389", "T2DG0500332", "T2DG0500352", "T2DG0500375", "T2DG0500347", "T2DG0500388", "T2DG0500373", "T2DG0701188", "T2DG0701219", "T2DG0701203", "T2DG0701199", "T2DG0701216", "T2DG0701217", "T2DG0701214", "T2DG0800541", "T2DG0800520", "T2DG0800552", "T2DG0800529", "T2DG0800514", "T2DG0800502", "T2DG0800505", "T2DG0800509", "T2DG0901305", "T2DG0901287", "T2DG0901267", "T2DG0901271", "T2DG0901270", "T2DG0901308", "T2DG0901288", "T2DG0901298", "T2DG0901278", "T2DG0901272", "T2DG0901263", "T2DG1000599", "T2DG1000618", "T2DG1000597", "T2DG1000611", "T2DG1000616", "T2DG1000592", "T2DG1000598", "T2DG1000604", "T2DG1000642", "T2DG1000639", "T2DG1000607", "T2DG1101365", "T2DG1101366", "T2DG1101382", "T2DG1101388", "T2DG1101389", "T2DG1101372", "T2DG1101384", "T2DG1101385", "T2DG1101338", "T2DG1101341", "T2DG1101343", "T2DG1101387", "T2DG1101390", "T2DG1101344", "T2DG1101342", "T2DG1600812", "T2DG1600819", "T2DG1600805", "T2DG1600804", "T2DG1600807", "T2DG1600799", "T2DG1700861", "T2DG1700846", "T2DG1700854", "T2DG1700853", "T2DG1700868", "T2DG1700870", "T2DG2701110", "T2DG2701085", "T2DG2701088", "T2DG2701111", "T2DG2701107", "T2DG2701091", "T2DG4701130", "T2DG4701122", "T2DG4701133", "T2DG4701127", "T2DG4701128", "T2DG0200006", "T2DG0200008", "T2DG0200012", "T2DG0200009", "T2DG0200018", "T2DG0200023", "T2DG0200007", "T2DG0400219", "T2DG0500318", "T2DG0500327", "T2DG0500310", "T2DG0500312", "T2DG0500313", "T2DG0701164", "T2DG0701143", "T2DG0800490", "T2DG0800498", "T2DG0901251", "T2DG1000567", "T2DG1000582", "T2DG1000586", "T2DG1000565", "T2DG1000566", "T2DG1000569", "T2DG1101320", "T2DG1101330", "T2DG1600767", "T2DG1600773", "T2DG1600778", "T2DG1600771", "T2DG1700824", "T2DG1700836", "T2DG2701073", "T2DG2701079"])} for v in variants.itervalues(): 
performGroupCalculations(v,groupDict,"CONTROLS_IND_EXTREME",mode=-1) i += 1 if i > nextTick: nextTick += tickInterval print ".", print "" del fileAttributes['INDIVIDUALS'] del fileAttributes['INFO']['RSID'] for k in groupDict.iterkeys(): fileAttributes['INFO'][k] = {"ID":k,"Number":1,"Type":"Float","Description":"%s Allele frequency"%k} print "loading .csv file", acceptAllFilter = valueFilter() csvDict = {"SIFT_score":acceptAllFilter, "LRT_score":acceptAllFilter, "MutationTaster_pred":acceptAllFilter, "phyloP":acceptAllFilter, "SLR_test_statistic":acceptAllFilter, "LRT_pred":acceptAllFilter, "MutationTaster_score":acceptAllFilter, "MutationTaster_pred":acceptAllFilter, "GERP++_NR":acceptAllFilter, "GERP++_RS":acceptAllFilter, "phyloP":acceptAllFilter, "29way_logOdds":acceptAllFilter, "LRT_Omega":acceptAllFilter, "1000Gp1_AF":acceptAllFilter} for k in csvDict.iterkeys(): fileAttributes['INFO'][k] = {"ID":k,"Number":1,"Type":"Float","Description":k} def tryRepair(v): if variants.has_key(v.genomePosition): for key,value in v.attributes.iteritems(): variants[v.genomePosition].setAttribute(key,value) params = variantLoadingParameters(build=genomeUtils.hg19, passFunction=tryRepair, rejectFunction=None, callbackArgs={}, tickFunction=tick, tickInterval=10, individualsToInclude=None, individualAppendString="", lociToInclude=None, mask=None, #features.regions, invertMask=False, #True, attributesToInclude=csvDict, attributeAppendString="", skipGenotypeAttributes=True, returnFileObject=False, alleleMatching=allele.UNENFORCED, attemptRepairsWhenComparing=True) variantFile.parseCsvFile(csv,params) print "" print "writing file..." temp = variantFile(fileAttributes) temp.variants = variants.values() temp.writeVcfFile(out, sortMethod="NUMXYM", includeScriptLine=True) print "" print "done"
def runApp(loci="",l="",vcf="",v="",individuals="",i="",out="",o="",remove="",r=""): lociToKeep = set() def tick(): print ".", def add(v): if lociToKeep != None: lociToKeep.add(v) if individuals != None: print "Parsing individual list..." individualList = [] infile = open(individuals,'r') for line in infile: individualList.append(line.strip()) infile.close() else: individualList = variantFile.extractVcfFileInfo(vcf)["INDIVIDUALS"] individualList.sort() if loci != None: print "Parsing loci list..." firstLine = True infile = open(loci,'r') for line in infile: if firstLine: firstLine = False continue columns = line.split() lociToKeep.add(variant(chromosome=columns[0], position=columns[1], matchMode=allele.UNENFORCED, attemptRepairsWhenComparing=True, ref=".*", alt=".*", name=columns[2], build=genomeUtils.hg19, attributeFilters=None)) infile.close() else: lociToKeep = None if not isinstance(vcf,list): vcf = [vcf] for infile in vcf: print "Parsing %s" % infile # we only care about genotypes; we throw out all other details. parseParameters = variantLoadingParameters( passFunction=add, tickFunction=tick, tickInterval=5, individualsToInclude=individualList, alleleMatching = allele.UNENFORCED, lociToInclude=lociToKeep, attributesToInclude={}, skipGenotypeAttributes=True) if lociToKeep == None: parseParameters.returnFileObject = True resultFile = variantFile.parseVcfFile(infile,parseParameters) print "" print "Writing results..." 
outfile = open(out,'w') outfile.write("Chromosome\tPosition\tRs#\tMajor\tMinor") for i in individualList: outfile.write("\t%s" % i) outfile.write("\n") if remove != None: removeFile = open(remove,'w') removeFile.write("Chromosome\tPosition\tRs#\tReason\n") if lociToKeep != None: lociToKeep = sorted(lociToKeep, key=lambda v: v.name) else: lociToKeep = sorted(resultFile.variants, key=lambda v: v.name) for v in lociToKeep: if len(v.genotypes) == 0: if remove != None: removeFile.write("%s\t%i\t%s\tNo Genotypes\n" % (v.chromosome,v.position,v.name)) elif len(v.alleles) < 2: if remove != None: removeFile.write("%s\t%i\t%s\tNot enough alleles" % (v.chromosome,v.position,v.name)) else: outfile.write("%s\t%i\t%s\t%s\t%s" % (v.chromosome,v.position,v.name,v.alleles[0],v.alleles[1])) for i in individualList: a1 = "_" a2 = "_" if v.genotypes.has_key(i): g = v.genotypes[i] if g.allele1 != None: a1 = str(g.allele1) if g.allele2 != None: a2 = str(g.allele2) outfile.write("\t%s%s" % (a1,a2)) outfile.write("\n") outfile.close() if remove != None: removeFile.close() print "Done."