def runApp(mask="",m="",vcf="",v="",out="",o=""):
    print "loading features",
    params = featureLoadingParameters(build=genomeUtils.hg19,
                                     passFunction=None,
                                     rejectFunction=None,
                                     callbackArgs={},
                                     tickFunction=tick,
                                     tickInterval=10,
                                     mask=None,
                                     attributesToInclude={},
                                     returnFileObject=True)
    features = featureFile.parseBedFile(mask, params)
    print ""
    
    params = variantLoadingParameters(build=genomeUtils.hg19,
                                     passFunction=None,
                                     rejectFunction=None,
                                     callbackArgs={},
                                     tickFunction=tick,
                                     tickInterval=10,
                                     individualsToInclude=None,
                                     individualAppendString="",
                                     lociToInclude=None,
                                     mask=features.regions,
                                     invertMask=True,
                                     attributesToInclude=None,
                                     attributeAppendString="",
                                     skipGenotypeAttributes=True,   # TODO: there's actually a bug here (you should turn this back off
                                     returnFileObject=True,
                                     alleleMatching=allele.STRICT,
                                     attemptRepairsWhenComparing=True)
    print "parsing variants",
    newFile = variantFile.parseVcfFile(vcf, params)
    print ""
    # TODO: throw this bit out
    print "swapping attributes"
    for v in newFile.variants:
        newName = v.attributes.get('RSID',v.name)
        if isinstance(newName,list):
            newName = newName[1]
        #if newName.startswith('dbsnp'):
        #    newName = newName.split(':')[1]
        if newName == '.':
            newName = v.basicName
        v.name = newName
    
    print "writing file..."
    newFile.writeVcfFile(out, sortMethod="NUMXYM", includeScriptLine=True)
    print ""
    print "done"
def _defaultSoftFilters(vData, xAttribute, yAttribute):
    """
    Build one valueFilter per key of vData.axisLookups.

    The two plotted attributes (xAttribute, yAttribute) get an empty value
    list and, when their lookup reports hasNumeric, a single range covering
    the top five percent of [minimum, maximum]. Every other attribute gets
    values=None and ranges=None.
    """
    filters = {}
    for k in vData.axisLookups.iterkeys():
        ranges = None
        values = None
        if k == xAttribute or k == yAttribute:
            values = []
            lookup = vData.axisLookups[k]
            if lookup.hasNumeric:
                fivePercent = 0.05*(lookup.maximum-lookup.minimum)
                ranges = [(lookup.maximum-fivePercent,lookup.maximum)]
        filters[k] = valueFilter(values=values,
                                 ranges=ranges,
                                 includeNone=True,
                                 includeBlank=True,
                                 includeInf=True,
                                 includeNaN=True,
                                 includeMissing=True,
                                 includeAlleleMasked=True,
                                 listMode=valueFilter.LIST_INCLUSIVE)
    return filters

def notifyRun(vcfPath, vcfAttributes, xAttribute, yAttribute, softFilters, forcedCategoricals, featurePaths):
    """
    Load a .vcf file behind a progress dialog and start the main app widget.

    vcfPath            -- path of the .vcf file to parse
    vcfAttributes      -- passed through to variantData
    xAttribute         -- attribute initially shown on the x axis
    yAttribute         -- attribute initially shown on the y axis
    softFilters        -- dict of valueFilter objects keyed by attribute, or
                          None to have defaults built from the loaded data
    forcedCategoricals -- passed through to variantData
    featurePaths       -- passed through to featureData

    Rebinds the module globals ``splash`` and ``canceled``. If loading is
    canceled (cancelButtonException from the parser, or the ``canceled`` flag
    set while features load) the dialog is closed, window.window is shown
    again, and the function returns without creating the app widget.
    """
    global canceled, splash, window
    splash = QProgressDialog("Loading %s" % os.path.split(vcfPath)[1], "Cancel", 0, 1000, parent=None)
    splash.setWindowModality(Qt.WindowModal)
    # Keep the dialog up until it is closed explicitly below.
    splash.setAutoReset(False)
    splash.setAutoClose(False)
    splash.show()
    canceled = False
    
    vData = variantData(vcfPath, vcfAttributes, forcedCategoricals)
    vParams = variantLoadingParameters(passFunction=vData.addVariant,
                                     rejectFunction=None,
                                     callbackArgs={},
                                     tickFunction=tick,
                                     tickInterval=0.1,
                                     individualsToInclude=[],
                                     individualAppendString="",
                                     lociToInclude=None,
                                     mask=None,
                                     invertMask=False,
                                     attributesToInclude=None,
                                     attributeAppendString="",
                                     skipGenotypeAttributes=True,
                                     returnFileObject=False,
                                     alleleMatching=allele.STRICT,
                                     attemptRepairsWhenComparing=True)
    try:
        variantFile.parseVcfFile(vcfPath, vParams)
    except cancelButtonException:
        splash.close()
        window.window.show()
        return
    
    # Identity test: only a literal None requests the defaults (an empty dict
    # supplied by the caller is used as-is).
    if softFilters is None:
        softFilters = _defaultSoftFilters(vData, xAttribute, yAttribute)
    intMan = interactionManager(vData,softFilters)
    
    # TODO
    fData = featureData(featurePaths)
    # NOTE(review): ``canceled`` is presumably set by the tick callback while
    # features load -- confirm against the module-level tick().
    if canceled:
        splash.close()
        window.window.show()
        return

    splash.close()
    appWindow = appWidget(vData,fData,intMan,xAttribute,yAttribute)
    intMan.setApp(appWindow)
Esempio n. 3
0
 def loadDataObjects(self, callback=None):
     """
     Parse every .vcf and .csv file in self.files into one tempVariantData.

     callback -- optional progress function. It is called here as
                 callback(numTicks=0, message=...) and is also handed to the
                 parsers as their tickFunction and to
                 vData.performGroupCalculations.

     Returns the populated tempVariantData on success. If a parser returns
     the string "ABORTED", returns the tuple (None, None) instead; callers
     must be prepared for both shapes.
     """
     def announce(message):
         # callback is optional, so every direct call must be guarded.
         if callback is not None:
             callback(numTicks=0,message=message)
     
     # Try to parse files in order by format (some files are more informative than others,
     # and we want to start off with the best information) ... this may get shaken up
     # when we support masking/specific loci
     gff3Files = []
     bedFiles = []
     
     vcfFiles = []
     csvFiles = []
     axisLabels = set()
     forcedCategoricals = set()
     individualsToInclude = self.getAllSamples()
     
     for fileID,f in self.files.iteritems():
         if f.format == '.vcf':
             vcfFiles.append(fileID)
             axisLabels.update(f.hardFilters.iterkeys())
             forcedCategoricals.update(f.forcedCategoricals)
         elif f.format == '.csv':
             csvFiles.append(fileID)
             axisLabels.update(f.hardFilters.iterkeys())
             forcedCategoricals.update(f.forcedCategoricals)
         elif f.format == '.gff3':
             gff3Files.append(fileID)
         elif f.format == '.bed':
             bedFiles.append(fileID)
     
     vData = tempVariantData(axisLabels,set(self.statistics.iterkeys()),forcedCategoricals,self.startingXaxis,self.startingYaxis)
     
     # Feature files are collected but not loaded yet (placeholders).
     for fileID in gff3Files:
         pass
     for fileID in bedFiles:
         pass
     for fileID in vcfFiles:
         announce('Loading %s' % fileID)
         parameters = variantLoadingParameters(build=self.files[fileID].build,
                                             passFunction=vData.addVariant,
                                             rejectFunction=None,
                                             callbackArgs={},
                                             tickFunction=callback,
                                             tickInterval=self.loadingPercentages[fileID],
                                             individualsToInclude=individualsToInclude,
                                             individualAppendString=" (%s)" % fileID,
                                             lociToInclude=None, # TODO: support masking and specific loci
                                             mask=None,
                                             attributesToInclude=self.files[fileID].hardFilters,
                                             attributeAppendString=" (%s)" % fileID,
                                             skipGenotypeAttributes=True)
         if variantFile.parseVcfFile(self.files[fileID].path, parameters) == "ABORTED":
             return (None,None)
     
     for fileID in csvFiles:
         announce('Loading %s' % fileID)
         parameters = variantLoadingParameters(build=self.files[fileID].build,
                                             passFunction=vData.addVariant,
                                             rejectFunction=None,
                                             callbackArgs={},
                                             tickFunction=callback,
                                             tickInterval=self.loadingPercentages[fileID],
                                             individualsToInclude=[],    # .csv files don't have genotypes
                                             individualAppendString="",
                                             lociToInclude=None, # TODO: support masking and specific loci
                                             mask=None,
                                             attributesToInclude=self.files[fileID].hardFilters,
                                             attributeAppendString=" (%s)" % fileID,
                                             skipGenotypeAttributes=True)
         if variantFile.parseCsvFile(self.files[fileID].path, parameters) == "ABORTED":
             return (None,None)
     
     # Now that we've loaded the files, do our group calculations.
     # (This call used to be unguarded and raised TypeError when callback was None.)
     announce('Calculating group statistics')
     # NOTE(review): callback may be None here too; confirm that
     # performGroupCalculations tolerates a None callback.
     vData.performGroupCalculations(self.groups, self.statistics, callback, self.loadingPercentages[None])
     
     return vData
def runApp(loci="", l="", vcf="", v="", individuals="", i="", out="", o="", remove="", r=""):
    """
    Write a tab-separated genotype table for selected loci of a .vcf file.

    loci        -- path of a whitespace-separated file with one header line
                   and then chromosome, position, name columns; None to skip
    vcf         -- path of the .vcf file to parse
    individuals -- path of a file with one individual per line; None to use
                   the individuals reported by variantFile.extractVcfFileInfo
    out         -- path of the table to write
    remove      -- path of a file listing loci that had no genotypes; None to
                   write such loci into ``out`` with "./." genotypes instead

    l, v, i, o and r are accepted but their passed-in values are never read.

    NOTE(review): ``lociToKeep``, ``add`` and ``tick`` are not defined inside
    this function; they are presumably module-level names -- confirm.
    """
    if individuals is not None:
        print "Parsing individual list..."
        with open(individuals, "r") as infile:
            individualList = [line.strip() for line in infile]
    else:
        # Previously individualList stayed None here and the header loop below
        # crashed on it; take the individuals from the .vcf file instead.
        individualList = variantFile.extractVcfFileInfo(vcf)["INDIVIDUALS"]

    if loci is not None:
        print "Parsing loci list..."
        with open(loci, "r") as infile:
            for lineNumber, line in enumerate(infile):
                if lineNumber == 0:
                    # first line is a header
                    continue
                columns = line.split()
                if not columns:
                    # tolerate blank lines (e.g. a trailing newline)
                    continue
                lociToKeep.add(
                    variant(
                        chromosome=columns[0],
                        position=columns[1],
                        matchMode=allele.FLEXIBLE,
                        attemptRepairsWhenComparing=True,
                        ref=".*",
                        alt=".*",
                        name=columns[2],
                        build=genomeUtils.hg19,
                        attributeFilters=None,
                    )
                )

    print "Parsing .vcf file..."
    # we only care about genotypes; we throw out all other details.
    parseParameters = variantLoadingParameters(
        passFunction=add,
        tickFunction=tick,
        tickInterval=5,
        individualsToInclude=individualList,
        lociToInclude=lociToKeep,
        attributesToInclude={},
        skipGenotypeAttributes=True,
    )

    variantFile.parseVcfFile(vcf, parseParameters)
    print ""

    print "Writing results..."
    outfile = open(out, "w")
    removeFile = None
    try:
        outfile.write("Chromosome\tPosition\tRs#")
        for name in individualList:
            outfile.write("\t%s" % name)
        outfile.write("\n")

        if remove is not None:
            removeFile = open(remove, "w")
            removeFile.write("Chromosome\tPosition\tRs#\tReason\n")

        for locus in sorted(lociToKeep, key=lambda x: x.position):
            if removeFile is not None and len(locus.genotypes) == 0:
                removeFile.write("%s\t%i\t%s\tNo Genotypes\n" % (locus.chromosome, locus.position, locus.name))
            else:
                outfile.write("%s\t%i\t%s" % (locus.chromosome, locus.position, locus.name))
                for name in individualList:
                    outfile.write("\t%s" % str(locus.genotypes.get(name, "./.")))
                outfile.write("\n")
    finally:
        # Close both handles even when writing fails part-way.
        outfile.close()
        if removeFile is not None:
            removeFile.close()
    print "Done."
def runApp(mask="",m="",vcf="",v="",out="",o="",csv="",c=""):
    """
    Annotate the variants of a .vcf file and write them to a new .vcf file.

    mask -- path handed to featureFile.parseBedFile (the result is currently
            not applied; the mask arguments below are commented out)
    vcf  -- path of the .vcf file whose variants are loaded
    csv  -- path of a .csv file whose selected attributes are copied onto
            variants found at the same genomePosition
    out  -- path of the .vcf file to write

    Steps: load the variants keyed by genomePosition, rename each from its
    'RSID' attribute when present, run performGroupCalculations for every
    variant over the hard-coded sample groups, add INFO header entries for the
    groups and the .csv columns, merge the .csv attributes, and write.

    m, v, o and c are accepted but their passed-in values are never read.
    """
    print "loading features",
    params = featureLoadingParameters(build=genomeUtils.hg19,
                                     passFunction=None,
                                     rejectFunction=None,
                                     callbackArgs={},
                                     tickFunction=tick,
                                     tickInterval=10,
                                     mask=None,
                                     attributesToInclude={},
                                     returnFileObject=True)
    # Kept even though the mask is currently disabled below: the .bed file is
    # still parsed exactly as before.
    features = featureFile.parseBedFile(mask, params)
    print ""
    
    variants = {}
    fileAttributes = variantFile.extractVcfFileInfo(vcf)
    
    def addVariant(v):
        variants[v.genomePosition] = v
    
    params = variantLoadingParameters(build=genomeUtils.hg19,
                                     passFunction=addVariant,
                                     rejectFunction=None,
                                     callbackArgs={},
                                     tickFunction=tick,
                                     tickInterval=10,
                                     individualsToInclude=None,
                                     individualAppendString="",
                                     lociToInclude=None,
                                     mask=None,  #features.regions,
                                     invertMask=False,  #True
                                     attributesToInclude=None,
                                     attributeAppendString="",
                                     skipGenotypeAttributes=True,   # TODO: there's actually a bug here (you should turn this back off)
                                     returnFileObject=False,
                                     alleleMatching=allele.UNENFORCED,
                                     attemptRepairsWhenComparing=True)
    print "parsing vcf file",
    variantFile.parseVcfFile(vcf, params)
    print ""
    # TODO: throw this bit out
    print "swapping attributes"
    for v in variants.itervalues():
        # Move the RSID attribute (when present) into the variant's name.
        if v.attributes.has_key('RSID'):
            newName = v.attributes['RSID']
            del v.attributes['RSID']
        else:
            newName = v.name
        # A list-valued RSID contributes its second entry.
        if isinstance(newName,list):
            newName = newName[1]
        #if newName.startswith('dbsnp'):
        #    newName = newName.split(':')[1]
        if newName == '.':
            newName = v.basicName
        v.name = newName
    print "calculating frequencies",
    tickInterval = len(variants) / 10
    i = 0
    nextTick = tickInterval
    
    # The three control cohorts. CONTROLS_ALL below is their union; it used to
    # be spelled out as a second copy of the very same sample IDs.
    controlsIndExtreme = set([
        "T2DG0200013", "T2DG0200027", "T2DG0500309", "T2DG0701163", "T2DG0701156", "T2DG0800488",
        "T2DG0901234", "T2DG1000568", "T2DG1000570", "T2DG1101324", "T2DG1600768", "T2DG2701070",
        "T2DG4701118", "T2DG0200068", "T2DG0200071", "T2DG0400250", "T2DG0500349", "T2DG0500339",
        "T2DG0701179", "T2DG0800504", "T2DG0800564", "T2DG0901289", "T2DG0901275", "T2DG1000595",
        "T2DG1101348", "T2DG1600785", "T2DG1700875", "T2DG2701094", "T2DG4701124"])
    controlsNormalNonInd = set([
        "T2DG0200098", "T2DG0200076", "T2DG0200104", "T2DG0200065", "T2DG0400287", "T2DG0400260",
        "T2DG0400280", "T2DG0400288", "T2DG0400267", "T2DG0400269", "T2DG0400279", "T2DG0400256",
        "T2DG0400273", "T2DG0500370", "T2DG0500367", "T2DG0500353", "T2DG0500383", "T2DG0500362",
        "T2DG0500364", "T2DG0500379", "T2DG0500351", "T2DG0500381", "T2DG0500385", "T2DG0500357",
        "T2DG0701225", "T2DG0701227", "T2DG0701194", "T2DG0701196", "T2DG0701195", "T2DG0701211",
        "T2DG0701222", "T2DG0701220", "T2DG0701204", "T2DG0701208", "T2DG0701192", "T2DG0701191",
        "T2DG0701198", "T2DG0701181", "T2DG0701174", "T2DG0800561", "T2DG0800542", "T2DG0800563",
        "T2DG0800547", "T2DG0800559", "T2DG0901306", "T2DG0901296", "T2DG0901285", "T2DG0901284",
        "T2DG0901269", "T2DG0901295", "T2DG0901299", "T2DG0901279", "T2DG0901307", "T2DG1000637",
        "T2DG1000629", "T2DG1000630", "T2DG1000614", "T2DG1000612", "T2DG1000613", "T2DG1000631",
        "T2DG1000591", "T2DG1000620", "T2DG1000640", "T2DG1000606", "T2DG1000636", "T2DG1000638",
        "T2DG1000627", "T2DG1101369", "T2DG1101381", "T2DG1101383", "T2DG1101377", "T2DG1101354",
        "T2DG1101356", "T2DG1600793", "T2DG1600811", "T2DG1600816", "T2DG1600810", "T2DG1700872",
        "T2DG1700869", "T2DG1700876", "T2DG1700867", "T2DG2701096", "T2DG2701093", "T2DG4701139",
        "T2DG4701129"])
    controlsPreHypertensionInd = set([
        "T2DG0200073", "T2DG0200031", "T2DG0200040", "T2DG0200032", "T2DG0200042", "T2DG0200047",
        "T2DG0200070", "T2DG0200078", "T2DG0200096", "T2DG0200063", "T2DG0200077", "T2DG0200057",
        "T2DG0200086", "T2DG0200041", "T2DG0400234", "T2DG0400243", "T2DG0400257", "T2DG0400261",
        "T2DG0400264", "T2DG0400241", "T2DG0400237", "T2DG0400238", "T2DG0400258", "T2DG0400295",
        "T2DG0400254", "T2DG0400262", "T2DG0500334", "T2DG0500346", "T2DG0500358", "T2DG0500371",
        "T2DG0500389", "T2DG0500332", "T2DG0500352", "T2DG0500375", "T2DG0500347", "T2DG0500388",
        "T2DG0500373", "T2DG0701188", "T2DG0701219", "T2DG0701203", "T2DG0701199", "T2DG0701216",
        "T2DG0701217", "T2DG0701214", "T2DG0800541", "T2DG0800520", "T2DG0800552", "T2DG0800529",
        "T2DG0800514", "T2DG0800502", "T2DG0800505", "T2DG0800509", "T2DG0901305", "T2DG0901287",
        "T2DG0901267", "T2DG0901271", "T2DG0901270", "T2DG0901308", "T2DG0901288", "T2DG0901298",
        "T2DG0901278", "T2DG0901272", "T2DG0901263", "T2DG1000599", "T2DG1000618", "T2DG1000597",
        "T2DG1000611", "T2DG1000616", "T2DG1000592", "T2DG1000598", "T2DG1000604", "T2DG1000642",
        "T2DG1000639", "T2DG1000607", "T2DG1101365", "T2DG1101366", "T2DG1101382", "T2DG1101388",
        "T2DG1101389", "T2DG1101372", "T2DG1101384", "T2DG1101385", "T2DG1101338", "T2DG1101341",
        "T2DG1101343", "T2DG1101387", "T2DG1101390", "T2DG1101344", "T2DG1101342", "T2DG1600812",
        "T2DG1600819", "T2DG1600805", "T2DG1600804", "T2DG1600807", "T2DG1600799", "T2DG1700861",
        "T2DG1700846", "T2DG1700854", "T2DG1700853", "T2DG1700868", "T2DG1700870", "T2DG2701110",
        "T2DG2701085", "T2DG2701088", "T2DG2701111", "T2DG2701107", "T2DG2701091", "T2DG4701130",
        "T2DG4701122", "T2DG4701133", "T2DG4701127", "T2DG4701128", "T2DG0200006", "T2DG0200008",
        "T2DG0200012", "T2DG0200009", "T2DG0200018", "T2DG0200023", "T2DG0200007", "T2DG0400219",
        "T2DG0500318", "T2DG0500327", "T2DG0500310", "T2DG0500312", "T2DG0500313", "T2DG0701164",
        "T2DG0701143", "T2DG0800490", "T2DG0800498", "T2DG0901251", "T2DG1000567", "T2DG1000582",
        "T2DG1000586", "T2DG1000565", "T2DG1000566", "T2DG1000569", "T2DG1101320", "T2DG1101330",
        "T2DG1600767", "T2DG1600773", "T2DG1600778", "T2DG1600771", "T2DG1700824", "T2DG1700836",
        "T2DG2701073", "T2DG2701079"])
    groupDict = {   "CASES3":set(["T2DG0300147", "T2DG0300160", "T2DG0300135", "T2DG0300133", "T2DG0300143"]),
                    "CASES6":set(["T2DG0600449", "T2DG0600426", "T2DG0600428", "T2DG0600470", "T2DG0600442", "T2DG0600431"]),
                    "CASES20":set(["T2DG2000900", "T2DG2000901", "T2DG2000904", "T2DG2000928"]),
                    "CASES21":set(["T2DG2100946", "T2DG2100955", "T2DG2100967", "T2DG2100966"]),
                    "CONTROLS_IND_EXTREME":controlsIndExtreme,
                    "CONTROL_Normal_nonind":controlsNormalNonInd,
                    "CONTROLS_PreHypertension_ind":controlsPreHypertensionInd,
                    "CONTROLS_ALL":controlsIndExtreme | controlsNormalNonInd | controlsPreHypertensionInd}
    for v in variants.itervalues():
        # NOTE(review): presumably stores one frequency attribute per group on
        # the variant, relative to CONTROLS_IND_EXTREME -- confirm in
        # performGroupCalculations.
        performGroupCalculations(v,groupDict,"CONTROLS_IND_EXTREME",mode=-1)
        i += 1
        if i > nextTick:
            nextTick += tickInterval
            print ".",
    print ""
    
    
    del fileAttributes['INDIVIDUALS']
    # RSID was folded into the variant names above. Not every file declares
    # it, so a missing header entry must not abort the run.
    fileAttributes['INFO'].pop('RSID', None)
    for k in groupDict.iterkeys():
        fileAttributes['INFO'][k] = {"ID":k,"Number":1,"Type":"Float","Description":"%s Allele frequency"%k}
    
    
    print "loading .csv file",
    acceptAllFilter = valueFilter()
    # .csv columns to copy over (the literal used to list two keys twice).
    csvDict = {"SIFT_score":acceptAllFilter,
               "LRT_score":acceptAllFilter,
               "MutationTaster_pred":acceptAllFilter,
               "phyloP":acceptAllFilter,
               "SLR_test_statistic":acceptAllFilter,
               "LRT_pred":acceptAllFilter,
               "MutationTaster_score":acceptAllFilter,
               "GERP++_NR":acceptAllFilter,
               "GERP++_RS":acceptAllFilter,
               "29way_logOdds":acceptAllFilter,
               "LRT_Omega":acceptAllFilter,
               "1000Gp1_AF":acceptAllFilter}
    for k in csvDict.iterkeys():
        fileAttributes['INFO'][k] = {"ID":k,"Number":1,"Type":"Float","Description":k}
    
    def tryRepair(v):
        # Copy the .csv row's attributes onto the variant loaded from the
        # .vcf file at the same position; rows without a match are dropped.
        if variants.has_key(v.genomePosition):
            for key,value in v.attributes.iteritems():
                variants[v.genomePosition].setAttribute(key,value)
    params = variantLoadingParameters(build=genomeUtils.hg19,
                                      passFunction=tryRepair,
                                      rejectFunction=None,
                                      callbackArgs={},
                                      tickFunction=tick,
                                      tickInterval=10,
                                      individualsToInclude=None,
                                      individualAppendString="",
                                      lociToInclude=None,
                                      mask=None,    #features.regions,
                                      invertMask=False, #True,
                                      attributesToInclude=csvDict,
                                      attributeAppendString="",
                                      skipGenotypeAttributes=True,
                                      returnFileObject=False,
                                      alleleMatching=allele.UNENFORCED,
                                      attemptRepairsWhenComparing=True)
    variantFile.parseCsvFile(csv,params)
    print ""
    
    print "writing file..."
    temp = variantFile(fileAttributes)
    temp.variants = variants.values()
    temp.writeVcfFile(out, sortMethod="NUMXYM", includeScriptLine=True)
    print ""
    print "done"
def runApp(loci="",l="",vcf="",v="",individuals="",i="",out="",o="",remove="",r=""):
    """
    Write a tab-separated allele table for the loci of one or more .vcf files.

    loci        -- path of a whitespace-separated file with one header line
                   and then chromosome, position, name columns; None to keep
                   every variant returned by the parser
    vcf         -- a .vcf path or a list of .vcf paths
    individuals -- path of a file with one individual per line; None to use
                   the individuals reported by variantFile.extractVcfFileInfo
    out         -- path of the table to write
    remove      -- path of a file listing rejected loci and the reason, or
                   None to drop rejected loci silently

    Loci with no genotypes or fewer than two alleles are rejected; all others
    get one row with their first two alleles and one two-character genotype
    per individual ("_" for a missing allele).

    l, v, i, o and r are accepted but their passed-in values are never read.
    """
    lociToKeep = set()

    def tick():
        print ".",
    
    def add(v):
        # lociToKeep is rebound to None below when no loci file is given; the
        # parser then returns a file object instead of feeding this callback.
        if lociToKeep is not None:
            lociToKeep.add(v)
    
    if individuals is not None:
        print "Parsing individual list..."
        with open(individuals,'r') as infile:
            individualList = [line.strip() for line in infile]
    else:
        individualList = variantFile.extractVcfFileInfo(vcf)["INDIVIDUALS"]
    individualList.sort()
    
    if loci is not None:
        print "Parsing loci list..."
        with open(loci,'r') as infile:
            for lineNumber,line in enumerate(infile):
                if lineNumber == 0:
                    # first line is a header
                    continue
                columns = line.split()
                if not columns:
                    # tolerate blank lines (e.g. a trailing newline)
                    continue
                lociToKeep.add(variant(chromosome=columns[0], position=columns[1], matchMode=allele.UNENFORCED, attemptRepairsWhenComparing=True, ref=".*", alt=".*", name=columns[2], build=genomeUtils.hg19, attributeFilters=None))
    else:
        lociToKeep = None
    
    if not isinstance(vcf,list):
        vcf = [vcf]
    # Stays None when vcf is an empty list, so the lookup below cannot hit an
    # unbound name.
    resultFile = None
    for infile in vcf:
        print "Parsing %s" % infile
        # we only care about genotypes; we throw out all other details.
        parseParameters = variantLoadingParameters(  passFunction=add,
                                                     tickFunction=tick,
                                                     tickInterval=5,
                                                     individualsToInclude=individualList,
                                                     alleleMatching = allele.UNENFORCED,
                                                     lociToInclude=lociToKeep,
                                                     attributesToInclude={},
                                                     skipGenotypeAttributes=True)
        
        if lociToKeep is None:
            parseParameters.returnFileObject = True
        
        # NOTE(review): without a loci file only the LAST .vcf file's variants
        # survive this loop -- confirm whether earlier files should be merged.
        resultFile = variantFile.parseVcfFile(infile,parseParameters)
        print ""
    
    print "Writing results..."
    outfile = open(out,'w')
    removeFile = None
    try:
        outfile.write("Chromosome\tPosition\tRs#\tMajor\tMinor")
        for name in individualList:
            outfile.write("\t%s" % name)
        outfile.write("\n")
        
        if remove is not None:
            removeFile = open(remove,'w')
            removeFile.write("Chromosome\tPosition\tRs#\tReason\n")
        
        if lociToKeep is not None:
            orderedLoci = sorted(lociToKeep, key=lambda v: v.name)
        elif resultFile is not None:
            orderedLoci = sorted(resultFile.variants, key=lambda v: v.name)
        else:
            orderedLoci = []
        
        for locus in orderedLoci:
            if len(locus.genotypes) == 0:
                if removeFile is not None:
                    removeFile.write("%s\t%i\t%s\tNo Genotypes\n" % (locus.chromosome,locus.position,locus.name))
            elif len(locus.alleles) < 2:
                if removeFile is not None:
                    # The trailing newline was missing, which glued these rows together.
                    removeFile.write("%s\t%i\t%s\tNot enough alleles\n" % (locus.chromosome,locus.position,locus.name))
            else:
                outfile.write("%s\t%i\t%s\t%s\t%s" % (locus.chromosome,locus.position,locus.name,locus.alleles[0],locus.alleles[1]))
                for name in individualList:
                    a1 = "_"
                    a2 = "_"
                    if locus.genotypes.has_key(name):
                        g = locus.genotypes[name]
                        if g.allele1 is not None:
                            a1 = str(g.allele1)
                        if g.allele2 is not None:
                            a2 = str(g.allele2)
                    outfile.write("\t%s%s" % (a1,a2))
                outfile.write("\n")
    finally:
        # Close both handles even when writing fails part-way.
        outfile.close()
        if removeFile is not None:
            removeFile.close()
    print "Done."