def execute(self, executable):
    """Run the sub-program selected on the command line.

    Recognised values are ``"haplotyping"``, ``"snvCalling"``,
    ``"mapping"``, ``"allelicDiversity"`` and ``"findLoci"``. Any other
    value (and an unrecognised ``Program.config.snvCaller``) is silently
    ignored.

    :param executable: the argument of the commandline which determines
        which program has to be executed
    :type executable: str
    :raises SystemExit: with status 1 when ``allelicDiversity`` or
        ``findLoci`` is requested without the input file(s) it needs
    """
    if executable == "haplotyping":
        if Grid.useGrid:
            Haplotyper.executeBeagleCluster(self.pool)
        else:
            Haplotyper.executeBeagleMultiThread(self.pool)
    elif executable == "snvCalling":
        if Program.config.snvCaller == "samtools":  # @UndefinedVariable
            SamtoolsMpileup.executeSamtoolsMultiThreaded(self.pool)
        elif Program.config.snvCaller == "GATK":  # @UndefinedVariable
            Gatk.Gatk(self.pool).callSnvs()
    elif executable == "mapping":
        mapper = Mapper.Mapper()
        for sample in self.samples:
            mapper.map(sample)
    elif executable == "allelicDiversity":
        if Program.config.gffFile is None:  # @UndefinedVariable
            print("When calculating the allelic diversity, a gff file is needed, this option can be set with the option --gff <file>")
            # SystemExit is raised directly: the exit() helper is installed
            # by the site module and is not guaranteed to exist in programs.
            # A non-zero status signals the failure to the calling shell.
            raise SystemExit(1)
        allelicDiverityCalculator = AllelicDiversity.AllelicDiversity(
            self.pool, Program.config.gffFile)  # @UndefinedVariable
        allelicDiverityCalculator.getAllelicDiversity()
    elif executable == "findLoci":
        if Program.config.phenoData is None:  # @UndefinedVariable
            print("When finding loci, a csv file is needed with the phenotype data, this option can be set with the option --phen <file>")
            raise SystemExit(1)
        if Program.config.gffFile is None:  # @UndefinedVariable
            # The message now names the file that is actually missing.
            print("When finding loci, a gff file is needed, this option can be set with the option --gff <file>")
            raise SystemExit(1)
        lociFinder = LociFinder.LociFinder()
        lociFinder.findLoci(self.pool)
def findLoci(self, pool):
    """Compute and write p-values for every phenotype on every chromosome.

    Beagle is run (multi-threaded) on the pool first. The phenotype file
    configured in ``Program.config.phenoData`` is passed through
    ``convertExcelToCsv`` and the config value is overwritten with the
    result before it is read. For each phenotype and each
    ``(chrom, vcfFile)`` entry of ``pool.vcf`` the gff file and the vcf
    file are read, ``findLociInPheno`` is called and its result is handed
    to ``writePvaluesToFile``.

    :param pool: the pool whose ``vcf`` mapping and ``outputDir`` are used
    """
    # read the input files
    Haplotyper.executeBeagleMultiThread(pool)
    phenReader = Readers.PhenotypeReader()
    Program.config.phenoData = self.convertExcelToCsv(
        Program.config.phenoData, pool.outputDir)
    phenReader.readFile(Program.config.phenoData)

    # NOTE(review): the converter is loaded but not used below; the code
    # that renamed the keys of phenotype.alleles through
    # converter.getAccession() was disabled. The read is kept so that a
    # missing convertToAccession.txt still fails here as before.
    converter = Readers.AccessionConverter()
    converter.readFile(os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "convertToAccession.txt"))

    for phenotype in phenReader.phenotypes:
        for (chrom, vcfFile) in pool.vcf.items():
            # A fresh gff reader per chromosome gives the phenotype a new
            # set of contigs before the vcf file is read.
            gffReader = Readers.GffReader(chrom=chrom)
            gffReader.readFile(Program.config.gffFile)  # @UndefinedVariable
            phenotype.contigs = gffReader.contigs
            vcfReader = Readers.VcfReader(phenotype.contigs.values())
            vcfReader.readFile(vcfFile.getFile())
            pVals = self.findLociInPheno(phenotype)
            self.writePvaluesToFile(pVals, chrom, pool,
                                    phenotype.description)
def testExecuteBeagleGrid(self):
    """Run Beagle through the cluster entry point and check its output.

    The vcf registered on the test pool for ``TestHaplotyper.chrIndex``
    must resolve to the expected filtered vcf path; that file is then
    handed to ``checkNoOfSnps``.
    """
    expected = "../testFiles/output/testPool/SL2.40ch11_22900-24100_testFiltered.vcf"
    Haplotyper.executeBeagleCluster(TestHaplotyper.testPool)

    produced = TestHaplotyper.testPool.vcf[TestHaplotyper.chrIndex].fileName
    producedPath = os.path.abspath(produced)
    expectedPath = os.path.abspath(expected)
    self.assertEqual(producedPath, expectedPath,
                     producedPath + " not is " + expectedPath)

    # Per the original note: check if the file contains exactly one snp
    self.checkNoOfSnps(expected)
def findLoci(self, pool):
    """Compute and write p-values for every phenotype on every chromosome.

    Beagle is run (multi-threaded) on the pool first. The phenotype file
    configured in ``Program.config.phenoData`` is passed through
    ``convertExcelToCsv`` and the config value is overwritten with the
    result before it is read. For each phenotype and each
    ``(chrom, vcfFile)`` entry of ``pool.vcf`` the gff file and the vcf
    file are read, ``findLociInPheno`` is called and its result is handed
    to ``writePvaluesToFile``.

    :param pool: the pool whose ``vcf`` mapping and ``outputDir`` are used
    """
    # read the input files
    Haplotyper.executeBeagleMultiThread(pool)
    phenReader = Readers.PhenotypeReader()
    Program.config.phenoData = self.convertExcelToCsv(
        Program.config.phenoData, pool.outputDir)
    phenReader.readFile(Program.config.phenoData)

    # NOTE(review): the converter is loaded but not used below; the code
    # that renamed the keys of phenotype.alleles through
    # converter.getAccession() was disabled. The read is kept so that a
    # missing convertToAccession.txt still fails here as before.
    converter = Readers.AccessionConverter()
    converter.readFile(os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "convertToAccession.txt"))

    for phenotype in phenReader.phenotypes:
        for (chrom, vcfFile) in pool.vcf.items():
            # A fresh gff reader per chromosome gives the phenotype a new
            # set of contigs before the vcf file is read.
            gffReader = Readers.GffReader(chrom=chrom)
            gffReader.readFile(Program.config.gffFile)  # @UndefinedVariable
            phenotype.contigs = gffReader.contigs
            vcfReader = Readers.VcfReader(phenotype.contigs.values())
            vcfReader.readFile(vcfFile.getFile())
            pVals = self.findLociInPheno(phenotype)
            self.writePvaluesToFile(pVals, chrom, pool,
                                    phenotype.description)
def getAllelicDiversity(self):
    """Calculate the allelic diversity and write the output to a file.

    Beagle is run first, on the cluster when ``Grid.useGrid`` is set and
    multi-threaded otherwise. For every key of ``self.pool.vcf`` one
    tab-separated file is written into ``self.pool.outputDir``: a header
    row with two columns per accession, then one row per contig holding
    the contig id, its reference haplotype (``-`` when the contig has no
    ``refHaplotype`` attribute) and two haplotype columns per accession
    (``-`` when no haplotypes exist for the contig).

    Failures for one vcf are logged and do not stop the remaining ones.
    """
    if Grid.useGrid:
        Haplotyper.executeBeagleCluster(self.pool)
    else:
        Haplotyper.executeBeagleMultiThread(self.pool)

    for vcf in self.pool.vcf:
        if vcf is None:
            logging.info("Starting to calculate the allelic diversity")
            outputFile = self.pool.outputDir + "/allelicDiversity.csv"
        else:
            logging.info("calculating allelic diversity of " + vcf)
            outputFile = self.pool.outputDir + "/" + vcf + "_" + "allelicDiversity.csv"
        try:
            self.vcfFile = self.pool.vcf[vcf].getFile()
            self._parseFiles(vcf)
            haplotypes = self._getAllHaplotypesByAccession(self.allContigs)
            # list(...) makes the lookup work on Python 3 as well, where
            # dict views cannot be indexed. An empty mapping still raises
            # IndexError, which is reported below as "No SNPs ...".
            accessions = list(haplotypes.values())[0].keys()
            with open(outputFile, "w") as outWriter:
                header = ["contig", "original"]
                for accession in accessions:
                    header.append(accession + "_1")
                    header.append(accession + "_2")
                # Every cell, including the last one, is followed by a tab.
                outWriter.write("\t".join(header) + "\t\n")

                for contigId in self.allContigs:
                    refHaplotype = getattr(self.allContigs[contigId],
                                           "refHaplotype", "-")
                    row = [contigId, refHaplotype]
                    # Membership does not depend on the accession, so it
                    # is tested once per contig.
                    hasHaplotypes = contigId in haplotypes
                    for accession in accessions:
                        for i in range(2):
                            if hasHaplotypes:
                                row.append(haplotypes[contigId][accession][i])
                            else:
                                row.append("-")
                    outWriter.write("\t".join(row) + "\t\n")
        except IndexError:
            if vcf is None:
                logging.warning("No SNPs within contigs found")
            else:
                logging.warning("No SNPs within contigs found of " + vcf)
        except Exception as ex:
            # Best effort per vcf: report the problem and carry on.
            if vcf is None:
                logging.error("an error occured during parsing the vcf file")
            else:
                logging.error("an error occured during parsing " + vcf)
            logging.error(ex)
            traceback.print_exc()
def testHaplotyperPathGrid(self):
    """Run the cluster entry point on a pool that only has a bam sample.

    The pool's vcf mapping is emptied and a new sample carrying a bam
    file is added before Beagle is executed. Only the snp check on the
    expected output vcf is performed; no path comparison is asserted.
    """
    pool = TestHaplotyper.testPool
    pool.vcf = {}

    sample = Sample.Sample(pool, "testLib")
    TestHaplotyper.sample = sample
    pool.addSample(sample)

    bamOptions = {
        "sortedBam": True,
        "headerLine": True,
        "duplicates": False,
        "mdTag": True,
        "index": True,
    }
    sample.bam = BamFile.BamFile(pool, sample, TestHaplotyper.inputBam,
                                 **bamOptions)

    Haplotyper.executeBeagleCluster(pool)
    expectedVcf = "../testFiles/output/testPool/SL2.40ch11_22900-24100_testPool_SL2.40ch11_22900-24100.vcf"
    self.checkNoOfSnps(expectedVcf)
def testExecuteBeagleMultiThread(self):
    """Run Beagle through the multi-thread entry point and check its output.

    The vcf registered on the test pool for ``TestHaplotyper.chrIndex``
    must resolve to the expected filtered vcf path; that file is then
    handed to ``checkNoOfSnps``.
    """
    expected = "../testFiles/output/testPool/SL2.40ch11_22900-24100_testFiltered.vcf"
    Haplotyper.executeBeagleMultiThread(TestHaplotyper.testPool)

    vcfEntry = TestHaplotyper.testPool.vcf[TestHaplotyper.chrIndex]
    producedPath = os.path.abspath(vcfEntry.fileName)
    expectedPath = os.path.abspath(expected)
    failureText = producedPath + " not is " + expectedPath
    self.assertEqual(producedPath, expectedPath, failureText)

    self.checkNoOfSnps(expected)
def testExecuteBeagleGrid(self):
    """Run Beagle through the cluster entry point and check its output.

    The vcf registered on the test pool for ``TestHaplotyper.chrIndex``
    must resolve to the expected filtered vcf path; that file is then
    handed to ``checkNoOfSnps``.
    """
    expected = "../testFiles/output/testPool/SL2.40ch11_22900-24100_testFiltered.vcf"
    Haplotyper.executeBeagleCluster(TestHaplotyper.testPool)

    vcfEntry = TestHaplotyper.testPool.vcf[TestHaplotyper.chrIndex]
    producedPath = os.path.abspath(vcfEntry.fileName)
    expectedPath = os.path.abspath(expected)
    failureText = producedPath + " not is " + expectedPath
    self.assertEqual(producedPath, expectedPath, failureText)

    # Per the original note: check if the file contains exactly one snp
    self.checkNoOfSnps(expected)
def getAllelicDiversity(self):
    """Calculate the allelic diversity and write the output to a file.

    Beagle is run first, on the cluster when ``Grid.useGrid`` is set and
    multi-threaded otherwise. For every key of ``self.pool.vcf`` one
    tab-separated file is written into ``self.pool.outputDir``: a header
    row with two columns per accession, then one row per contig holding
    the contig id, its reference haplotype (``-`` when the contig has no
    ``refHaplotype`` attribute) and two haplotype columns per accession
    (``-`` when no haplotypes exist for the contig).

    Failures for one vcf are logged and do not stop the remaining ones.
    """
    if Grid.useGrid:
        Haplotyper.executeBeagleCluster(self.pool)
    else:
        Haplotyper.executeBeagleMultiThread(self.pool)

    for vcf in self.pool.vcf:
        if vcf is None:
            logging.info("Starting to calculate the allelic diversity")
            outputFile = self.pool.outputDir + "/allelicDiversity.csv"
        else:
            logging.info("calculating allelic diversity of " + vcf)
            outputFile = self.pool.outputDir + "/" + vcf + "_" + "allelicDiversity.csv"
        try:
            self.vcfFile = self.pool.vcf[vcf].getFile()
            self._parseFiles(vcf)
            haplotypes = self._getAllHaplotypesByAccession(self.allContigs)
            # list(...) makes the lookup work on Python 3 as well, where
            # dict views cannot be indexed. An empty mapping still raises
            # IndexError, which is reported below as "No SNPs ...".
            accessions = list(haplotypes.values())[0].keys()
            with open(outputFile, "w") as outWriter:
                header = ["contig", "original"]
                for accession in accessions:
                    header.append(accession + "_1")
                    header.append(accession + "_2")
                # Every cell, including the last one, is followed by a tab.
                outWriter.write("\t".join(header) + "\t\n")

                for contigId in self.allContigs:
                    refHaplotype = getattr(self.allContigs[contigId],
                                           "refHaplotype", "-")
                    row = [contigId, refHaplotype]
                    # Membership does not depend on the accession, so it
                    # is tested once per contig.
                    hasHaplotypes = contigId in haplotypes
                    for accession in accessions:
                        for i in range(2):
                            if hasHaplotypes:
                                row.append(haplotypes[contigId][accession][i])
                            else:
                                row.append("-")
                    outWriter.write("\t".join(row) + "\t\n")
        except IndexError:
            if vcf is None:
                logging.warning("No SNPs within contigs found")
            else:
                logging.warning("No SNPs within contigs found of " + vcf)
        except Exception as ex:
            # Best effort per vcf: report the problem and carry on.
            if vcf is None:
                logging.error("an error occured during parsing the vcf file")
            else:
                logging.error("an error occured during parsing " + vcf)
            logging.error(ex)
            traceback.print_exc()
def testHaplotyperFullPathGrid(self):
    """Run the cluster entry point on a pool that only has fastq input.

    The pool's vcf mapping is emptied and a new sample with a forward and
    a reversed gzipped fastq file is added before Beagle is executed.
    Only the snp check on the expected output vcf is performed; no path
    comparison is asserted.
    """
    expOutFile = "../testFiles/output/testPool/SL2.40ch11_22900-24100_testPool_SL2.40ch11_22900-24100.vcf"
    forwardFq = "../testFiles/input/test.fq.gz"
    reversedFq = "../testFiles/input/revTest.fq.gz"

    pool = TestHaplotyper.testPool
    pool.vcf = {}

    sample = Sample.Sample(pool, "testLib")
    TestHaplotyper.sample = sample
    pool.addSample(sample)
    sample.setForwardFq(forwardFq)
    sample.setReversedFq(reversedFq)
    sample.reversedFq.forward = False

    Haplotyper.executeBeagleCluster(pool)

    # Per the original note: check if the file contains exactly one snp
    self.checkNoOfSnps(expOutFile)
def testHaplotyperFullPathGrid(self):
    """Run the cluster entry point on a pool that only has fastq input.

    The pool's vcf mapping is emptied and a new sample with a forward and
    a reversed gzipped fastq file is added before Beagle is executed.
    Only the snp check on the expected output vcf is performed; no path
    comparison is asserted.
    """
    expOutFile = "../testFiles/output/testPool/SL2.40ch11_22900-24100_testPool_SL2.40ch11_22900-24100.vcf"
    forwardFq = "../testFiles/input/test.fq.gz"
    reversedFq = "../testFiles/input/revTest.fq.gz"

    pool = TestHaplotyper.testPool
    pool.vcf = {}

    sample = Sample.Sample(pool, "testLib")
    TestHaplotyper.sample = sample
    pool.addSample(sample)
    sample.setForwardFq(forwardFq)
    sample.setReversedFq(reversedFq)
    sample.reversedFq.forward = False

    Haplotyper.executeBeagleCluster(pool)

    # Per the original note: check if the file contains exactly one snp
    self.checkNoOfSnps(expOutFile)
def testHaplotyperPathGrid(self):
    """Run the cluster entry point on a pool that only has a bam sample.

    The pool's vcf mapping is emptied and a new sample carrying a bam
    file is added before Beagle is executed. Only the snp check on the
    expected output vcf is performed; no path comparison is asserted.
    """
    pool = TestHaplotyper.testPool
    pool.vcf = {}

    sample = Sample.Sample(pool, "testLib")
    TestHaplotyper.sample = sample
    pool.addSample(sample)

    bamOptions = {
        "sortedBam": True,
        "headerLine": True,
        "duplicates": False,
        "mdTag": True,
        "index": True,
    }
    sample.bam = BamFile.BamFile(pool, sample, TestHaplotyper.inputBam,
                                 **bamOptions)

    Haplotyper.executeBeagleCluster(pool)
    expectedVcf = "../testFiles/output/testPool/SL2.40ch11_22900-24100_testPool_SL2.40ch11_22900-24100.vcf"
    self.checkNoOfSnps(expectedVcf)
def execute(self, executable):
    """Run the sub-program selected on the command line.

    Recognised values are ``"haplotyping"``, ``"snvCalling"``,
    ``"mapping"``, ``"allelicDiversity"`` and ``"findLoci"``. Any other
    value (and an unrecognised ``Program.config.snvCaller``) is silently
    ignored.

    :param executable: the argument of the commandline which determines
        which program has to be executed
    :type executable: str
    :raises SystemExit: with status 1 when ``allelicDiversity`` or
        ``findLoci`` is requested without the input file(s) it needs
    """
    if executable == "haplotyping":
        if Grid.useGrid:
            Haplotyper.executeBeagleCluster(self.pool)
        else:
            Haplotyper.executeBeagleMultiThread(self.pool)
    elif executable == "snvCalling":
        if Program.config.snvCaller == "samtools":  # @UndefinedVariable
            SamtoolsMpileup.executeSamtoolsMultiThreaded(self.pool)
        elif Program.config.snvCaller == "GATK":  # @UndefinedVariable
            Gatk.Gatk(self.pool).callSnvs()
    elif executable == "mapping":
        mapper = Mapper.Mapper()
        for sample in self.samples:
            mapper.map(sample)
    elif executable == "allelicDiversity":
        if Program.config.gffFile is None:  # @UndefinedVariable
            print("When calculating the allelic diversity, a gff file is needed, this option can be set with the option --gff <file>")
            # SystemExit is raised directly: the exit() helper is installed
            # by the site module and is not guaranteed to exist in programs.
            # A non-zero status signals the failure to the calling shell.
            raise SystemExit(1)
        allelicDiverityCalculator = AllelicDiversity.AllelicDiversity(
            self.pool, Program.config.gffFile)  # @UndefinedVariable
        allelicDiverityCalculator.getAllelicDiversity()
    elif executable == "findLoci":
        if Program.config.phenoData is None:  # @UndefinedVariable
            print("When finding loci, a csv file is needed with the phenotype data, this option can be set with the option --phen <file>")
            raise SystemExit(1)
        if Program.config.gffFile is None:  # @UndefinedVariable
            # The message now names the file that is actually missing.
            print("When finding loci, a gff file is needed, this option can be set with the option --gff <file>")
            raise SystemExit(1)
        lociFinder = LociFinder.LociFinder()
        lociFinder.findLoci(self.pool)
def setUp(self):
    """Empty the test output directory and rebuild the shared fixtures.

    Every entry of ``../testFiles/output/`` is removed (directories
    recursively). A new test pool, the reference genome path, the input
    vcf for ``TestHaplotyper.chrIndex`` and a haplotyper are then stored
    on ``TestHaplotyper`` / ``Program.config``.
    """
    outputDir = "../testFiles/output/"
    for entry in os.listdir(outputDir):
        entryPath = os.path.join(outputDir, entry)
        if not os.path.isdir(entryPath):
            os.unlink(entryPath)
        else:
            shutil.rmtree(entryPath)

    pool = Pool.Pool("testPool", outputDir)
    TestHaplotyper.testPool = pool
    Program.config.setPath("refGenome", "../testFiles/input/smallRefGenome.fa")

    chrom = TestHaplotyper.chrIndex
    inputVcf = VcfFile.VcfFile(pool, TestHaplotyper.inputVcf,
                               bcf=False, chrom=chrom)
    pool.vcf[chrom] = inputVcf
    TestHaplotyper.haplotyper = Haplotyper.Haplotyper(pool, chrom)
def testExecuteBeagleMultiThread(self):
    """Run Beagle through the multi-thread entry point and check its output.

    The vcf registered on the test pool for ``TestHaplotyper.chrIndex``
    must resolve to the expected filtered vcf path; that file is then
    handed to ``checkNoOfSnps``.
    """
    expected = "../testFiles/output/testPool/SL2.40ch11_22900-24100_testFiltered.vcf"
    Haplotyper.executeBeagleMultiThread(TestHaplotyper.testPool)

    produced = TestHaplotyper.testPool.vcf[TestHaplotyper.chrIndex].fileName
    producedPath = os.path.abspath(produced)
    expectedPath = os.path.abspath(expected)
    self.assertEqual(producedPath, expectedPath,
                     producedPath + " not is " + expectedPath)

    self.checkNoOfSnps(expected)