Esempio n. 1
0
 def preprocess(self, lib):
     """
     The method preprocess determines which preprocessing steps have to be executed for a given
     library and executes them.

     Steps, in order:
     * Expand lib.forward and lib.reversed with DirUtils.fileRegexToList
     * Convert every forward file to fastq when lib.format is "sff"
     * Create a small report of the (unmerged) input files
     * Merge the files per direction when there is more than one forward file, otherwise take
       the single file out of the list
     * Store the average read length in lib.avgReadlength
     * Execute the platform specific preprocessing ("illumina" or "454")
     * Filter the contamination

     lib.forward and lib.reversed are overwritten along the way; lib.reversed is treated as
     optional (it is compared against None before use).
     """
     logging.info("Preprocessing of " + lib.libName)
     LaTeX.ltxSection("Preprocessing of " + lib.libName)
     lib.forward = DirUtils.fileRegexToList(lib.forward)
     # NOTE(review): presumably fileRegexToList passes None through for single-end libraries - confirm
     lib.reversed = DirUtils.fileRegexToList(lib.reversed)
     if lib.format == "sff":
         for idx, sffFile in enumerate(lib.forward):
             lib.forward[idx] = SffToFastqConverter.SffToFastqConverter(lib.outputDir, sffFile=sffFile).execute()
     smallReport = FastqSmallReport.FastqSmallReport()
     smallReport.createSmallReport(lib.forward, lib.reversed)

     if len(lib.forward) > 1:
         lib.forward = FastqCommands.MergeCommand(lib.outputDir, direction="forward", fastqFiles=lib.forward).execute()
         if lib.reversed is not None:
             # Merge the reversed files themselves. lib.forward has already been replaced by the
             # merged forward result at this point, so it must not be passed in here.
             lib.reversed = FastqCommands.MergeCommand(lib.outputDir, direction="reversed", fastqFiles=lib.reversed).execute()
         FastqSmallReport.FastqSmallReport().createSmallReport(lib.forward, lib.reversed)
     else:
         lib.forward = lib.forward[0]
         if lib.reversed is not None:
             lib.reversed = lib.reversed[0]
     # The read length is taken from the first entry of the report created before merging;
     # presumably element 2 of an entry is the average read length - verify against FastqSmallReport
     lib.avgReadlength = float(smallReport.fastqInfo[smallReport.fastqInfo.keys()[0]][2])

     if lib.sequencingPlatform == "illumina":
         self.illuminaPreprocess(lib)
     elif lib.sequencingPlatform == "454":
         lib.forward = FastqMcfTrimming.FastqTrimmer(lib.outputDir, forward=lib.forward,noTrim=True).execute()
         FastqSmallReport.FastqSmallReport().createSmallReport(lib.forward, lib.reversed)

     self.filterContamination(lib)
Esempio n. 2
0
 def createStatistics(self, pool):
     """
     Generate the statistics of the assembly and the fastq reports of every library.

     An AssemblyStatistics object is created for the pipeline statistics, after which a fastq
     report is created per library for both the raw and the fully preprocessed data. Finally
     the Reporter singleton writes the report into the "report/" directory of the pool.
     """
     logging.info("Creating assembly stats")
     AssemblyControl.AssemblyStatistics().AssemblyStatisticsOfPipeline(
         pool.outputDir + "statistics/", pool, self.assembly
     )
     LaTeX.ltxPart("Supplementary materials")
     for library in pool.libs:
         # Raw input first, then the preprocessed reads of the same library
         self.createFastqReport(
             "raw " + library.libName,
             library.rawForward,
             library.rawReversed,
             library.outputDir + "raw_qc/",
         )
         self.createFastqReport(
             "preprocessed " + library.libName,
             library.forward,
             library.reversed,
             library.outputDir + "preprocessed/",
         )

     Reporter.instance.createReport(pool.outputDir + "report/")
 def getLaTeXReport(self):
     """
     Return the LaTeX of the genome size estimation: a two column table with the calculated
     values followed by the image stored in self.genSizeHistoPlot.
     """
     summary = LaTeX.ltxTable(2)
     rows = [
         ["Total bp: ", "{:,}".format(self.totalBp)],
         ["Peak: ", str(self.peak)],
         ["Mean base coverage: ", "{:.2f}".format(self.coverage)],
         # Empty row, separates the general values from the estimation values
         ["", ""],
         ["Unique good kmers: ", "{:,}".format(self.unique_gkmers)],
         ["BGI genome size estimation: ", "{:,}".format(int(round(self.bgi)))],
         ["GSE kmers/peak: ", "{:,}".format(self.kmersPerPeak)],
     ]
     for row in rows:
         summary.addRow(row)

     tableText = summary.getText()
     histogram = LaTeX.ltxImage(self.genSizeHistoPlot)
     return tableText + histogram.getText()
Esempio n. 4
0
    def getLaTeXReport(self):
        """
        Convert the html report in self.outputFile into LaTeX, line by line.

        * Every line with a closing h2 tag (except the "Summary" heading) becomes a subsection.
        * The lines between a table opening and closing tag are collected, parsed with minidom
          and converted into a LaTeX table. Only the first 15 rows are considered, rows whose
          first cell contains "Filename" are skipped and "%" is written as " percent".
        * Every indented image is included, with a clearpage after every second image.
        """
        latex = "\\section{Fastqc of " + self.status.replace("_", " ") + "}\n"
        insideTable = False
        imageCount = 0
        with open(self.outputFile) as htmlReport:
            for htmlLine in htmlReport:
                if "</h2>" in htmlLine and "Summary" not in htmlLine:
                    heading = re.findall(r"]\">(.*?)</h2>", htmlLine)[0]
                    latex += "\\subsection*{" + heading + "}\n"
                elif "<table>" in htmlLine:
                    # Start collecting the markup of a new table
                    tableMarkup = "<table>"
                    insideTable = True
                elif "</table>" in htmlLine:
                    tableMarkup += "</table>"
                    insideTable = False
                    tableNode = minidom.parseString(tableMarkup).firstChild
                    # The number of columns is taken from the first child of the table
                    ltxTable = LaTeX.ltxTable(len(tableNode.firstChild.childNodes))
                    rows = tableNode.childNodes
                    for rowNumber, row in enumerate(rows, 1):
                        if rowNumber > 15:
                            continue
                        cells = [cell.firstChild.nodeValue.replace("%", " percent") for cell in row.childNodes]
                        if "Filename" in cells[0]:
                            continue
                        ltxTable.addRow(cells)
                    rowTotal = len(rows)
                    latex += ltxTable.getText()
                    if rowTotal > 15:
                        latex += "\\\\Total length of this table is " + str(rowTotal) + ". The table is cut after 15 rows..."
                elif insideTable:
                    tableMarkup += htmlLine.strip()
                elif "<img class=\"indented\"" in htmlLine:
                    imageCount += 1
                    imageName = re.findall("src=\"(.*)\" alt=", htmlLine)[0]
                    # The image source is resolved relative to the directory of the report
                    imagePath = os.path.dirname(self.outputFile) + "/" + imageName
                    latex += LaTeX.ltxImage(imagePath).getText()
                    if imageCount % 2 == 0:
                        latex += "\\clearpage\n"
        return latex
Esempio n. 5
0
 def doAssembly(self, pool):
     """
     The method doAssembly creates all objects to execute an assembly. Afterwards the insert sizes
     of all libraries which have reversed reads are estimated.

     The assembler is selected with the global option "assembler":
     * not set or "wgs": WgsAssembler, output in "assembly/" of the pool
     * "allpaths": AllpathsAssembler, output in "allPathsAssembly/" of the pool
     Any other value executes no assembly and leaves self.assembly untouched; a warning is logged.

     The result of the assembler is stored in self.assembly.
     """
     logging.info("Executing assembly")
     LaTeX.ltxSection("Assembly")
     # Look up the option once instead of once per comparison
     assemblerName = Configuration.instance.getGlobalOption("assembler")
     if assemblerName is None or assemblerName == "wgs":
         assembler = WgsAssembler.WgsAssembler()
         self.assembly = assembler.doAssembly(pool.outputDir + "assembly/", pool)
     elif assemblerName == "allpaths":
         self.assembly = AllpathsAssembler.AllpathsAssembler().doAssembly(pool.outputDir + "allPathsAssembly/", pool)
     else:
         # Make the silent fall-through visible without changing the control flow
         logging.warning("Unsupported assembler option '" + str(assemblerName) + "', no assembly executed")

     for lib in pool.libs:
         # Libraries without reversed reads are skipped for the insert size estimation
         if lib.reversed is None:
             continue
         logging.info("Calculating insert sizes for " + lib.libName)
         insertSizeChecker = InsertSizeChecker.InsertSizeChecker()
         insertSizeChecker.checkInsertSize(lib.outputDir, lib.rawForward, lib.rawReversed, self.assembly, lib.libName, lib.insertSize)
 def doGenomeSizeEstimation(self, outputDir, pool):
     """
     The method doGenomeSizeEstimation contains the mainflow of the genomesize estimation. This mainflow contains the following methods:
     * Execute Jellyfish count
     * Execute Jellyfish stats
     * Create a histogram of the unique kmers with Jellyfish histo
     * Draw a histogram of the unique kmers
     * Estimate the genome size with the BGI method

     The output directory is created when it does not exist yet. The intermediate files, the
     path of the plot and the peak are stored on self, after which this object is appended to
     the objects of the Reporter singleton.

     :param outputDir: directory for all output; it is concatenated directly with the name of
         the plot, so it is expected to end with a path separator.
     :param pool: the pool which is passed to Jellyfish count and the genome size calculation.
     """
     logging.info("Starting genome size estimation")
     if not os.path.isdir(outputDir):
         os.makedirs(outputDir)

     # Plain title without a closing brace, like the other ltxSection calls
     LaTeX.ltxSection("Genome size estimation")
     self.jellyFishCountsFile = JellyFish.JellyFishCount(outputDir, pool=pool).execute()
     self.jellyFishStatsFile = JellyFish.JellyFishStats(outputDir, jellyFishCountsFile=self.jellyFishCountsFile).execute()
     self.jellyfishHistoFile = JellyFish.JellyFishHisto(outputDir, jellyFishCountsFile=self.jellyFishCountsFile).execute()
     self.genSizeHistoPlot = outputDir + "kmer_graph.png"
     self.peak = int(self.drawHisto(self.jellyfishHistoFile, self.genSizeHistoPlot))
     self.calculateGenomeSize(pool, self.jellyFishStatsFile, self.jellyfishHistoFile)
     Reporter.instance.objects.append(self)
 def getLaTeXReport(self):
     """
     Return the content of self.fastqInfo as a LaTeX table, followed by a LaTeX line break.

     The table has one column more than the number of values of the first entry, because the
     key of an entry is written in front of its values. The first row contains the headers.
     """
     header = [
         "Fastq file",
         "Total bases",
         "number of reads",
         "AVG read length",
         "Percentage high quality bases ($qv > 30$)",
     ]
     columnCount = len(self.fastqInfo.values()[0]) + 1
     report = LaTeX.ltxTable(columnCount)
     report.addRow(header)
     for fastqName, stats in self.fastqInfo.iteritems():
         report.addRow([fastqName] + stats)
     return "" + report.getText() + "\\\\"
 def getLaTeXReport(self):
     """
     Convert all previously calculated statistics into LaTeX with this method.

     The report consists of a "Statistics" subsection with a two column table, followed by a
     figure which includes the image in self.a50Plot. The sequence statistics are always
     written. The cegma scores, the DNA mapping statistics and the RNA mapping statistics are
     only written when the corresponding attribute exists on this object; the properly paired
     percentage only when the key "propPair" is available in the mapping statistics.

     :return: the LaTeX text of the report
     """
     # Escaped percent sign for LaTeX; a plain "%" would start a comment in the document
     percent = "\\%"
     txt = "\\subsection{Statistics}\n"
     table = LaTeX.ltxTable(2)
     table.addRow(["Total sequences: ",str(self.totalSeqs)])
     table.addRow(["Total length: ","{:,}".format(self.totalLen)])
     table.addRow(["GC perc: ","{:.2f}".format(self.gcPerc) + percent])
     table.addRow(["Longest sequence: ","{:,}".format(self.longestSeq)])
     table.addRow(["N50 index: ","{:,}".format(self.n50Index)])
     table.addRow(["N50: ","{:,}".format(self.n50)])
     table.addRow(["",""])
     table.addRow(["N90 index: ","{:,}".format(self.n90Index)])
     table.addRow(["N90: ","{:,}".format(self.n90)])
     if hasattr(self, "cegmaScore"):
         table.addRow(["",""])
         table.addRow(["Cegma complete: ",self.cegmaScore[0] + percent])
         table.addRow(["Cegma partial: ",self.cegmaScore[1] + percent])
     # Every additional score set gets its own block, separated by an empty row
     for name, value in self.otherCegmaScores.iteritems():
         table.addRow(["",""])
         table.addRow([name + " complete: ",value[0] + percent])
         table.addRow([name + " partial: ",value[1] + percent])

     if hasattr(self, "rawDnaMappingStats"):
         table.addRow(["",""])
         table.addRow(["DNA reads: ","{:,}".format(int(self.rawDnaMappingStats["total"]))])
         table.addRow(["Mapped: ",self.rawDnaMappingStats["mapped"] + percent])
         if "propPair" in self.rawDnaMappingStats:
             table.addRow(["Properly paired: ",self.rawDnaMappingStats["propPair"] + percent])
         table.addRow(["Error rate: ","{:.2f}".format(self.errorRate) + " SNPs per 10kb"])
         table.addRow(["SNP density: ","{:.2f}".format(self.snpDensity) + " SNPs per 10kb"])
     if hasattr(self, "rnaMappingStats"):
         table.addRow(["",""])
         table.addRow(["RNA reads: ","{:,}".format(int(self.rnaMappingStats["total"]))])
         table.addRow(["Mapped: ",self.rnaMappingStats["mapped"] + percent])
         if "propPair" in self.rnaMappingStats:
             table.addRow(["Properly paired: ",self.rnaMappingStats["propPair"] + percent])

     txt = txt + table.getText()
     txt = txt + "\\begin{figure}[h]\n"
     txt = txt + "\\includegraphics[scale=0.7]{" + self.a50Plot + "}\n"
     txt = txt + "\\end{figure}\n"
     return txt