def find_analyses(target, recordsToAnalyze, templateFastqFiles,
                  complementFastqFiles, references, outputDir):
    """Schedules one `analyze` child target per requested read, then a
    `merge` follow-on.

    Every record of every template fastq file, then of every complement fastq
    file, is checked against recordsToAnalyze. For each match the matching
    reference slice is extracted and an `analyze` child target is added.

    target -- the running target; supplies the global temp dir and receives
        the child / follow-on targets
    recordsToAnalyze -- mapping of read name to (ref_name, ref_start, ref_stop)
    templateFastqFiles, complementFastqFiles -- iterables of values handed to
        fastqRead unchanged
    references -- mapping of reference name to a sliceable sequence
    outputDir -- passed through to `merge` together with the per-type lists
        of output paths
    """
    files = {"template": [], "complement": []}
    # (key into files, output file prefix, log message, fastq files) per pass;
    # template is processed first, complement second.
    passes = (
        ("template", "template_", "Finding template analyses",
         templateFastqFiles),
        ("complement", "complement_", "Finding complement analyses",
         complementFastqFiles),
    )
    for readType, prefix, message, fastqFiles in passes:
        logger.info(message)
        for fastqFile in fastqFiles:
            for name, seq, qual in fastqRead(fastqFile):
                if name not in recordsToAnalyze:
                    continue
                outfile = os.path.join(target.getGlobalTempDir(),
                                       prefix + name)
                files[readType].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start:ref_stop]
                target.addChildTarget(Target.makeTargetFn(
                    analyze, args=[name, seq, ref_name, ref_seq, outfile]))
    target.setFollowOnTargetFn(merge, args=(files, outputDir))
def find_analyses(target, recordsToAnalyze, templateFastqFiles,
                  complementFastqFiles, references, outputDir):
    """Finds the sequences of the records to analyze and creates one `analyze`
    child target for each, followed by a `merge` follow-on target.

    recordsToAnalyze maps a read name to (ref_name, ref_start, ref_stop);
    references maps a reference name to a sliceable sequence. The output paths
    handed to the child targets are collected per read type and passed to
    `merge` along with outputDir.
    """
    files = {"template": [], "complement": []}

    def schedule(fastqFiles, outfiles, prefix):
        # Adds a child target for every wanted record found in fastqFiles and
        # remembers the path that child is given in outfiles.
        for fastqFile in fastqFiles:
            for name, seq, qual in fastqRead(fastqFile):
                if name in recordsToAnalyze:
                    outfile = os.path.join(target.getGlobalTempDir(),
                                           prefix + name)
                    outfiles.append(outfile)
                    ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                    ref_seq = references[ref_name][ref_start:ref_stop]
                    child = Target.makeTargetFn(
                        analyze,
                        args=[name, seq, ref_name, ref_seq, outfile])
                    target.addChildTarget(child)

    logger.info("Finding template analyses")
    schedule(templateFastqFiles, files["template"], "template_")
    logger.info("Finding complement analyses")
    schedule(complementFastqFiles, files["complement"], "complement_")
    target.setFollowOnTargetFn(merge, args=(files, outputDir))
def getFastqDictionary(fastqFile):
    """Returns a dictionary of the first words of fastq headers to their
    corresponding fastq sequence.

    The file is parsed once and its handle is closed before returning.

    fastqFile -- path of the fastq file to read

    Raises AssertionError if two records share the same first header word.
    """
    with open(fastqFile, 'r') as fileHandle:
        # Build the list inside the with-block so the records are fully read
        # before the handle is closed.
        namesAndSequences = [(record[0].split()[0], record[1])
                             for record in fastqRead(fileHandle)]
    sequencesByName = dict(namesAndSequences)  #Hash of names to sequences
    # dict() silently collapses repeated keys, so a size mismatch means a
    # name occurred more than once.
    assert len(sequencesByName) == len(namesAndSequences), \
        "Duplicate sequence names in fastq file: %s" % fastqFile
    return sequencesByName
def __init__(self, outputDir, experiments):
    """Builds self.reads: one Read per distinct (name, fastq file, read type,
    sequence) found in the experiments' fastq files.

    Each Read carries the (mappers, referenceFastaFiles) pair of tuples taken
    from the mapped records of the experiments' "mapping.sam" files, or None
    if no mapped record has that read name for that fastq file.
    """
    AbstractMetaAnalysis.__init__(self, outputDir, experiments)
    # NOTE(review): self.experiments is presumably set by the base-class
    # initialiser above -- confirm.

    # Distinct reads across all experiments; a set, so the iteration order
    # (and hence the order of self.reads) is arbitrary.
    allReads = set()
    for readFastqFile, readType, referenceFastaFile, mapper, analyses, resultsDir in self.experiments:
        for name, seq, qual in fastqRead(readFastqFile):
            allReads.add((name, readFastqFile, readType, seq))

    # (read name, fastq file) -> set of (mapper name, reference fasta file)
    mappedReads = dict()
    for readFastqFile, readType, referenceFastaFile, mapper, analyses, resultsDir in self.experiments:
        samPath = os.path.join(resultsDir, "mapping.sam")
        for record in samIterator(pysam.Samfile(samPath)):
            if record.is_unmapped:
                continue
            hit = (mapper.__name__, referenceFastaFile)
            mappedReads.setdefault((record.qname, readFastqFile), set()).add(hit)

    self.reads = list()
    for name, readFastqFile, readType, seq in allReads:
        hits = mappedReads.get((name, readFastqFile))
        if hits is None:
            self.reads.append(Read(name, seq, readType, readFastqFile, None))
            continue
        # Transpose the set of pairs into two parallel tuples.
        mappers, referenceFastaFiles = map(tuple, zip(*hits))
        self.reads.append(Read(name, seq, readType, readFastqFile,
                               (mappers, referenceFastaFiles)))
def fastq_read_size(fastq_path, num_reads=10000):
    """Returns the mean sequence length of the first num_reads reads of a
    gzip-compressed fastq file.

    If the file holds fewer than num_reads reads, the mean is taken over the
    reads that exist instead of failing with StopIteration.

    fastq_path -- path of the gzipped fastq file
    num_reads -- maximum number of reads to sample from the start of the file

    Raises ValueError if no read could be sampled (empty file or
    num_reads == 0).
    """
    # Local import so the block does not depend on a module-level import.
    from itertools import islice

    total_length = 0
    reads_seen = 0
    fastq_file = gzip.open(fastq_path)
    try:
        # islice stops at num_reads or at end of file, whichever comes first.
        for name, seq, qual in islice(fastqRead(fastq_file), num_reads):
            total_length += len(seq)
            reads_seen += 1
    finally:
        fastq_file.close()
    if reads_seen == 0:
        raise ValueError("No reads could be sampled from %s" % fastq_path)
    return 1.0 * total_length / reads_seen
def getFastqDictionary(fastqFile):
    """Returns a dictionary of the first words of fastq headers to their
    corresponding fastq sequence.

    fastqFile -- path of the fastq file to read; the handle is closed before
        returning

    Raises AssertionError if two records share the same first header word.
    """
    with open(fastqFile, 'r') as fileHandle:
        # A real list (not a lazy map) so it can be both measured and turned
        # into a dict, and so reading finishes before the handle is closed.
        namesAndSequences = [(record[0].split()[0], record[1])
                             for record in fastqRead(fileHandle)]
    names = [name for name, seq in namesAndSequences]
    assert len(names) == len(set(names))  #Check all the names are unique
    return dict(namesAndSequences)  #Hash of names to sequences
def normaliseQualValues(inputFastqFile, outputFastqFile):
    """Makes a fastq with valid qual values.

    Copies every record of inputFastqFile to outputFastqFile. A record whose
    quality values are None is written with a quality of 33 for every base of
    its sequence.

    inputFastqFile -- path of the fastq file to read
    outputFastqFile -- path of the fastq file to write (overwritten)

    Returns outputFastqFile.
    """
    # The output is opened first, as before, so it is created or truncated
    # even if opening the input fails. Both handles are closed on any exit.
    with open(outputFastqFile, 'w') as outputHandle:
        with open(inputFastqFile, 'r') as inputHandle:
            for name, seq, quals in fastqRead(inputHandle):
                if quals is None:
                    quals = [33] * len(seq)
                fastqWrite(outputHandle, name, seq, quals)
    return outputFastqFile
def run(self):
    """Feeds every alignment of self.samFile into one overall IndelCounter and
    writes that counter's XML to "indels.xml" in self.outputDir.

    Reads self.referenceFastaFile, self.readFastqFile, self.samFile and
    self.outputDir. Raises KeyError if an alignment names a reference or read
    that is missing from the fasta / fastq file.
    """
    with open(self.referenceFastaFile, 'r') as fastaHandle:
        refSequences = dict(fastaRead(fastaHandle))  #Hash of names to sequences
    readSequences = {name: seq for name, seq, quals
                     in fastqRead(self.readFastqFile)}  #Hash of names to sequences
    overallIndelCounter = IndelCounter("overall", "overall")
    sam = pysam.Samfile(self.samFile, "r")
    try:
        for aR in sam:  #Iterate on the sam lines
            refSeq = refSequences[sam.getrname(aR.rname)]
            readSeq = readSequences[aR.qname]
            overallIndelCounter.addReadAlignment(aR, refSeq, readSeq)
    finally:
        # Close the sam file even if a lookup or the counter raises.
        sam.close()
    #Write out the indel info
    with open(os.path.join(self.outputDir, "indels.xml"), 'w') as xmlHandle:
        xmlHandle.write(prettyXml(overallIndelCounter.getXML()))
def makeFastqSequenceNamesUnique(inputFastqFile, outputFastqFile):
    """Makes a fastq file with unique names.

    Each record name is cut down to its first whitespace-separated word. If
    that name was already written, "i" is appended until it is unused, and
    each clash is logged at critical level.

    inputFastqFile -- path of the fastq file to read
    outputFastqFile -- path of the fastq file to write (overwritten)

    Returns outputFastqFile.
    """
    names = set()
    # The output is opened first, as before; both handles are closed on any
    # exit, including errors.
    with open(outputFastqFile, 'w') as outputHandle:
        with open(inputFastqFile, 'r') as inputHandle:
            for name, seq, quals in fastqRead(inputHandle):
                name = name.split()[0]  #Get rid of any white space
                while name in names:
                    logger.critical("Got a duplicate fastq sequence name: %s" % name)
                    name += "i"
                names.add(name)
                fastqWrite(outputHandle, name, seq, quals)
    return outputFastqFile
def run(self):
    """Builds an overall CoverageCounter plus one CoverageCounter per
    alignment in self.samFile, and writes them as a single XML tree to
    "coverages.xml" in self.outputDir.

    Reads self.referenceFastaFile, self.readFastqFile, self.samFile and
    self.outputDir. Raises KeyError if an alignment names a reference or read
    that is missing from the fasta / fastq file.
    """
    with open(self.referenceFastaFile, 'r') as fastaHandle:
        refSequences = dict(fastaRead(fastaHandle))  #Hash of names to sequences
    readSequences = {name: seq for name, seq, quals
                     in fastqRead(self.readFastqFile)}  #Hash of names to sequences
    overallCoverageCounter = CoverageCounter("overall", "overall")  #Thing to store the overall coverage in
    readCoverages = []
    sam = pysam.Samfile(self.samFile, "r")
    try:
        for aR in sam:  #Iterate on the sam lines
            refName = sam.getrname(aR.rname)
            refSeq = refSequences[refName]
            readSeq = readSequences[aR.qname]
            overallCoverageCounter.addReadAlignment(aR, refSeq, readSeq)
            readCoverage = CoverageCounter(aR.qname, refName)
            readCoverages.append(readCoverage)
            readCoverage.addReadAlignment(aR, refSeq, readSeq)
    finally:
        # Close the sam file even if a lookup or a counter raises.
        sam.close()
    #Write out the coverage info
    parentNode = overallCoverageCounter.getXML()
    for readCoverage in readCoverages:
        parentNode.append(readCoverage.getXML())
    with open(os.path.join(self.outputDir, "coverages.xml"), 'w') as xmlHandle:
        xmlHandle.write(prettyXml(parentNode))
def countKmers(self):
    """Counts the k-mers of length self.kmerSize in the reference and in the
    reads.

    Every window that does not contain "N" is counted once as-is and once as
    its reverse complement. Sequences shorter than self.kmerSize contribute
    nothing.

    Returns a tuple (refKmers, readKmers) of Counters keyed by k-mer string.
    """
    kmerSize = self.kmerSize

    def tally(seq, kmerCounts):
        # i is the exclusive end of the window, so it has to reach len(seq);
        # stopping one short would drop the final k-mer of every sequence.
        for i in xrange(kmerSize, len(seq) + 1):
            s = seq[i - kmerSize:i]
            if "N" not in s:
                kmerCounts[s] += 1
                kmerCounts[reverseComplement(s)] += 1

    refKmers, readKmers = Counter(), Counter()
    for name, seq in fastaRead(self.referenceFastaFile):
        tally(seq, refKmers)
    for name, seq, qual in fastqRead(self.readFastqFile):
        tally(seq, readKmers)
    return (refKmers, readKmers)
def __init__(self, outputDir, experiments):
    """Collects every read of every experiment into self.reads.

    A read is identified by (name, fastq file, read type, sequence). Reads
    with at least one mapped record in an experiment's "mapping.sam" get a
    (mappers, referenceFastaFiles) pair of tuples; all others get None.
    """
    AbstractMetaAnalysis.__init__(self, outputDir, experiments)
    # NOTE(review): self.experiments is presumably populated by the call
    # above -- confirm against AbstractMetaAnalysis.

    allReads = set()
    for readFastqFile, readType, referenceFastaFile, mapper, analyses, resultsDir in self.experiments:
        allReads.update((name, readFastqFile, readType, seq)
                        for name, seq, qual in fastqRead(readFastqFile))

    # Which (mapper name, reference fasta) pairs mapped each read
    mappedReads = dict()
    for readFastqFile, readType, referenceFastaFile, mapper, analyses, resultsDir in self.experiments:
        samFile = pysam.Samfile(os.path.join(resultsDir, "mapping.sam"))
        for record in samIterator(samFile):
            if not record.is_unmapped:
                key = (record.qname, readFastqFile)
                hit = (mapper.__name__, referenceFastaFile)
                if key in mappedReads:
                    mappedReads[key].add(hit)
                else:
                    mappedReads[key] = {hit}

    self.reads = list()
    for name, readFastqFile, readType, seq in allReads:
        key = (name, readFastqFile)
        mapping = None
        if key in mappedReads:
            # Split the set of pairs into a tuple of mappers and a tuple of
            # reference files.
            mappers, referenceFastaFiles = [tuple(column) for column
                                            in zip(*mappedReads[key])]
            mapping = (mappers, referenceFastaFiles)
        self.reads.append(Read(name, seq, readType, readFastqFile, mapping))
def main():
    """Runs BLAST on the reads no combined analysis mapped, then reports on
    the results per read type.

    Steps:
      1. Parse the jobTree command-line options and configure logging.
      2. Locate the processed read fastq files, the reference fasta files and
         the mapping.sam file of every (read file, reference, combined
         analysis) combination. All paths are relative to the working
         directory.
      3. Collect the mapped (name, fastq file) pairs and the sequences of the
         unmapped reads, by read type.
      4. Run find_analyses over the unmapped reads as a jobTree.
      5. For each read type, write the reads without a BLAST hit to a fasta
         file, write a BLAST report, write the three counts, and call the
         barplot R script.

    Raises RuntimeError if the jobTree reports failed jobs.
    """
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "blast_combined/output/"
    if not os.path.exists(outputDir):
        # The placeholder is now filled in with the directory name.
        logger.info("Output dir {} does not exist. Creating.".format(outputDir))
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    #find all read fastq files, load into a dict by read type
    readFastqFiles = dict()
    for readType in readTypes:
        readDir = os.path.join("../output/processedReadFastqFiles/", readType)
        readFastqFiles[readType] = [os.path.join(readDir, x)
                                    for x in os.listdir(readDir)
                                    if x.endswith((".fq", ".fastq"))]

    #find all reference fasta files
    referenceFastaFiles = [x for x in os.listdir("../referenceFastaFiles")
                           if x.endswith((".fasta", ".fa"))]

    #find all sam files that were analyzed using combinedAnalyses
    samFiles = {}
    for readType in readTypes:
        samFiles[readType] = [
            (readFastqFile,
             os.path.join("../output", "analysis_" + readType,
                          "experiment_" + os.path.basename(readFastqFile) +
                          "_" + referenceFastaFile + "_" + analysis,
                          "mapping.sam"))
            for readFastqFile, referenceFastaFile, analysis
            in product(readFastqFiles[readType], referenceFastaFiles,
                       combinedAnalyses)]

    #(read name, fastq file name) of every read mapped by any sam file
    mappedByReadType = defaultdict(set)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            sam = pysam.Samfile(samFile)
            try:
                # Update in place instead of copying the growing set with
                # union() for every sam file.
                mappedByReadType[readType].update(
                    (x.qname, readFastqFile) for x in sam
                    if not x.is_unmapped)
            finally:
                sam.close()

    #(read name, fastq file name) -> sequence of every read that never mapped
    unmappedByReadType = defaultdict(dict)
    for readType in readTypes:
        # The same fastq file appears once per (reference, analysis) pair in
        # samFiles; parsing it again would only repeat the same assignments.
        scannedFastqFiles = set()
        for readFastqFileFullPath, samFile in samFiles[readType]:
            if readFastqFileFullPath in scannedFastqFiles:
                continue
            scannedFastqFiles.add(readFastqFileFullPath)
            readFastqFile = os.path.basename(readFastqFileFullPath)
            for name, seq, qual in fastqRead(readFastqFileFullPath):
                name = name.split(" ")[0]
                if (name, readFastqFile) not in mappedByReadType[readType]:
                    unmappedByReadType[readType][(name, readFastqFile)] = seq

    i = Stack(Target.makeTargetFn(
        find_analyses,
        args=(unmappedByReadType, outputDir))).startJobTree(options)
    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))

    for readType in readTypes:
        #build a counter of blast hits and set of read names that did not map
        blast_hits, no_hits = Counter(), set()
        with open(os.path.join(outputDir, readType + "_blast_out.txt")) as blast_in:
            for query, result in parse_blast(blast_in):
                if result is None:
                    no_hits.add(query)
                else:
                    blast_hits[tuple(result)] += 1  #count number of times each hit was seen

        #write the unmapped hits to a fasta file
        with open(os.path.join(outputDir, readType + "_no_hits.fasta"), "w") as outf:
            for (name, readFastqFile), seq in unmappedByReadType[readType].items():
                if name in no_hits:
                    outf.write(">{}\n{}\n".format(name, seq))

        #write the blast report, most frequent hit first
        with open(os.path.join(outputDir, readType + "_blast_report.txt"), "w") as blast_out:
            blast_out.write("gi|##|gb|##|\tSpecies\tseqID\tCount\n")  #header to output
            for result, count in sorted(blast_hits.items(),
                                        key=lambda x: -int(x[-1])):
                blast_out.write("{}\t{}\n".format("\t".join(result), count))

        #raw counts (not percentages) are written out and passed to the barplot script
        blast_count = sum(blast_hits.values())
        unmapped_count = len(unmappedByReadType[readType]) - blast_count
        mapped_count = len(mappedByReadType[readType])
        with open(os.path.join(outputDir, readType + "percents.txt"), "w") as outf:
            outf.write("\n".join(map(str, [blast_count, unmapped_count,
                                           mapped_count])))
        system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(
            blast_count, unmapped_count, mapped_count, readType,
            os.path.join(outputDir, readType + "_blast_barplot.pdf")))
def main():
    """Runs BLAST on the reads no combined analysis mapped, then reports on
    the results per read type.

    Steps:
      1. Parse the jobTree command-line options and configure logging.
      2. Locate the processed read fastq files, the reference fasta files and
         the mapping.sam file of every (read file, reference, combined
         analysis) combination. All paths are relative to the working
         directory.
      3. Collect the mapped (name, fastq file) pairs and the sequences of the
         unmapped reads, by read type.
      4. Run find_analyses over the unmapped reads as a jobTree.
      5. For each read type, write the reads without a BLAST hit to a fasta
         file, write a BLAST report, write the three counts, and call the
         barplot R script.

    Raises RuntimeError if the jobTree reports failed jobs.
    """
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "blast_combined/output/"
    if not os.path.exists(outputDir):
        # The placeholder is now filled in with the directory name.
        logger.info("Output dir {} does not exist. Creating.".format(outputDir))
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    #find all read fastq files, load into a dict by read type
    readFastqFiles = dict()
    for readType in readTypes:
        readDir = os.path.join("../output/processedReadFastqFiles/", readType)
        readFastqFiles[readType] = [os.path.join(readDir, x)
                                    for x in os.listdir(readDir)
                                    if x.endswith((".fq", ".fastq"))]

    #find all reference fasta files
    referenceFastaFiles = [x for x in os.listdir("../referenceFastaFiles")
                           if x.endswith((".fasta", ".fa"))]

    #find all sam files that were analyzed using combinedAnalyses
    samFiles = {}
    for readType in readTypes:
        samFiles[readType] = [
            (readFastqFile,
             os.path.join("../output", "analysis_" + readType,
                          "experiment_" + os.path.basename(readFastqFile) +
                          "_" + referenceFastaFile + "_" + analysis,
                          "mapping.sam"))
            for readFastqFile, referenceFastaFile, analysis
            in product(readFastqFiles[readType], referenceFastaFiles,
                       combinedAnalyses)]

    #(read name, fastq file name) of every read mapped by any sam file
    mappedByReadType = defaultdict(set)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            sam = pysam.Samfile(samFile)
            try:
                # Update in place instead of copying the growing set with
                # union() for every sam file.
                mappedByReadType[readType].update(
                    (x.qname, readFastqFile) for x in sam
                    if not x.is_unmapped)
            finally:
                sam.close()

    #(read name, fastq file name) -> sequence of every read that never mapped
    unmappedByReadType = defaultdict(dict)
    for readType in readTypes:
        # The same fastq file appears once per (reference, analysis) pair in
        # samFiles; parsing it again would only repeat the same assignments.
        scannedFastqFiles = set()
        for readFastqFileFullPath, samFile in samFiles[readType]:
            if readFastqFileFullPath in scannedFastqFiles:
                continue
            scannedFastqFiles.add(readFastqFileFullPath)
            readFastqFile = os.path.basename(readFastqFileFullPath)
            for name, seq, qual in fastqRead(readFastqFileFullPath):
                name = name.split(" ")[0]
                if (name, readFastqFile) not in mappedByReadType[readType]:
                    unmappedByReadType[readType][(name, readFastqFile)] = seq

    i = Stack(Target.makeTargetFn(
        find_analyses,
        args=(unmappedByReadType, outputDir))).startJobTree(options)
    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))

    for readType in readTypes:
        #build a counter of blast hits and set of read names that did not map
        blast_hits, no_hits = Counter(), set()
        with open(os.path.join(outputDir, readType + "_blast_out.txt")) as blast_in:
            for query, result in parse_blast(blast_in):
                if result is None:
                    no_hits.add(query)
                else:
                    blast_hits[tuple(result)] += 1  #count number of times each hit was seen

        #write the unmapped hits to a fasta file
        with open(os.path.join(outputDir, readType + "_no_hits.fasta"), "w") as outf:
            for (name, readFastqFile), seq in unmappedByReadType[readType].items():
                if name in no_hits:
                    outf.write(">{}\n{}\n".format(name, seq))

        #write the blast report, most frequent hit first
        with open(os.path.join(outputDir, readType + "_blast_report.txt"), "w") as blast_out:
            blast_out.write("gi|##|gb|##|\tSpecies\tseqID\tCount\n")  #header to output
            for result, count in sorted(blast_hits.items(),
                                        key=lambda x: -int(x[-1])):
                blast_out.write("{}\t{}\n".format("\t".join(result), count))

        #raw counts (not percentages) are written out and passed to the barplot script
        blast_count = sum(blast_hits.values())
        unmapped_count = len(unmappedByReadType[readType]) - blast_count
        mapped_count = len(mappedByReadType[readType])
        with open(os.path.join(outputDir, readType + "percents.txt"), "w") as outf:
            outf.write("\n".join(map(str, [blast_count, unmapped_count,
                                           mapped_count])))
        system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(
            blast_count, unmapped_count, mapped_count, readType,
            os.path.join(outputDir, readType + "_blast_barplot.pdf")))