Example #1
def find_analyses(target, recordsToAnalyze, templateFastqFiles, complementFastqFiles, references, outputDir):
    """takes a set of records to analyze and finds the corresponding sequences and creates alignment targets"""
    files = {"template":[], "complement":[]}

    logger.info("Finding template analyses")
    for fastqFile in templateFastqFiles:
        for name, seq, qual in fastqRead(fastqFile):
            if name in recordsToAnalyze:
                outfile = os.path.join(target.getGlobalTempDir(), "template_" + name)
                files["template"].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start : ref_stop]
                analysis = [name, seq, ref_name, ref_seq, outfile]
                target.addChildTarget(Target.makeTargetFn(analyze, args=analysis))

    logger.info("Finding complement analyses")
    for fastqFile in complementFastqFiles:
        for name, seq, qual in fastqRead(fastqFile):
            if name in recordsToAnalyze:
                outfile = os.path.join(target.getGlobalTempDir(), "complement_" + name)
                files["complement"].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start : ref_stop]
                analysis = [name, seq, ref_name, ref_seq, outfile]
                target.addChildTarget(Target.makeTargetFn(analyze, args=analysis))

    target.setFollowOnTargetFn(merge, args=(files, outputDir))
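
All of these examples revolve around the same fastqRead helper, imported from the project's bioio utilities. For reference, here is a minimal sketch of the interface they appear to assume, not the library's actual source: a generator yielding one (name, seq, quals) triple per four-line FASTQ record, with quals as a list of integer ordinals.

def fastqRead(fileHandleOrFile):
    """Minimal sketch of the assumed interface, not the real implementation:
    yields (name, seq, quals) per 4-line FASTQ record."""
    fileHandle = open(fileHandleOrFile) if isinstance(fileHandleOrFile, basestring) else fileHandleOrFile
    while True:
        header = fileHandle.readline().rstrip()
        if not header:
            break
        assert header.startswith("@"), "Bad FASTQ header: %s" % header
        seq = fileHandle.readline().rstrip()
        fileHandle.readline()  #the '+' separator line is discarded
        quals = [ord(c) for c in fileHandle.readline().rstrip()]
        yield header[1:], seq, quals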
Example #2
def find_analyses(target, recordsToAnalyze, templateFastqFiles,
                  complementFastqFiles, references, outputDir):
    """takes a set of records to analyze and finds the corresponding sequences and creates alignment targets"""
    files = {"template": [], "complement": []}

    logger.info("Finding template analyses")
    for fastqFile in templateFastqFiles:
        for name, seq, qual in fastqRead(fastqFile):
            if name in recordsToAnalyze:
                outfile = os.path.join(target.getGlobalTempDir(),
                                       "template_" + name)
                files["template"].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start:ref_stop]
                analysis = [name, seq, ref_name, ref_seq, outfile]
                target.addChildTarget(
                    Target.makeTargetFn(analyze, args=analysis))

    logger.info("Finding complement analyses")
    for fastqFile in complementFastqFiles:
        for name, seq, qual in fastqRead(fastqFile):
            if name in recordsToAnalyze:
                outfile = os.path.join(target.getGlobalTempDir(),
                                       "complement_" + name)
                files["complement"].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start:ref_stop]
                analysis = [name, seq, ref_name, ref_seq, outfile]
                target.addChildTarget(
                    Target.makeTargetFn(analyze, args=analysis))

    target.setFollowOnTargetFn(merge, args=(files, outputDir))
Example #3
def getFastqDictionary(fastqFile):
    """Returns a dictionary of the first words of fastq headers to their corresponding fastq sequence
    """
    names = map(lambda x: x[0].split()[0], fastqRead(open(fastqFile, 'r')))
    assert len(names) == len(set(names))  #Check all the names are unique
    return dict(
        map(lambda x: (x[0].split()[0], x[1]),
            fastqRead(open(fastqFile, 'r'))))  #Hash of names to sequences
Example #4
    def __init__(self, outputDir, experiments):
        AbstractMetaAnalysis.__init__(self, outputDir, experiments)

        allReads = {(name, readFastqFile, readType, seq) for readFastqFile, readType, referenceFastaFile, mapper, analyses, resultsDir \
            in self.experiments for name, seq, qual in fastqRead(readFastqFile)}

        mappedReads = dict()
        for readFastqFile, readType, referenceFastaFile, mapper, analyses, resultsDir in self.experiments:
            for record in samIterator(
                    pysam.Samfile(os.path.join(resultsDir, "mapping.sam"))):
                if not record.is_unmapped:
                    if (record.qname, readFastqFile) not in mappedReads:
                        mappedReads[(record.qname, readFastqFile)] = set()
                    mappedReads[(record.qname, readFastqFile)].add(
                        (mapper.__name__, referenceFastaFile))

        self.reads = list()
        for name, readFastqFile, readType, seq in allReads:
            if (name, readFastqFile) in mappedReads:
                mappers, referenceFastaFiles = map(
                    tuple, zip(*mappedReads[(name, readFastqFile)]))
                self.reads.append(
                    Read(name, seq, readType, readFastqFile,
                         (mappers, referenceFastaFiles)))
            else:
                self.reads.append(
                    Read(name, seq, readType, readFastqFile, None))
Example #5
def fastq_read_size(fastq_path, num_reads=10000):
    """Returns the mean read length over (up to) the first num_reads records"""
    sizes = []
    fastq_handle = fastqRead(gzip.open(fastq_path))
    for i in xrange(num_reads):
        try:
            name, seq, qual = fastq_handle.next()
        except StopIteration:  #stop cleanly if the file has fewer than num_reads records
            break
        sizes.append(len(seq))
    return 1.0 * sum(sizes) / len(sizes)
Example #6
def getFastqDictionary(fastqFile):
    """Returns a dictionary of the first words of fastq headers to their corresponding 
    fastq sequence
    """
    namesAndSequences = map(lambda x : (x[0].split()[0], x[1]), fastqRead(open(fastqFile, 'r')))
    names = map(lambda x : x[0], namesAndSequences)
    assert len(names) == len(set(names)) #Check all the names are unique
    return dict(namesAndSequences) #Hash of names to sequences
Example #7
def normaliseQualValues(inputFastqFile, outputFastqFile):
    """Makes a fastq with valid qual values
    """
    fileHandle = open(outputFastqFile, 'w')
    for name, seq, quals in fastqRead(open(inputFastqFile, 'r')):
        if quals is None:
            quals = [33] * len(seq)
        fastqWrite(fileHandle, name, seq, quals)
    fileHandle.close()
    return outputFastqFile
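
normaliseQualValues writes records back out through a fastqWrite counterpart that is not shown. A plausible minimal sketch, assuming quals is a list of integer ordinals as in the reader sketch above (an assumption, not the library's code):

def fastqWrite(fileHandle, name, seq, quals):
    """Sketch of the assumed counterpart to fastqRead: writes one 4-line
    FASTQ record, encoding integer quals back to ASCII characters."""
    assert len(quals) == len(seq)
    fileHandle.write("@%s\n%s\n+\n%s\n" % (name, seq, "".join(chr(q) for q in quals)))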
Example #9
    def run(self):
        refSequences = dict(fastaRead(open(self.referenceFastaFile, 'r'))) #Hash of names to sequences
        readSequences = dict([(name, seq) for name, seq, quals in fastqRead(self.readFastqFile)]) #Hash of names to sequences
        sam = pysam.Samfile(self.samFile, "r")
        overallIndelCounter = IndelCounter("overall", "overall")
        for aR in sam: #Iterate on the sam lines
            refSeq = refSequences[sam.getrname(aR.rname)]
            readSeq = readSequences[aR.qname]
            overallIndelCounter.addReadAlignment(aR, refSeq, readSeq)
        sam.close()
        #Write out the indel info
        open(os.path.join(self.outputDir, "indels.xml"), 'w').write(prettyXml(overallIndelCounter.getXML()))
Example #10
def makeFastqSequenceNamesUnique(inputFastqFile, outputFastqFile):
    """Makes a fastq file with unique names
    """
    names = set()
    fileHandle = open(outputFastqFile, 'w')
    for name, seq, quals in fastqRead(open(inputFastqFile, 'r')):
        name = name.split()[0]  #Get rid of any white space
        while name in names:
            logger.critical("Got a duplicate fastq sequence name: %s" % name)
            name += "i"
        names.add(name)
        fastqWrite(fileHandle, name, seq, quals)
    fileHandle.close()
    return outputFastqFile
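
To make the renaming scheme above concrete, here is the same while-loop traced on hypothetical duplicate headers:

names = set()
for raw in ["read1", "read1", "read1"]:  #hypothetical duplicate headers
    name = raw
    while name in names:
        name += "i"
    names.add(name)
assert names == set(["read1", "read1i", "read1ii"])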
Example #12
    def run(self):
        refSequences = dict(fastaRead(open(self.referenceFastaFile, 'r'))) #Hash of names to sequences
        readSequences = dict([(name, seq) for name, seq, quals in fastqRead(self.readFastqFile)]) #Hash of names to sequences
        overallCoverageCounter = CoverageCounter("overall", "overall") #Thing to store the overall coverage in
        readCoverages = []
        sam = pysam.Samfile(self.samFile, "r")
        for aR in sam: #Iterate on the sam lines
            refSeq = refSequences[sam.getrname(aR.rname)]
            readSeq = readSequences[aR.qname]
            overallCoverageCounter.addReadAlignment(aR, refSeq, readSeq)
            readCoverages.append(CoverageCounter(aR.qname, sam.getrname(aR.rname)))
            readCoverages[-1].addReadAlignment(aR, refSeq, readSeq)
        sam.close()
        #Write out the coverage info
        parentNode = overallCoverageCounter.getXML()
        for readCoverage in readCoverages:
            parentNode.append(readCoverage.getXML())
        open(os.path.join(self.outputDir, "coverages.xml"), 'w').write(prettyXml(parentNode))
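
Examples #9 and #12 both serialise their counters through a prettyXml helper. A workable stand-in, assuming getXML() returns an xml.etree.ElementTree element (inferred from the parentNode.append call above, not confirmed by the source):

import xml.dom.minidom
import xml.etree.ElementTree as ET

def prettyXml(node):
    #Stand-in for the imported helper: pretty-print an ElementTree element
    return xml.dom.minidom.parseString(ET.tostring(node)).toprettyxml(indent="  ")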
Example #13
    def countKmers(self):
        refKmers, readKmers = Counter(), Counter()

        for name, seq in fastaRead(self.referenceFastaFile):
            #len(seq) + 1 makes the window inclusive of the final kmer
            for i in xrange(self.kmerSize, len(seq) + 1):
                s = seq[i - self.kmerSize : i]
                if "N" not in s:
                    refKmers[s] += 1
                    refKmers[reverseComplement(s)] += 1

        for name, seq, qual in fastqRead(self.readFastqFile):
            for i in xrange(self.kmerSize, len(seq) + 1):
                s = seq[i - self.kmerSize : i]
                if "N" not in s:
                    readKmers[s] += 1
                    readKmers[reverseComplement(s)] += 1

        return (refKmers, readKmers)
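
countKmers also assumes a reverseComplement helper. A minimal Python 2 stand-in (the handling of N and lower-case bases is an assumption):

import string

_COMPLEMENT = string.maketrans("ACGTNacgtn", "TGCANtgcan")

def reverseComplement(seq):
    #Complement each base, then reverse the string
    return seq.translate(_COMPLEMENT)[::-1]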
Example #14
    def __init__(self, outputDir, experiments):
        AbstractMetaAnalysis.__init__(self, outputDir, experiments)
        
        allReads = {(name, readFastqFile, readType, seq) for readFastqFile, readType, referenceFastaFile, mapper, analyses, resultsDir \
            in self.experiments for name, seq, qual in fastqRead(readFastqFile)}
        
        mappedReads = dict()
        for readFastqFile, readType, referenceFastaFile, mapper, analyses, resultsDir in self.experiments:
            for record in samIterator(pysam.Samfile(os.path.join(resultsDir, "mapping.sam"))):
                if not record.is_unmapped:
                    if (record.qname, readFastqFile) not in mappedReads:
                        mappedReads[(record.qname, readFastqFile)] = set()
                    mappedReads[(record.qname, readFastqFile)].add((mapper.__name__, referenceFastaFile))

        self.reads = list()
        for name, readFastqFile, readType, seq in allReads:
            if (name, readFastqFile) in mappedReads:
                mappers, referenceFastaFiles = map(tuple, zip(*mappedReads[(name, readFastqFile)]))
                self.reads.append(Read(name, seq, readType, readFastqFile, (mappers, referenceFastaFiles)))
            else:
                self.reads.append(Read(name, seq, readType, readFastqFile, None))
Example #15
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)
    
    outputDir = "blast_combined/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.")
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    #find all read fastq files, load into a dict by read type
    readFastqFiles = dict()
    for readType in readTypes:
        readFastqFiles[readType] = [os.path.join("../output/processedReadFastqFiles/", readType, x) for x in os.listdir(os.path.join("../output/processedReadFastqFiles/", readType)) if x.endswith(".fq") or x.endswith(".fastq")]
    
    #find all reference fasta files
    referenceFastaFiles = [x for x in os.listdir("../referenceFastaFiles") if x.endswith(".fasta") or x.endswith(".fa")]

    #find all sam files that were analyzed using combinedAnalyses
    samFiles = {}
    for readType in readTypes:
        samFiles[readType] = [(readFastqFile, os.path.join("../output", "analysis_" + readType, "experiment_" + os.path.basename(readFastqFile) + "_" + referenceFastaFile + "_" + analysis, "mapping.sam")) for readFastqFile, referenceFastaFile, analysis in product(readFastqFiles[readType], referenceFastaFiles, combinedAnalyses)]

    mappedByReadType = defaultdict(set)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            mappedNames = {(x.qname, readFastqFile) for x in pysam.Samfile(samFile) if not x.is_unmapped}
            mappedByReadType[readType] = mappedByReadType[readType].union(mappedNames)

    unmappedByReadType = defaultdict(dict)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            for name, seq, qual in fastqRead(readFastqFileFullPath):
                name = name.split(" ")[0]
                if (name, readFastqFile) not in mappedByReadType[readType]:
                    unmappedByReadType[readType][(name, readFastqFile)] = seq
        

    i = Stack(Target.makeTargetFn(find_analyses, args=(unmappedByReadType, outputDir))).startJobTree(options) 

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))

    for readType in readTypes:
        #build a counter of blast hits and set of read names that did not map
        blast_hits, no_hits = Counter(), set()
        for query, result in parse_blast(open(os.path.join(outputDir, readType + "_blast_out.txt"))):
            if result is None:
                no_hits.add(query)
            else:
                blast_hits[tuple(result)] += 1 #count number of times each hit was seen
        #write the unmapped hits to a fasta file
        outf = open(os.path.join(outputDir, readType + "_no_hits.fasta"), "w")
        for (name, readFastqFile), seq in unmappedByReadType[readType].iteritems():
            if name in no_hits:
                outf.write(">{}\n{}\n".format(name, seq))
        outf.close()
        #write the blast report
        blast_out = open(os.path.join(outputDir, readType + "_blast_report.txt"), "w")
        blast_out.write("gi|##|gb|##|\tSpecies\tseqID\tCount\n") #header to output
        for result, count in sorted(blast_hits.items(), key = lambda x: -int(x[-1])):
            blast_out.write("{}\t{}\n".format("\t".join(result), count))
        blast_out.close()
        #calculate percents and make a barplot
        blast_count = sum(blast_hits.values())
        unmapped_count = len(unmappedByReadType[readType]) - sum(blast_hits.values())
        mapped_count = len(mappedByReadType[readType])
        
        #blast_percent = 1.0 * sum(blast_hits.values()) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #unmapped_percent = (1.0 * len(unmappedByReadType[readType]) - sum(blast_hits.values())) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #mapped_percent = 1.0 * len(mappedByReadType[readType]) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        outf = open(os.path.join(outputDir, readType + "percents.txt"),"w")
        outf.write("\n".join(map(str,[blast_count, unmapped_count, mapped_count])))
        outf.close()
        #system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(blast_percent, unmapped_percent, mapped_percent, readType, os.path.join(outputDir, readType + "_blast_barplot.pdf")))
        system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(blast_count, unmapped_count, mapped_count, readType, os.path.join(outputDir, readType + "_blast_barplot.pdf")))
Example #16
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "blast_combined/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.")
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    #find all read fastq files, load into a dict by read type
    readFastqFiles = dict()
    for readType in readTypes:
        readFastqFiles[readType] = [
            os.path.join("../output/processedReadFastqFiles/", readType, x)
            for x in os.listdir(
                os.path.join("../output/processedReadFastqFiles/", readType))
            if x.endswith(".fq") or x.endswith(".fastq")
        ]

    #find all reference fasta files
    referenceFastaFiles = [
        x for x in os.listdir("../referenceFastaFiles")
        if x.endswith(".fasta") or x.endswith(".fa")
    ]

    #find all sam files that were analyzed using combinedAnalyses
    samFiles = {}
    for readType in readTypes:
        samFiles[readType] = [
            (readFastqFile,
             os.path.join(
                 "../output", "analysis_" + readType,
                 "experiment_" + os.path.basename(readFastqFile) + "_" +
                 referenceFastaFile + "_" + analysis, "mapping.sam"))
            for readFastqFile, referenceFastaFile, analysis in product(
                readFastqFiles[readType], referenceFastaFiles,
                combinedAnalyses)
        ]

    mappedByReadType = defaultdict(set)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            mappedNames = {(x.qname, readFastqFile)
                           for x in pysam.Samfile(samFile)
                           if not x.is_unmapped}
            mappedByReadType[readType] = mappedByReadType[readType].union(
                mappedNames)

    unmappedByReadType = defaultdict(dict)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            for name, seq, qual in fastqRead(readFastqFileFullPath):
                name = name.split(" ")[0]
                if (name, readFastqFile) not in mappedByReadType[readType]:
                    unmappedByReadType[readType][(name, readFastqFile)] = seq

    i = Stack(
        Target.makeTargetFn(find_analyses,
                            args=(unmappedByReadType,
                                  outputDir))).startJobTree(options)

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))

    for readType in readTypes:
        #build a counter of blast hits and set of read names that did not map
        blast_hits, no_hits = Counter(), set()
        for query, result in parse_blast(
                open(os.path.join(outputDir, readType + "_blast_out.txt"))):
            if result is None:
                no_hits.add(query)
            else:
                blast_hits[tuple(
                    result)] += 1  #count number of times each hit was seen
        #write the unmapped hits to a fasta file
        outf = open(os.path.join(outputDir, readType + "_no_hits.fasta"), "w")
        for (name,
             readFastqFile), seq in unmappedByReadType[readType].iteritems():
            if name in no_hits:
                outf.write(">{}\n{}\n".format(name, seq))
        outf.close()
        #write the blast report
        blast_out = open(
            os.path.join(outputDir, readType + "_blast_report.txt"), "w")
        blast_out.write(
            "gi|##|gb|##|\tSpecies\tseqID\tCount\n")  #header to output
        for result, count in sorted(blast_hits.items(),
                                    key=lambda x: -int(x[-1])):
            blast_out.write("{}\t{}\n".format("\t".join(result), count))
        blast_out.close()
        #calculate percents and make a barplot
        blast_count = sum(blast_hits.values())
        unmapped_count = len(unmappedByReadType[readType]) - sum(
            blast_hits.values())
        mapped_count = len(mappedByReadType[readType])

        #blast_percent = 1.0 * sum(blast_hits.values()) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #unmapped_percent = (1.0 * len(unmappedByReadType[readType]) - sum(blast_hits.values())) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #mapped_percent = 1.0 * len(mappedByReadType[readType]) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        outf = open(os.path.join(outputDir, readType + "percents.txt"), "w")
        outf.write("\n".join(
            map(str, [blast_count, unmapped_count, mapped_count])))
        outf.close()
        #system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(blast_percent, unmapped_percent, mapped_percent, readType, os.path.join(outputDir, readType + "_blast_barplot.pdf")))
        system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(
            blast_count, unmapped_count, mapped_count, readType,
            os.path.join(outputDir, readType + "_blast_barplot.pdf")))