def main(args):
    args = parse_args(args)

    G = DeBruijnGraph(args.kmer_size)

    # first pass adds nodes
    for name, seq in fastaRead(args.reference):
        name, offset = name.split("_")[:2]
        G.constructNodes(name, offset, seq)
    # second pass constructs adjacencies
    for name, seq in fastaRead(args.reference):
        G.constructAdjacencies(seq)

    for name, seq in fastaRead(args.normalizing):
        G.addNormalizing(name, seq)

    if args.bad_kmers is not None:
        G.flagNodes(args.bad_kmers)

    if args.weights is not None:
        with open(args.weights) as f:
            G.weightKmers(pickle.load(f))

    G.finishBuild()
    G.pruneGraph()

    pickle.dump(G, args.out)
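The pickled graph can be reloaded elsewhere for querying; a minimal round-trip sketch (hypothetical path, assuming args.out was opened for writing as in the snippet above):

import pickle

with open("graph.pkl") as f:  # hypothetical path for the file written to args.out
    G = pickle.load(f)        # restores the built, pruned DeBruijnGraph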
Example #3
def getFastaDictionary(fastaFile):
    """Returns a dictionary of the first words of fasta headers to their corresponding fasta sequence
    """
    entries = list(fastaRead(open(fastaFile, 'r')))  #Read the file once
    names = map(lambda x: x[0].split()[0], entries)
    assert len(names) == len(set(names))  #Check all the names are unique
    return dict(map(lambda x: (x[0].split()[0], x[1]), entries))  #Hash of names to sequences
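Usage sketch (hypothetical file name; note that only the first whitespace-delimited word of each header becomes a key):

seqs = getFastaDictionary("reference.fa")  # hypothetical path
chr1Sequence = seqs["chr1"]                # header ">chr1 some description" is keyed as "chr1"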
Example #4
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName, 
                         referenceSequence, querySequenceFile, 
                         outputCigarFile, options):
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    
    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence) 
    
    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
    zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        loadHmm = nameValue("loadHmm", options.hmmFile)
        try:
            command = "echo %s | cPecanRealign %s %s --diagonalExpansion=10 \
            --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s >> %s" % \
                   (exonerateCigarString[:-1], tempRefFile, tempReadFile, loadHmm,
                    options.gapGamma, options.matchGamma, outputCigarFile)
            system(command)
            # target.logToMaster('[good] ' + command + '\n')
        except Exception, e:
            target.logToMaster('Caught an exception! qname = "%s"\n' % querySequenceName)
            target.logToMaster('len(exonerateCigarString[:-1]) = %d\n' % len(exonerateCigarString[:-1]))
            target.logToMaster('[bad] Command that caused the exception:\n')
            target.logToMaster(command + '\n')
            target.logToMaster('\n' + str(e) + '\n\n')
            continue
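This snippet leans on nameValue() to render the optional HMM flag. Assuming it behaves like the sonLib helper of the same name, it yields "--loadHmm=<file>" when options.hmmFile is set and an empty string otherwise, so the flag silently drops out of the command. A minimal sketch of that assumed contract:

def nameValue(name, value):
    # Sketch only: emit --name=value, or nothing when the value is unset.
    if value is None:
        return ""
    return "--%s=%s" % (name, value)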
Example #5
def getFastaDictionary(fastaFile):
    """Returns a dictionary of the first words of fasta headers to their corresponding
    fasta sequence
    """
    namesAndSequences = map(lambda x : (x[0].split()[0], x[1]), fastaRead(open(fastaFile, 'r')))
    names = map(lambda x : x[0], namesAndSequences)
    assert len(names) == len(set(names)) #Check all the names are unique
    return dict(namesAndSequences) #Hash of names to sequences
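Both getFastaDictionary variants rely on Python 2's map() returning a list; under Python 3, len(names) would raise a TypeError on the map object. A Python 3 compatible sketch of the same behavior:

def getFastaDictionaryPy3(fastaFile):
    with open(fastaFile) as f:
        namesAndSequences = [(header.split()[0], seq) for header, seq in fastaRead(f)]
    names = [name for name, _ in namesAndSequences]
    assert len(names) == len(set(names))  # check all the names are unique
    return dict(namesAndSequences)  # hash of names to sequences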
Example #6
def posteriorProbabilityCalculationTargetFn(target, exonerateCigarStringFile,
                                            referenceSequenceName,
                                            referenceSequence,
                                            querySequenceFile,
                                            outputPosteriorProbsFile, options):
    """Calculates the posterior probabilities of matches in a set of pairwise
    alignments between a reference sequence and a set of reads. 
    """
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")

    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)

    #Hash to store posterior probabilities in
    expectationsOfBasesAtEachPosition = {}

    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
    zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        tempPosteriorProbsFile = os.path.join(target.getLocalTempDir(),
                                              "posteriorProbs.txt")
        if options.noMargin:  #When we don't marginalize we just run cPecanRealign to get the list of aligned pairs
            #This runtime should be very fast
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=0 \
            --splitMatrixBiggerThanThis=1 --rescoreOriginalAlignment --outputPosteriorProbs=%s" % \
                       (exonerateCigarString[:-1], tempRefFile, tempReadFile,
                        tempPosteriorProbsFile))
        else:
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 \
            --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s" % \
                       (exonerateCigarString[:-1], tempRefFile, tempReadFile,
                        tempPosteriorProbsFile, options.alignmentModel))

        #Now collate the reference position expectations
        for refPosition, queryPosition, posteriorProb in \
        map(lambda x : map(float, x.split()), open(tempPosteriorProbsFile, 'r')):
            assert posteriorProb <= 1.01
            assert posteriorProb >= 0.0
            key = (referenceSequenceName, int(refPosition))
            if key not in expectationsOfBasesAtEachPosition:
                expectationsOfBasesAtEachPosition[key] = dict(
                    zip(BASES, [0.0] * len(BASES)))
            queryBase = querySequence[int(queryPosition)].upper()
            if queryBase in BASES:  #Could be an N or other wildcard character, which we ignore
                expectationsOfBasesAtEachPosition[key][
                    queryBase] += 1.0 if options.noMargin else posteriorProb

    #Pickle the posterior probs
    fileHandle = open(outputPosteriorProbsFile, 'w')
    cPickle.dump(expectationsOfBasesAtEachPosition, fileHandle,
                 cPickle.HIGHEST_PROTOCOL)
    fileHandle.close()
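A hypothetical consumer of the pickled expectations, calling the best-supported base at each reference position:

expectations = cPickle.load(open(outputPosteriorProbsFile))
for (refName, refPosition), baseWeights in expectations.iteritems():
    bestBase = max(BASES, key=lambda base: baseWeights[base])  # most supported base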
Example #7
def merge(target, files, outputDir):
    """merges all muscle output into one fasta and runs metrics() on each"""
    for typeof in files:
        outmetrics = open(os.path.join(outputDir, typeof + "_metrics.tsv"), "w")
        outmetrics.write("Read\tReference\tMatches\tMismatches\tReadDeletionLength\tReadInsertionLength\tIdentity\tReferenceCoverage\n")
        for f in files[typeof]:
            handle = fastaRead(f)
            name, seq = handle.next()
            ref_name, ref_seq = handle.next()
            name = name.lstrip(">")
            ref_name = ref_name.lstrip(">")
            outmetrics.write("\t".join([name, ref_name] + metrics(seq, ref_seq)))
            outmetrics.write("\n")
        outmetrics.close()
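merge() assumes metrics() hands back the six remaining TSV columns already stringified, since they are joined directly with the read and reference names. A hypothetical sketch of that contract, assuming seq and ref_seq are equal-length gapped alignment rows:

def metrics(seq, ref_seq):
    # Sketch only: tally matches, mismatches and gaps from the aligned rows.
    matches = sum(1 for a, b in zip(seq, ref_seq) if a == b and a != "-")
    mismatches = sum(1 for a, b in zip(seq, ref_seq) if "-" not in (a, b) and a != b)
    readDeletions = sum(1 for a, b in zip(seq, ref_seq) if a == "-" and b != "-")
    readInsertions = sum(1 for a, b in zip(seq, ref_seq) if a != "-" and b == "-")
    identity = float(matches) / max(matches + mismatches, 1)
    refCoverage = float(matches + mismatches) / max(len(ref_seq.replace("-", "")), 1)
    return map(str, [matches, mismatches, readDeletions, readInsertions, identity, refCoverage])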
Example #8
 def run(self):
     refSequences = dict(fastaRead(open(self.referenceFastaFile, 'r'))) #Hash of names to sequences
     readSequences = dict([ (name, seq) for name, seq, quals in fastqRead(self.readFastqFile) ]) #Hash of names to sequences
     sam = pysam.Samfile(self.samFile, "r" )
     overallIndelCounter = IndelCounter("overall", "overall")
     for aR in sam: #Iterate on the sam lines
         refSeq = refSequences[sam.getrname(aR.rname)]
         readSeq = readSequences[aR.qname]
         overallIndelCounter.addReadAlignment(aR, refSeq, readSeq)
     sam.close()
     #Write out the substitution info
     open(os.path.join(self.outputDir, "indels.xml"), 'w').write(prettyXml(overallIndelCounter.getXML()))
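A hypothetical reader of the resulting indels.xml report:

import xml.etree.ElementTree as ET
root = ET.parse("indels.xml").getroot()  # hypothetical path under outputDir
print root.attrib  # whatever summary attributes IndelCounter.getXML() attached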
Example #9
def makeFastaSequenceNamesUnique(inputFastaFile, outputFastaFile):
    """Makes a fasta file with unique names
    """
    names = set()
    fileHandle = open(outputFastaFile, 'w')
    for name, seq in fastaRead(open(inputFastaFile, 'r')):
        while name in names:
            logger.critical("Got a duplicate fasta sequence name: %s" % name)
            name += "i"
        names.add(name)
        fastaWrite(fileHandle, name, seq)
    fileHandle.close()
    return outputFastaFile
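The collision policy just keeps appending "i" until the name is fresh, so three records all named read1 come out as read1, read1i and read1ii. Usage sketch (hypothetical paths):

makeFastaSequenceNamesUnique("reads.fa", "reads_unique.fa")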
Example #11
def posteriorProbabilityCalculationTargetFn(target, exonerateCigarStringFile, 
                referenceSequenceName, referenceSequence, querySequenceFile, 
                outputPosteriorProbsFile, options):
    """Calculates the posterior probabilities of matches in a set of pairwise
    alignments between a reference sequence and a set of reads. 
    """
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    
    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence) 
    
    #Hash to store posterior probabilities in
    expectationsOfBasesAtEachPosition = {}
    
    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
    zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        tempPosteriorProbsFile = os.path.join(target.getLocalTempDir(), "posteriorProbs.txt")
        if options.noMargin: #When we don't marginalize we just run cPecanRealign to get the list of aligned pairs
            #This runtime should be very fast
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=0 \
            --splitMatrixBiggerThanThis=1 --rescoreOriginalAlignment --outputPosteriorProbs=%s" % \
                       (exonerateCigarString[:-1], tempRefFile, tempReadFile, 
                        tempPosteriorProbsFile))
        else:
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 \
            --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s" % \
                       (exonerateCigarString[:-1], tempRefFile, tempReadFile, 
                        tempPosteriorProbsFile, options.alignmentModel))
        
        #Now collate the reference position expectations
        for refPosition, queryPosition, posteriorProb in \
        map(lambda x : map(float, x.split()), open(tempPosteriorProbsFile, 'r')):
            assert posteriorProb <= 1.01
            assert posteriorProb >= 0.0
            key = (referenceSequenceName, int(refPosition))
            if key not in expectationsOfBasesAtEachPosition:
                expectationsOfBasesAtEachPosition[key] = dict(zip(BASES, [0.0]*len(BASES)))
            queryBase = querySequence[int(queryPosition)].upper()
            if queryBase in BASES: #Could be an N or other wildcard character, which we ignore
                expectationsOfBasesAtEachPosition[key][queryBase] += 1.0 if options.noMargin else posteriorProb 
            
    #Pickle the posterior probs
    fileHandle = open(outputPosteriorProbsFile, 'w')
    cPickle.dump(expectationsOfBasesAtEachPosition, fileHandle, cPickle.HIGHEST_PROTOCOL)
    fileHandle.close() 
Example #12
def merge(target, files, outputDir):
    """merges all muscle output into one fasta and runs metrics() on each"""
    for typeof in files:
        outmetrics = open(os.path.join(outputDir, typeof + "_metrics.tsv"),
                          "w")
        outmetrics.write(
            "Read\tReference\tMatches\tMismatches\tReadDeletionLength\tReadInsertionLength\tIdentity\tReferenceCoverage\n"
        )
        for f in files[typeof]:
            handle = fastaRead(f)
            name, seq = handle.next()
            ref_name, ref_seq = handle.next()
            name = name.lstrip(">")
            ref_name = ref_name.lstrip(">")
            outmetrics.write("\t".join([name, ref_name] +
                                       metrics(seq, ref_seq)))
            outmetrics.write("\n")
        outmetrics.close()
Example #13
 def run(self):
     refSequences = dict(fastaRead(open(self.referenceFastaFile, 'r'))) #Hash of names to sequences
     readSequences = dict([ (name, seq) for name, seq, quals in fastqRead(self.readFastqFile) ]) #Hash of names to sequences
     overallCoverageCounter = CoverageCounter("overall", "overall") #Thing to store the overall coverage in
     readCoverages = []
     sam = pysam.Samfile(self.samFile, "r" )
     for aR in sam: #Iterate on the sam lines
         refSeq = refSequences[sam.getrname(aR.rname)]
         readSeq = readSequences[aR.qname]
         overallCoverageCounter.addReadAlignment(aR, refSeq, readSeq)
         readCoverages.append(CoverageCounter(aR.qname, sam.getrname(aR.rname)))
         readCoverages[-1].addReadAlignment(aR, refSeq, readSeq)   
     sam.close()
     #Write out the coverage info
     parentNode = overallCoverageCounter.getXML()
     for readCoverage in readCoverages:
         parentNode.append(readCoverage.getXML())
     open(os.path.join(self.outputDir, "coverages.xml"), 'w').write(prettyXml(parentNode))
     
Example #14
    def countKmers(self):
        refKmers, readKmers = Counter(), Counter()

        for name, seq in fastaRead(self.referenceFastaFile):
            for i in xrange(self.kmerSize, len(seq) + 1):  #run through len(seq) + 1 so the final kmer is counted
                s = seq[ i - self.kmerSize : i ]
                if "N" not in s:
                    refKmers[s] += 1
                    refKmers[reverseComplement(s)] += 1

        for name, seq, qual in fastqRead(self.readFastqFile):
            for i in xrange(self.kmerSize, len(seq) + 1):  #run through len(seq) + 1 so the final kmer is counted
                s = seq[ i - self.kmerSize : i ]
                if "N" not in s:
                    readKmers[s] += 1
                    readKmers[reverseComplement(s)] += 1

        return (refKmers, readKmers)
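A quick check of the k-mer window (hypothetical 3-mers): for seq = "ACGTT" the inner loop should yield ACG, CGT and GTT, which is why the range must run through len(seq) + 1:

for i in xrange(3, len("ACGTT") + 1):
    print "ACGTT"[i - 3:i]  # ACG, CGT, GTT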
Example #15
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName, 
                         referenceSequence, querySequenceFile, 
                         outputCigarFile, options):
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    
    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence) 
    
    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
    zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        loadHmm = nameValue("loadHmm", options.hmmFile)
        system("echo \"%s\" | cPecanRealign %s %s --diagonalExpansion=10 \
        --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s >> %s" % \
               (exonerateCigarString[:-1], tempRefFile, tempReadFile, loadHmm, 
                options.gapGamma, options.matchGamma, outputCigarFile))
Example #16
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName, 
                         referenceSequence, querySequenceFile, 
                         outputCigarFile, options):
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    
    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence) 
    
    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
    zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        loadHmm = nameValue("loadHmm", options.hmmFile)
        system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 \
        --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s >> %s" % \
               (exonerateCigarString[:-1], tempRefFile, tempReadFile, loadHmm, 
                options.gapGamma, options.matchGamma, outputCigarFile))
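The only difference from Example #15 is that the cigar string is not shell-quoted here; unquoted, echo word-splits its arguments, so any run of whitespace inside the cigar line collapses to a single space. A sketch of the difference:

from subprocess import check_output
print check_output('echo a   b', shell=True)    # 'a b\n'   (whitespace collapsed)
print check_output('echo "a   b"', shell=True)  # 'a   b\n' (preserved verbatim)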
Example #17
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)
    
    outputDir = "muscle_compare_2d/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.".format(outputDir))
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    if len(args) != 3:
        raise RuntimeError("Error: expected three arguments, got %s arguments: %s" % (len(args), " ".join(args)))

    templateRecords = {x.qname for x in pysam.Samfile(args[0]) if not x.is_unmapped}
    complementRecords = {x.qname for x in pysam.Samfile(args[1]) if not x.is_unmapped}
    
    twodSamFile = pysam.Samfile(args[2])
    twodRecords = {x.qname : x for x in twodSamFile if not x.is_unmapped}

    recordsToAnalyze = dict()
    for name, record in twodRecords.iteritems():
        if name not in templateRecords and name not in complementRecords:
            ref_name = twodSamFile.getrname(record.tid)
            ref_start, ref_stop = int(record.aend - record.alen), int(record.aend)
            recordsToAnalyze[name] = [ref_name, ref_start, ref_stop]
    if os.path.exists("../readFastqFiles/template/") and os.path.exists("../readFastqFiles/complement"):
        templateFastqFiles = [os.path.join("../readFastqFiles/template/", x) for x in os.listdir("../readFastqFiles/template/") if x.endswith(".fastq") or x.endswith(".fq")]
        complementFastqFiles = [os.path.join("../readFastqFiles/complement/", x) for x in os.listdir("../readFastqFiles/complement/") if x.endswith(".fastq") or x.endswith(".fq")]
    else:
        raise RuntimeError("Error: readFastqFiles does not contain template and/or complement folders")

    referenceFastaFiles = [os.path.join("../referenceFastaFiles", x) for x in os.listdir("../referenceFastaFiles") if x.endswith(".fa") or x.endswith(".fasta")]
    
    if len(referenceFastaFiles) > 0:
        references = { y[0].split(" ")[0] : y[1] for x in referenceFastaFiles for y in fastaRead(x) }
    else:
        raise RuntimeError("Error: no reference fasta files")

    if len(recordsToAnalyze) == 0:
        raise RuntimeError("Error: all of the mappable twoD reads in this set also mapped as template/complement.")

    logger.info("Starting to find analyses to run...")
    args = (recordsToAnalyze, templateFastqFiles, complementFastqFiles, references, outputDir)
    i = Stack(Target.makeTargetFn(find_analyses, args=args)).startJobTree(options) 

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))
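A hypothetical invocation, combining jobTree's own options with the three positional SAM files the script expects:

# python muscle_compare_2d.py --jobTree ./jobTree \
#     template.sam complement.sam twod.sam        # hypothetical script and file names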
Example #18
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "muscle_compare_2d/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.".format(outputDir))
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    if len(args) != 3:
        raise RuntimeError(
            "Error: expected three arguments, got %s arguments: %s" %
            (len(args), " ".join(args)))

    templateRecords = {
        x.qname
        for x in pysam.Samfile(args[0]) if not x.is_unmapped
    }
    complementRecords = {
        x.qname
        for x in pysam.Samfile(args[1]) if not x.is_unmapped
    }

    twodSamFile = pysam.Samfile(args[2])
    twodRecords = {x.qname: x for x in twodSamFile if not x.is_unmapped}

    recordsToAnalyze = dict()
    for name, record in twodRecords.iteritems():
        if name not in templateRecords and name not in complementRecords:
            ref_name = twodSamFile.getrname(record.tid)
            ref_start, ref_stop = int(record.aend - record.alen), int(
                record.aend)
            recordsToAnalyze[name] = [ref_name, ref_start, ref_stop]
    if os.path.exists("../readFastqFiles/template/") and os.path.exists(
            "../readFastqFiles/complement"):
        templateFastqFiles = [
            os.path.join("../readFastqFiles/template/", x)
            for x in os.listdir("../readFastqFiles/template/")
            if x.endswith(".fastq") or x.endswith(".fq")
        ]
        complementFastqFiles = [
            os.path.join("../readFastqFiles/complement/", x)
            for x in os.listdir("../readFastqFiles/complement/")
            if x.endswith(".fastq") or x.endswith(".fq")
        ]
    else:
        raise RuntimeError(
            "Error: readFastqFiles does not contain template and/or complement folders"
        )

    referenceFastaFiles = [
        os.path.join("../referenceFastaFiles", x)
        for x in os.listdir("../referenceFastaFiles")
        if x.endswith(".fa") or x.endswith(".fasta")
    ]

    if len(referenceFastaFiles) > 0:
        references = {
            y[0].split(" ")[0]: y[1]
            for x in referenceFastaFiles for y in fastaRead(x)
        }
    else:
        raise RuntimeError("Error: no reference fasta files")

    if len(recordsToAnalyze) == 0:
        raise RuntimeError(
            "Error: all of the mappable twoD reads in this set also mapped as template/complement."
        )

    logger.info("Starting to find analyses to run...")
    args = (recordsToAnalyze, templateFastqFiles, complementFastqFiles,
            references, outputDir)
    i = Stack(Target.makeTargetFn(find_analyses,
                                  args=args)).startJobTree(options)

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))