def main():
    """Command-line entry point: mutate a FASTA file and record the mutations.

    Expects three positional arguments (inputFastaFile, outputFastaFile,
    outputMutationsFile) plus the optional --snpRate. The mutated sequences
    are written to outputFastaFile and one tab-separated line per mutation
    is written to outputMutationsFile.

    Raises:
        RuntimeError: if the number of positional arguments is not three.
    """
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: inputFastaFile outputFastaFile outputMutationsFile [options]", 
                          version="%prog 0.1")

    parser.add_option("--snpRate", dest="snpRate", 
                      help="The probability of introducing a random different base at each position", 
                      default=0.2, type=float)

    #Parse the options/arguments
    options, args = parser.parse_args()

    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))

    inputFastaFile, outputFastaFile, outputMutationsFile = args

    #This call gets the mutated sequences and a list of mutations
    mutatedSequences, allMutations = mutateSequences(getFastaDictionary(inputFastaFile), options.snpRate)

    #Write out the mutated sequences into the given file; the with-block
    #guarantees the handle is closed even if fastaWrite raises
    with open(outputFastaFile, 'w') as fastaHandle:
        for name in mutatedSequences:
            fastaWrite(fastaHandle, name, mutatedSequences[name])

    #Write out mutations, one tab-separated record per line
    with open(outputMutationsFile, 'w') as mutationsHandle:
        for mutation in allMutations:
            mutationsHandle.write("\t".join(map(str, mutation)) + "\n")
Exemple #2
0
def Substitutions(readFastqFile,
                  referenceFastaFile,
                  samFile,
                  outputDir,
                  kmer=6):
    """Calculates stats on substitutions

    Walks every aligned pair of every SAM record, counts (reference base,
    read base) pairs in a SubstitutionMatrix, then writes
    "substitutions.xml" and "subst.tsv" into outputDir and runs the
    substitution_plot.R script on the TSV.

    Args:
        readFastqFile: path to the FASTQ file holding the reads.
        referenceFastaFile: path to the FASTA file holding the references.
        samFile: path to the SAM file of alignments; its base name (up to
            ".sam") is passed to the R script as the analysis label.
        outputDir: directory that receives the output files.
        kmer: not used by this function; kept so existing callers that
            pass it continue to work.
    """
    refSequences = getFastaDictionary(
        referenceFastaFile)  #Hash of names to sequences
    readSequences = getFastqDictionary(
        readFastqFile)  #Hash of names to sequences
    sM = SubstitutionMatrix()  #The thing to store the counts in
    sam = pysam.Samfile(samFile, "r")
    try:
        for aR in samIterator(sam):  #Iterate on the sam lines
            refSeq = refSequences[sam.getrname(aR.rname)]
            readSeq = readSequences[aR.qname]
            #Walk through the matches mismatches:
            for aP in AlignedPair.iterator(aR, refSeq, readSeq):
                sM.addAlignedPair(aP.getRefBase(), aP.getReadBase())
    finally:
        #Close the SAM handle even if a lookup or the iterator raises
        sam.close()

    #Write out the substitution info
    with open(os.path.join(outputDir, "substitutions.xml"), 'w') as xmlFile:
        xmlFile.write(prettyXml(sM.getXML()))

    bases = "ACGT"
    tsvPath = os.path.join(outputDir, "subst.tsv")
    with open(tsvPath, "w") as outf:
        #NOTE(review): the header has four fields while each row has five
        #(row label + four values); presumably the R script treats the first
        #column as row names - confirm before changing the layout
        outf.write("A\tC\tG\tT\n")
        for x in bases:
            freqs = sM.getFreqs(x, bases)
            outf.write("{}\t{}\n".format(x, "\t".join(map(str, freqs))))

    analysis = str(samFile.split("/")[-1].split(".sam")[0])
    system("Rscript scripts/substitution_plot.R {} {} {}".format(
        tsvPath,
        os.path.join(outputDir, "substitution_plot.pdf"), analysis))
Exemple #3
0
    def validateVcf(self, vcfFile, referenceFastaFile, mutationsFile):
        """Compare the mutations called in a VCF file against a known set.

        Args:
            vcfFile: path handed to vcfRead to obtain the called mutations.
            referenceFastaFile: path to the reference FASTA file.
            mutationsFile: text file with one known mutation per line; the
                first three whitespace-separated fields are used and the
                second must parse as an integer.

        Returns:
            A 4-tuple (precision, recall, number of mutations called,
            number of known mutations). Precision and recall are 0.0 when
            their denominator is empty.
        """
        #Load reference sequences. The result is not used here; the call is
        #kept so that any error it raises for the reference file still surfaces
        getFastaDictionary(referenceFastaFile)

        #Load mutations, closing the file once it has been read
        mutations = set()
        with open(mutationsFile, 'r') as mutationsHandle:
            for line in mutationsHandle:
                fields = line.split()
                if not fields:
                    #Tolerate blank lines (e.g. a trailing newline)
                    continue
                #The position is shifted by one; presumably this converts a
                #0-based offset to the 1-based VCF coordinate - confirm
                mutations.add((fields[0], int(fields[1]) + 1, fields[2]))

        #Load VCF mutations
        imputedMutations = vcfRead(vcfFile)

        #Compare mutation sets
        intersectionSize = float(len(mutations.intersection(imputedMutations)))
        precision = intersectionSize / len(imputedMutations) if len(imputedMutations) else 0.0
        recall = intersectionSize / len(mutations) if len(mutations) else 0.0

        #Return precision, recall, number of mutations called, number of known mutations
        return precision, recall, len(imputedMutations), len(mutations)
Exemple #4
0
    def validateVcf(self, vcfFile, referenceFastaFile, mutationsFile):
        """Score the mutations read from a VCF file against the known ones.

        Args:
            vcfFile: path handed to vcfRead to obtain the called mutations.
            referenceFastaFile: path to the reference FASTA file.
            mutationsFile: text file with one known mutation per line; the
                first three whitespace-separated fields are used and the
                second must parse as an integer.

        Returns:
            A 4-tuple (precision, recall, number of mutations called,
            number of known mutations). Precision and recall are 0.0 when
            their denominator is empty.
        """
        #Load reference sequences. The result is not used here; the call is
        #kept so that any error it raises for the reference file still surfaces
        getFastaDictionary(referenceFastaFile)

        #Load mutations inside a with-block so the handle is always closed
        knownMutations = set()
        with open(mutationsFile, 'r') as handle:
            for record in handle:
                columns = record.split()
                if not columns:
                    #Skip blank lines rather than failing on columns[0]
                    continue
                #The position is shifted by one; presumably this converts a
                #0-based offset to the 1-based VCF coordinate - confirm
                knownMutations.add((columns[0], int(columns[1]) + 1, columns[2]))

        #Load VCF mutations
        imputedMutations = vcfRead(vcfFile)

        #Compare mutation sets
        intersectionSize = float(len(knownMutations.intersection(imputedMutations)))
        precision = intersectionSize / len(imputedMutations) if len(imputedMutations) else 0.0
        recall = intersectionSize / len(knownMutations) if len(knownMutations) else 0.0

        #Return precision, recall, number of mutations called, number of known mutations
        return precision, recall, len(imputedMutations), len(knownMutations)
Exemple #5
0
def main():
    """Script entry point: write a mutated copy of a FASTA file.

    Takes three positional arguments (inputFastaFile, outputFastaFile,
    outputMutationsFile) and the optional --snpRate. Writes the mutated
    sequences to outputFastaFile and a tab-separated line for every
    mutation to outputMutationsFile.

    Raises:
        RuntimeError: if the number of positional arguments is not three.
    """
    #Parse the inputs args/options
    parser = OptionParser(
        usage=
        "usage: inputFastaFile outputFastaFile outputMutationsFile [options]",
        version="%prog 0.1")

    parser.add_option(
        "--snpRate",
        dest="snpRate",
        help=
        "The probability of introducing a random different base at each position",
        default=0.2,
        type=float)

    #Parse the options/arguments
    options, args = parser.parse_args()

    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" %
                           " ".join(args))

    inputFastaFile, outputFastaFile, outputMutationsFile = args

    #This call gets the mutated sequences and a list of mutations
    mutatedSequences, allMutations = mutateSequences(
        getFastaDictionary(inputFastaFile), options.snpRate)

    #Write out the mutated sequences into the given file; using a
    #with-block closes the handle even when fastaWrite raises
    with open(outputFastaFile, 'w') as fastaHandle:
        for name in mutatedSequences:
            fastaWrite(fastaHandle, name, mutatedSequences[name])

    #Write out mutations, one tab-separated record per line
    with open(outputMutationsFile, 'w') as mutationsHandle:
        for mutation in allMutations:
            mutationsHandle.write("\t".join(map(str, mutation)) + "\n")
Exemple #6
0
    def check_alignments(self,
                         true_alignments,
                         reads,
                         reference,
                         kmer_length,
                         contig_name,
                         extra_args=None):
        """Run signalAlign and check its alignments against the reference.

        Runs ./runSignalAlign on the reads, checks that it produced as many
        .tsv alignments as there are under true_alignments, and verifies
        that every observed kmer equals the reference kmer at the reported
        position.

        Args:
            true_alignments: path prefix of the expected alignment .tsv
                files; the observed file name is appended to it directly.
            reads: path prefix of the input .fast5 reads.
            reference: path to the reference FASTA file.
            kmer_length: number of bases in each kmer.
            contig_name: key of the contig to read from the reference.
            extra_args: optional string appended verbatim to the command.
        """
        assert len(
            glob.glob(reads +
                      "*.fast5")) > 0, "Didn't find zymo test MinION reads"
        assert os.path.isfile(reference), "Didn't find zymo reference sequence"

        alignment_command = "./runSignalAlign -d={reads} -r={ref} -smt=threeState -o={testDir} " \
                            "".format(reads=reads, ref=reference, testDir="./signalAlign_unittest/")
        if extra_args is not None:
            alignment_command += extra_args

        # Discard the tool's output; the with-block closes the devnull handle
        # that was previously left open.
        with open(os.devnull, 'w') as null_output:
            result = call(alignment_command,
                          shell=True,
                          bufsize=-1,
                          stdout=null_output,
                          stderr=null_output)

        self.assertEqual(
            result, 0, "error running signalAlign alignments command was {}"
            "".format(alignment_command))

        test_alignments = glob.glob(
            "./signalAlign_unittest/tempFiles_alignment/*.tsv")

        reference_sequence = getFastaDictionary(reference)[contig_name]

        def get_kmer(start):
            return reference_sequence[start:start + kmer_length]

        # Glob the expected alignments once and reuse the count in the message.
        expected_count = len(glob.glob(true_alignments + "*.tsv"))
        self.assertEqual(
            len(test_alignments), expected_count,
            "Didn't make all alignments got {got} should be {should}".format(
                got=len(test_alignments), should=expected_count))

        for alignment in test_alignments:
            alignment_file = os.path.basename(alignment)
            expected = parse_alignment_full(true_alignments + alignment_file)
            obs = parse_alignment_full(alignment)
            self.assertEqual(len(obs), len(expected))
            # NOTE(review): itertuples() suggests a pandas DataFrame whose
            # first three columns are position, kmer and strand - confirm
            # against parse_alignment_full.
            for row in obs.itertuples():
                ref_pos = row[1]
                obs_kmer = row[2]
                strand = row[3]
                exp_kmer = get_kmer(ref_pos)
                self.assertEqual(
                    obs_kmer,
                    exp_kmer,
                    msg=
                    "kmer at index {idx} on strand {strand} is {obs} should be "
                    "{exp}, file {f}".format(idx=ref_pos,
                                             strand=strand,
                                             obs=obs_kmer,
                                             exp=exp_kmer,
                                             f=alignment))