def main():
    """Mutate the sequences of a FASTA file and record the mutations made.

    Command line: ``inputFastaFile outputFastaFile outputMutationsFile [options]``

    The sequences loaded from inputFastaFile by getFastaDictionary are passed,
    together with the --snpRate option, to mutateSequences. The sequences it
    returns are written to outputFastaFile with fastaWrite, and every mutation
    it returns is written to outputMutationsFile as one tab-separated line.

    Raises:
        SystemExit: with code 0, after printing the help text, when the script
            is invoked without any command line arguments.
        RuntimeError: if the number of positional arguments is not three.
    """
    #Parse the inputs args/options
    parser = OptionParser(
        usage="usage: inputFastaFile outputFastaFile outputMutationsFile [options]",
        version="%prog 0.1")
    parser.add_option(
        "--snpRate",
        dest="snpRate",
        help="The probability of introducing a random different base at each position",
        default=0.2,
        type=float)

    #Parse the options/arguments
    options, args = parser.parse_args()

    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))
    inputFastaFile, outputFastaFile, outputMutationsFile = args

    #This call gets the mutated sequences and a list of mutations
    mutatedSequences, allMutations = mutateSequences(
        getFastaDictionary(inputFastaFile), options.snpRate)

    #Write out the mutated sequences into the given file; the with-block
    #closes the handle even if a write fails part way through
    with open(outputFastaFile, 'w') as fH:
        for name in mutatedSequences:
            fastaWrite(fH, name, mutatedSequences[name])

    #Write out mutations, one tab-separated line per mutation
    with open(outputMutationsFile, 'w') as fH:
        for mutation in allMutations:
            fH.write("\t".join(map(str, mutation)) + "\n")
def Substitutions(readFastqFile, referenceFastaFile, samFile, outputDir, kmer=6):
    """Calculates stats on substitutions

    Every aligned pair of every record in samFile contributes its
    (reference base, read base) to a SubstitutionMatrix. The matrix is then
    written to outputDir as "substitutions.xml" and, as per-base frequencies
    over "ACGT", to "subst.tsv". Finally scripts/substitution_plot.R is run
    through `system` to produce "substitution_plot.pdf" in outputDir.

    Args:
        readFastqFile: path handed to getFastqDictionary to load the reads.
        referenceFastaFile: path handed to getFastaDictionary to load the
            reference sequences.
        samFile: path of the SAM file holding the alignments.
        outputDir: directory that receives the three output files.
        kmer: not used by this function; kept so existing callers still work.
    """
    refSequences = getFastaDictionary(referenceFastaFile)  #Hash of names to sequences
    readSequences = getFastqDictionary(readFastqFile)  #Hash of names to sequences
    sM = SubstitutionMatrix()  #The thing to store the counts in

    sam = pysam.Samfile(samFile, "r")
    try:
        for aR in samIterator(sam):  #Iterate on the sam lines
            refSeq = refSequences[sam.getrname(aR.rname)]
            readSeq = readSequences[aR.qname]
            #Walk through the matches mismatches:
            for aP in AlignedPair.iterator(aR, refSeq, readSeq):
                sM.addAlignedPair(aP.getRefBase(), aP.getReadBase())
    finally:
        #Release the SAM handle even if a lookup or the iteration fails
        sam.close()

    #Write out the substitution info
    with open(os.path.join(outputDir, "substitutions.xml"), 'w') as xmlFile:
        xmlFile.write(prettyXml(sM.getXML()))

    bases = "ACGT"
    tsvPath = os.path.join(outputDir, "subst.tsv")
    with open(tsvPath, "w") as outf:
        #The header has one field fewer than the data rows (no label column).
        #NOTE(review): presumably the R script relies on read.table treating
        #the first column as row names in that case - confirm against
        #scripts/substitution_plot.R before changing the layout.
        outf.write("\t".join(bases) + "\n")
        for x in bases:
            freqs = sM.getFreqs(x, bases)
            outf.write("{}\t{}\n".format(x, "\t".join(map(str, freqs))))

    #Name of the analysis: the SAM file name without directory and ".sam"
    analysis = str(samFile.split("/")[-1].split(".sam")[0])
    #NOTE(review): the command is assembled as a single string, so paths
    #containing whitespace or shell metacharacters may not survive - confirm
    #how `system` executes it.
    system("Rscript scripts/substitution_plot.R {} {} {}".format(
        tsvPath,
        os.path.join(outputDir, "substitution_plot.pdf"),
        analysis))
def validateVcf(self, vcfFile, referenceFastaFile, mutationsFile):
    """Score the mutations read from a VCF file against a file of known mutations.

    Each non-blank line of mutationsFile is split on whitespace and turned
    into the tuple (field 0, int(field 1) + 1, field 2). The resulting set is
    intersected with the mutations returned by vcfRead(vcfFile).

    Args:
        vcfFile: path handed to vcfRead to obtain the imputed mutations.
        referenceFastaFile: path handed to getFastaDictionary.
        mutationsFile: path of the whitespace-separated known-mutations file.

    Returns:
        A 4-tuple (precision, recall, number of mutations called, number of
        known mutations). Precision and recall are 0.0 when their respective
        denominator is zero.
    """
    #Load reference sequences. The result is not used below; the call is kept
    #so that an unreadable reference file still fails here as it did before.
    getFastaDictionary(referenceFastaFile)

    #Load mutations, closing the file once it has been read
    mutations = set()
    with open(mutationsFile, 'r') as mutationsHandle:
        for line in mutationsHandle:
            fields = line.split()
            if not fields:
                #Skip blank lines (e.g. a trailing newline) instead of
                #failing with an IndexError
                continue
            #NOTE(review): the +1 presumably converts a 0-based position to
            #the 1-based coordinate reported by vcfRead - confirm.
            mutations.add((fields[0], int(fields[1]) + 1, fields[2]))

    #Load VCF mutations
    imputedMutations = vcfRead(vcfFile)

    #Compare mutation sets
    intersectionSize = float(len(mutations.intersection(imputedMutations)))
    numberCalled = len(imputedMutations)
    numberKnown = len(mutations)
    precision = intersectionSize / numberCalled if numberCalled else 0.0
    recall = intersectionSize / numberKnown if numberKnown else 0.0

    #Return precision, recall, number of mutations called, number of known mutations
    return precision, recall, numberCalled, numberKnown
def main():
    """Mutate the sequences of a FASTA file and record the mutations made.

    Command line: ``inputFastaFile outputFastaFile outputMutationsFile [options]``

    The sequences loaded from inputFastaFile by getFastaDictionary are passed,
    together with the --snpRate option, to mutateSequences. The sequences it
    returns are written to outputFastaFile with fastaWrite, and every mutation
    it returns is written to outputMutationsFile as one tab-separated line.

    Raises:
        SystemExit: with code 0, after printing the help text, when the script
            is invoked without any command line arguments.
        RuntimeError: if the number of positional arguments is not three.
    """
    #Parse the inputs args/options
    parser = OptionParser(
        usage="usage: inputFastaFile outputFastaFile outputMutationsFile [options]",
        version="%prog 0.1")
    parser.add_option(
        "--snpRate",
        dest="snpRate",
        help="The probability of introducing a random different base at each position",
        default=0.2,
        type=float)

    #Parse the options/arguments
    options, args = parser.parse_args()

    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))
    inputFastaFile, outputFastaFile, outputMutationsFile = args

    #This call gets the mutated sequences and a list of mutations
    mutatedSequences, allMutations = mutateSequences(
        getFastaDictionary(inputFastaFile), options.snpRate)

    #Write out the mutated sequences into the given file; the with-block
    #closes the handle even if a write fails part way through
    with open(outputFastaFile, 'w') as fH:
        for name in mutatedSequences:
            fastaWrite(fH, name, mutatedSequences[name])

    #Write out mutations, one tab-separated line per mutation
    with open(outputMutationsFile, 'w') as fH:
        for mutation in allMutations:
            fH.write("\t".join(map(str, mutation)) + "\n")
def check_alignments(self, true_alignments, reads, reference, kmer_length,
                     contig_name, extra_args=None):
    """Run signalAlign on `reads` and check the alignments it produces.

    The command ./runSignalAlign is executed through the shell with its output
    discarded. The .tsv files it leaves in
    ./signalAlign_unittest/tempFiles_alignment/ must match, in number, the
    .tsv files found under the `true_alignments` prefix. Each produced file is
    parsed with parse_alignment_full and compared with the same-named expected
    file for row count, and every observed kmer is compared with the slice of
    the reference contig starting at the row's reference position.

    Args:
        true_alignments: path prefix of the expected alignment .tsv files.
        reads: path prefix of the input .fast5 reads.
        reference: path of the reference FASTA file.
        kmer_length: length of the reference slice compared with each kmer.
        contig_name: key of the contig in the dictionary returned by
            getFastaDictionary(reference).
        extra_args: optional string appended verbatim to the command line.
    """
    #Check the fixtures with TestCase assertions rather than `assert`
    #statements, which are stripped when Python runs with -O
    self.assertTrue(len(glob.glob(reads + "*.fast5")) > 0,
                    "Didn't find zymo test MinION reads")
    self.assertTrue(os.path.isfile(reference),
                    "Didn't find zymo reference sequence")

    alignment_command = "./runSignalAlign -d={reads} -r={ref} -smt=threeState -o={testDir} ".format(
        reads=reads, ref=reference, testDir="./signalAlign_unittest/")
    if extra_args is not None:
        alignment_command += extra_args

    #Discard the tool's output; the with-block closes the devnull handle
    with open(os.devnull, 'w') as null_output:
        result = call(alignment_command, shell=True, bufsize=-1,
                      stdout=null_output, stderr=null_output)
    self.assertEqual(
        result, 0,
        "error running signalAlign alignments command was {}".format(alignment_command))

    test_alignments = glob.glob("./signalAlign_unittest/tempFiles_alignment/*.tsv")
    reference_sequence = getFastaDictionary(reference)[contig_name]

    expected_count = len(glob.glob(true_alignments + "*.tsv"))
    self.assertEqual(
        len(test_alignments), expected_count,
        "Didn't make all alignments got {got} should be {should}".format(
            got=len(test_alignments), should=expected_count))

    for alignment in test_alignments:
        alignment_file = alignment.split("/")[-1]
        expected = parse_alignment_full(true_alignments + alignment_file)
        obs = parse_alignment_full(alignment)
        self.assertEqual(len(obs), len(expected))
        #NOTE(review): presumably `obs` is a pandas DataFrame, so row[0] is
        #the index and the data columns start at row[1] - confirm against
        #parse_alignment_full.
        for row in obs.itertuples():
            ref_pos = row[1]
            obs_kmer = row[2]
            strand = row[3]
            exp_kmer = reference_sequence[ref_pos:ref_pos + kmer_length]
            self.assertEqual(
                obs_kmer, exp_kmer,
                msg="kmer at index {idx} on strand {strand} is {obs} should be "
                    "{exp}, file {f}".format(idx=ref_pos, strand=strand,
                                             obs=obs_kmer, exp=exp_kmer,
                                             f=alignment))