def find_csr(args, pool):
    # TODO:
    # validate the params
    # throw an error if a program returns an error
    tmpDir = tempfile.mkdtemp(prefix="seanome_", dir=".")
    nhmmer_dir = os.path.join(tmpDir, "nhmmer_dir")
    os.mkdir(nhmmer_dir)
    aliFiles = os.listdir(args.alignments_dir)
    logging.debug("Finding %s alignments from %s in ref %s: \n" %
                  (len(aliFiles), args.alignments_dir, args.genome))
    pool.map(runInstance, [ProgramRunner("nhmmer",
                                         [os.path.join(args.alignments_dir, x),
                                          args.genome,
                                          os.path.join(nhmmer_dir, x.split(".")[0])])
                           for x in aliFiles])
    logging.debug("Parsing outputs")
    nhmmerToAli = NHMMER_TO_ALI(args, args.alignments_dir, nhmmer_dir,
                                args.genome, args.fasta_output)
    nhmmerToAli.run()
    logging.debug("Aligning hits")
    # Run multiple sequence alignment
    pool.map(runInstance, [ProgramRunner("muscle",
                                         [os.path.join(args.fasta_output, x),
                                          os.path.join(args.alis_output, x)])
                           for x in aliFiles])
    logging.debug("Finished finding common shared regions between %s and %s" %
                  (args.alignments_dir, args.genome))
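# runInstance and ProgramRunner are defined elsewhere in this repo. As a minimal
# sketch of the contract that pool.map relies on above -- an assumption based on
# usage here, not the actual implementation -- runInstance only needs to be a
# picklable top-level function that calls the runner's run() method:
def _runInstance_sketch(runner):
    # multiprocessing.Pool can only dispatch top-level functions, which is
    # presumably why runInstance exists instead of mapping a bound method.
    return runner.run()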
def find_seed_csr(args, pool):
    outName = os.path.splitext(os.path.basename(args.input_1))[0] + "_" + \
              os.path.splitext(os.path.basename(args.input_2))[0] + ".lastz"
    logging.debug("Finding seed common shared regions between %s and %s: \n Starting... " %
                  (args.input_1, args.input_2))
    prog = ProgramRunner("lastz", [args.input_1, args.input_2, args.min_csr_sim, outName])
    prog.run()
    generateAlis(args, outName)
    logging.debug("Finished finding seed common shared regions between %s and %s" %
                  (args.input_1, args.input_2))
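# ProgramRunner is assumed to expand a named command template with the given
# positional parameters and shell out to it. A hypothetical sketch -- the real
# class and its command strings live elsewhere in the repo, and the lastz
# template below is a guess, not the actual invocation:
import subprocess

class _ProgramRunnerSketch(object):
    # Hypothetical command template keyed by program name.
    commands = {"lastz": "lastz %s %s --identity=%s --format=general > %s"}

    def __init__(self, program, params):
        self.command = self.commands[program] % tuple(params)

    def run(self):
        # Executes the expanded command line and returns its exit status.
        return subprocess.call(self.command, shell=True)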
def consensus(args, pool):
    aliFiles = os.listdir(args.alis_dir)
    pool.map(runInstance, [ProgramRunner("addConsensus",
                                         [os.path.join(args.alis_dir, x),
                                          os.path.join(args.cons_output, x)])
                           for x in aliFiles])
def resolveMultipleHits(args, pool):
    # Cluster each sample independently, then merge all representatives into allReps.fasta
    repsDir = "Reps"
    repsFasta = "allReps.fasta"       # fasta file for all cluster representatives
    repsClustersDir = "Clusters"      # directory containing the output of clustering
    correctedResultsDir = "correctedMultiplesHits"
    logging.debug("resolveMultipleHits: Started process for resolving multiple hits")
    samples = [sample.rstrip() for sample in open(args.samplesFile.name, 'r')]
    makeDirOrdie(os.path.join(args.clustersDir, "clusters"))
    pool.map(runInstance, [ProgramRunner("CLUSTER_COMMAND",
                                         [os.path.join(args.multipleFastaDir, sample + ".fasta"),
                                          os.path.join(args.clustersDir, "clusters", sample)])
                           for sample in samples])
    logging.debug("resolveMultipleHits: Done clustering of multiple reads")
    # Directory where all cluster representatives will be set up
    makeDirOrdie(os.path.join(args.clustersDir, repsDir))
    allRepsFasta = open(os.path.join(args.clustersDir, repsDir, repsFasta), 'w')
    # WARNING: this should work fine as long as each .clstr file is not very large.
    for sample in samples:
        with open(os.path.join(args.clustersDir, "clusters", sample)) as infile:
            allRepsFasta.write(infile.read())
    allRepsFasta.close()
    logging.debug("resolveMultipleHits: concatenated all cluster representatives")
    # Now cluster all the representatives
    makeDirOrdie(os.path.join(args.clustersDir, repsDir, repsClustersDir))
    runInstance(ProgramRunner("CLUSTER_COMMAND",
                              [os.path.join(args.clustersDir, repsDir, repsFasta),
                               os.path.join(args.clustersDir, repsDir, repsClustersDir, repsFasta)]))
    logging.debug("resolveMultipleHits: done clustering the reps %s" %
                  os.path.join(args.clustersDir, repsDir, repsFasta))
    # TODO: these paths are hardcoded and should be derived from args instead
    cdHitParser = CD_HitParser("data/samples.ids",
                               "data/resolveMultiples/Reps/Clusters/allReps.fasta.clstr",
                               "data/resolveMultiples/clusters/",
                               "data/blastResults/MULTIPLE/")
    makeDirOrdie(os.path.join(args.clustersDir, correctedResultsDir))
    cdHitParser.run(os.path.join(args.clustersDir, correctedResultsDir))
    logging.debug("resolveMultipleHits: Finished resolving multiple hits")
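# makeDirOrdie is defined elsewhere in the repo; judging from the name, it
# creates a directory and aborts if it cannot. A minimal sketch under that
# assumption (refusing to reuse an existing directory is a guess):
import sys

def _makeDirOrdie_sketch(dirPath):
    if not os.path.isdir(dirPath):
        os.makedirs(dirPath)
    else:
        # Refuse to silently overwrite results from a previous run.
        logging.error("Directory %s already exists" % dirPath)
        sys.exit("Directory %s already exists" % dirPath)
    return dirPath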
def processSubtype(args, pool):
    logging.debug("SUBTYPE: Running blast for subtyping")
    samples = [sample.rstrip() for sample in open(args.samplesFile.name, 'r')]
    makeDirOrdie(args.blastOutDir)
    pool.map(runInstance, [ProgramRunner("BLAST_COMMAND",
                                         [os.path.join(args.hitsDir, sample + ".fasta"),
                                          blast_db,
                                          os.path.join(args.blastOutDir, sample + ".out")])
                           for sample in samples])
    logging.debug("SUBTYPE: Done running blast for subtyping")
    logging.debug("SUBTYPE: Parsing blast output files")
    makeDirOrdie(args.blastResults)
    for category in ["PERFECT", "UNIQUE", "MULTIPLE", "SHORT", "NEW", "SHORTNEW"]:
        makeDirOrdie(os.path.join(args.blastResults, category))
    pool.map(runInstance, [BlastParser(os.path.join(args.blastOutDir, sample + ".out"),
                                       args.blastResults)
                           for sample in samples])
    logging.debug("SUBTYPE: Done parsing blast output files")
    logging.debug("SUBTYPE: Generating formatted output")
    generateSubtypeCounts(args.samplesFile.name, args.blastResults, "UNIQUE")
    generateSubtypeCounts(args.samplesFile.name, args.blastResults, "PERFECT")
    # SHORTNEW has a different meaning: it does not show the distribution of
    # subtype hits, but the closest subtype that these short, new sequences hit.
    # The hits, however, are not significant enough to assign the sequences to a subtype.
    generateSubtypeCounts(args.samplesFile.name, args.blastResults, "SHORTNEW")
    logging.debug("SUBTYPE: Done generating formatted output")
    logging.debug("SUBTYPE: Extracting multiple hits")
    splitFastaRegEx = os.path.join(args.fastaFilesDir, "%s" + ".fasta")
    inFileRegEx = os.path.join(args.blastResults, "MULTIPLE", "%s" + ".out")
    makeDirOrdie(os.path.join(args.blastResults, "MULTIPLE", "fasta"))
    outputFastaRegEx = os.path.join(args.blastResults, "MULTIPLE", "fasta", "%s" + ".fasta")
    extractSeqsFromHits(splitFastaRegEx, inFileRegEx, outputFastaRegEx, 0,
                        args.samplesFile.name, pool)
    logging.debug("SUBTYPE: Done extracting multiple hits")
    logging.debug("SUBTYPE: Completed")
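# extractSeqsFromHits is defined elsewhere. The "%s" templates above suggest it
# substitutes each sample name into the three path patterns, collects the hit
# IDs from the input file, and writes the matching records from the split fasta.
# A rough serial sketch under those assumptions (idCol is the column holding the
# sequence ID; the real helper presumably parallelizes over the pool):
def _extractSeqsFromHits_sketch(splitFastaRegEx, inFileRegEx, outputFastaRegEx,
                                idCol, samplesFile, pool):
    for sample in (line.rstrip() for line in open(samplesFile)):
        # IDs of sequences that had hits for this sample.
        hitIds = set(line.split()[idCol] for line in open(inFileRegEx % sample))
        records = [rec for rec in SeqIO.parse(splitFastaRegEx % sample, 'fasta')
                   if rec.id in hitIds]
        SeqIO.write(records, open(outputFastaRegEx % sample, 'w'), 'fasta')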
def maskGenome(args, pool):
    # Can eventually be parallelized using the pool of threads
    name = os.path.splitext(os.path.basename(args.input))[0]
    logging.debug("Starting the masking of input %s" % args.input)
    logging.debug("Computing frequencies for finding repeats")
    prog = ProgramRunner("build_lmer_table", [args.input, name + ".freqs"])
    prog.run()
    logging.debug("Finding repeats in %s" % args.input)
    prog = ProgramRunner("repeatScout", [args.input, name + "_repeats.fa", name + ".freqs"])
    prog.run()
    # Filter the repeats that do not pass the requirements.
    # For now, this only consists of dropping reads shorter than args.min_length.
    logging.debug("Filtering repeats")
    repeats = SeqIO.parse(name + "_repeats.fa", 'fasta')
    filterReads = [read for read in repeats if len(read.seq) >= args.min_length]
    SeqIO.write(filterReads, open(name + "_repeats_filtered.fa", 'w'), 'fasta')
    if len(filterReads) >= 1:
        prog = ProgramRunner("bowtie-build", [args.input, name])
        prog.run()
        prog = ProgramRunner("bowtie-align", [name, name + "_repeats_filtered.fa",
                                              name + "_repeats_filtered.sam"])
        prog.run()
        maskFile(args, name + "_repeats_filtered.sam")
    else:
        logging.debug("No repeats found in %s" % args.input)
    logging.debug("Done masking input %s" % args.input)
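# maskFile is defined elsewhere. Presumably it hard-masks the regions of the
# input genome that the filtered repeats aligned to. A simplified sketch under
# that assumption: it trusts the read length instead of the CIGAR string (so
# indels in the alignment are ignored), and the output name is a hypothetical choice.
from Bio.Seq import Seq

def _maskFile_sketch(args, samFile):
    # Collect 0-based [start, end) intervals per reference sequence from the SAM file.
    intervals = {}
    for line in open(samFile):
        if line.startswith('@'):
            continue
        fields = line.split('\t')
        if int(fields[1]) & 4:  # skip unmapped reads
            continue
        start = int(fields[3]) - 1
        intervals.setdefault(fields[2], []).append((start, start + len(fields[9])))
    # Replace every covered base with 'N' and write the masked genome.
    records = []
    for rec in SeqIO.parse(args.input, 'fasta'):
        bases = list(str(rec.seq))
        for start, end in intervals.get(rec.id, []):
            end = min(end, len(bases))
            bases[start:end] = 'N' * (end - start)
        rec.seq = Seq(''.join(bases))
        records.append(rec)
    outName = os.path.splitext(os.path.basename(args.input))[0] + "_masked.fa"
    SeqIO.write(records, open(outName, 'w'), 'fasta')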
def processClades(args, pool=None):
    # Avoid creating a Pool as a default argument (it would be built at import time).
    if pool is None:
        pool = Pool(processes=1)
    logging.debug('CLADE: Processing clades for: %s ' % args.inFile.name)
    # Split the fasta file.
    # Put all the directories at the same level as the inFile.
    fastaFilesDir = os.path.join(os.path.dirname(args.inFile.name), "fasta")
    fastaList = Helpers.splitFileBySample(args.inFile.name, args.samplesFile.name, fastaFilesDir)
    # Run hmmscan
    hmmerOutputDir = os.path.join(os.path.dirname(args.inFile.name), "hmmer_output")
    makeDirOrdie(hmmerOutputDir)
    logging.debug('CLADE: Starting hmmscans for %s files ' % len(fastaList))
    pool.map(runInstance, [ProgramRunner("HMMER_COMMAND",
                                         [hmmer_db,
                                          os.path.join(fastaFilesDir, x),
                                          os.path.join(hmmerOutputDir, x.split(".")[0])])
                           for x in fastaList])
    logging.debug('CLADE: Done with hmmscans')
    # Parse the hmmscan output
    parsedHmmerOutputDir = os.path.join(os.path.dirname(args.inFile.name), "hmmer_parsedOutput")
    makeDirOrdie(parsedHmmerOutputDir)
    logging.debug('CLADE: Parsing Hmmer outputs for %s files ' % len(fastaList))
    # Make dirs in hmmer_parsedOutput with the sample names
    pool.map(makeDirOrdie, [os.path.join(parsedHmmerOutputDir, x.split(".")[0])
                            for x in fastaList])
    logging.debug('CLADE: Starting parsing for %s files ' % len(fastaList))
    samples = [sample.rstrip() for sample in open(args.samplesFile.name, 'r')]
    pool.map(runInstance, [CladeParser(os.path.join(hmmerOutputDir, sample + ".out"),
                                       os.path.join(parsedHmmerOutputDir, sample),
                                       args.evalue)
                           for sample in samples])
    logging.debug('CLADE: Done parsing Hmmer outputs for %s files ' % len(fastaList))
    # Generate tables and pie-charts
    logging.debug("CLADE: Generating formatted output")
    makeCladeDistribTable(args.samplesFile.name,
                          os.path.join(os.path.dirname(args.inFile.name), "hmmer_parsedOutput"))
    generateCladeBreakdown(args.samplesFile.name,
                           os.path.join(os.path.dirname(args.inFile.name), "hmmer_parsedOutput"),
                           "HIT", 4)
    logging.debug("CLADE: Done generating formatted output")
    # Create the filename templates used to split the files
    splitFastaRegEx = os.path.join(fastaFilesDir, "%s" + ".fasta")
    inFileRegEx = os.path.join(os.path.dirname(args.inFile.name), "hmmer_parsedOutput", "%s", "HIT")
    makeDirOrdie(os.path.join(os.path.dirname(args.inFile.name), "hmmer_hits"))
    outputFastaRegEx = os.path.join(os.path.dirname(args.inFile.name), "hmmer_hits", "%s" + ".fasta")
    extractSeqsFromHits(splitFastaRegEx, inFileRegEx, outputFastaRegEx, 0,
                        args.samplesFile.name, pool)
    logging.debug('CLADE: Done with Clade run')
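# Helpers.splitFileBySample is defined elsewhere. Judging from its use above, it
# demultiplexes the combined fasta into one file per sample under outDir and
# returns the list of file names. A rough sketch, assuming each record ID is
# prefixed with its sample name (that prefix convention is a guess):
def _splitFileBySample_sketch(inFasta, samplesFile, outDir):
    if not os.path.isdir(outDir):
        os.makedirs(outDir)
    samples = [line.rstrip() for line in open(samplesFile)]
    handles = dict((s, open(os.path.join(outDir, s + ".fasta"), 'w')) for s in samples)
    for rec in SeqIO.parse(inFasta, 'fasta'):
        # Route each record to the first sample whose name prefixes its ID.
        for sample in samples:
            if rec.id.startswith(sample):
                SeqIO.write(rec, handles[sample], 'fasta')
                break
    for handle in handles.values():
        handle.close()
    return [s + ".fasta" for s in samples]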