예제 #1
0
파일: Seanome.py 프로젝트: mahdi-b/Seanome
def find_csr(args, pool):
    """Search the genome with every alignment file, then align the parsed hits.

    Steps, in order:
      1. create a scratch directory ``seanome_*`` under the current directory
         containing an ``nhmmer_dir`` sub-directory;
      2. dispatch one "nhmmer" ProgramRunner per entry of
         ``args.alignments_dir`` through ``pool.map``;
      3. run NHMMER_TO_ALI over the nhmmer results;
      4. dispatch one "muscle" ProgramRunner per entry, reading from
         ``args.fasta_output`` and writing to ``args.alis_output``.

    args -- namespace providing ``alignments_dir``, ``genome``,
            ``fasta_output`` and ``alis_output``.
    pool -- object exposing ``map(func, iterable)``.

    The scratch directory is not removed when the function returns.
    """
    # TODO: validate the params
    # TODO: throw error if program returns error

    scratch_root = tempfile.mkdtemp(prefix="seanome_", dir=".")
    hits_dir = os.path.join(scratch_root, "nhmmer_dir")
    os.mkdir(hits_dir)

    alignment_names = os.listdir(args.alignments_dir)
    logging.debug("Finding %s alignments from  %s in ref %s: \n" %
                  (len(alignment_names), args.alignments_dir, args.genome))

    search_jobs = []
    for fname in alignment_names:
        # The nhmmer output is named after the text before the first "."
        # of the alignment file name.
        search_jobs.append(
            ProgramRunner("nhmmer", [
                os.path.join(args.alignments_dir, fname),
                args.genome,
                os.path.join(hits_dir, fname.split(".")[0]),
            ]))
    pool.map(runInstance, search_jobs)

    logging.debug("Parsing outputs")
    converter = NHMMER_TO_ALI(args, args.alignments_dir, hits_dir,
                              args.genome, args.fasta_output)
    converter.run()

    logging.debug("Aligning hits")
    # Run multiple sequence alignment, one job per original alignment file.
    align_jobs = []
    for fname in alignment_names:
        align_jobs.append(
            ProgramRunner("muscle", [
                os.path.join(args.fasta_output, fname),
                os.path.join(args.alis_output, fname),
            ]))
    pool.map(runInstance, align_jobs)

    logging.debug("Finished finding common shared regions between %s and %s" %
                  (args.alignments_dir, args.genome))
예제 #2
0
파일: Seanome.py 프로젝트: mahdi-b/Seanome
def find_seed_csr(args, pool):
    """Run "lastz" on the two inputs and hand its output file to generateAlis.

    The output file is named ``<stem of input_1>_<stem of input_2>.lastz``,
    where a stem is the input's base name without its last extension, so it
    is a relative path without any directory component.

    args -- namespace providing ``input_1``, ``input_2`` and ``min_csr_sim``.
    pool -- accepted for signature parity with the other commands; unused.
    """
    first_stem = os.path.splitext(os.path.basename(args.input_1))[0]
    second_stem = os.path.splitext(os.path.basename(args.input_2))[0]
    lastz_out = first_stem + "_" + second_stem + ".lastz"

    logging.debug("Finding seed common shared regions between %s and %s: \n Starting... " % (args.input_1, args.input_2))

    lastz_params = [args.input_1, args.input_2, args.min_csr_sim, lastz_out]
    ProgramRunner("lastz", lastz_params).run()

    generateAlis(args, lastz_out)
    logging.debug("Finished finding seed common shared regions between %s and %s" % (args.input_1, args.input_2))
예제 #3
0
파일: Seanome.py 프로젝트: mahdi-b/Seanome
def find_seed_csr(args, pool):
    """Run "lastz" on ``args.input_1`` / ``args.input_2`` and build alignments.

    The lastz output file name joins the two inputs' base names (last
    extension stripped) with "_" and appends ".lastz"; that name is then
    passed to generateAlis together with ``args``.

    args -- namespace providing ``input_1``, ``input_2`` and ``min_csr_sim``.
    pool -- unused by this command.
    """

    def _stem(path):
        # Base name of *path* without its last extension.
        return os.path.splitext(os.path.basename(path))[0]

    result_name = "_".join([_stem(args.input_1), _stem(args.input_2)])
    result_name = result_name + ".lastz"

    start_msg = (
        "Finding seed common shared regions between %s and %s: \n Starting... "
        % (args.input_1, args.input_2))
    logging.debug(start_msg)

    runner = ProgramRunner(
        "lastz",
        [args.input_1, args.input_2, args.min_csr_sim, result_name])
    runner.run()
    generateAlis(args, result_name)

    done_msg = (
        "Finished finding seed common shared regions between %s and %s" %
        (args.input_1, args.input_2))
    logging.debug(done_msg)
예제 #4
0
파일: Seanome.py 프로젝트: mahdi-b/Seanome
def consensus(args, pool):
    """Dispatch one "addConsensus" job per entry of ``args.alis_dir``.

    Each job receives the entry's path inside ``args.alis_dir`` and a path
    with the same file name inside ``args.cons_output``.  The jobs are
    executed through ``pool.map(runInstance, ...)``.
    """
    jobs = []
    for fname in os.listdir(args.alis_dir):
        source_path = os.path.join(args.alis_dir, fname)
        target_path = os.path.join(args.cons_output, fname)
        jobs.append(ProgramRunner("addConsensus", [source_path, target_path]))
    pool.map(runInstance, jobs)
예제 #5
0
def resolveMultipleHits(args, pool):
    """Cluster each sample's multiple-hit reads, then cluster the representatives.

    Steps, in order:
      1. read the sample names, one per line, from ``args.samplesFile.name``;
      2. run "CLUSTER_COMMAND" for every sample, from
         ``<args.multipleFastaDir>/<sample>.fasta`` to
         ``<args.clustersDir>/clusters/<sample>``;
      3. concatenate all per-sample outputs into
         ``<args.clustersDir>/Reps/allReps.fasta``;
      4. run "CLUSTER_COMMAND" on that file, writing to
         ``<args.clustersDir>/Reps/Clusters/allReps.fasta``;
      5. run CD_HitParser, writing into
         ``<args.clustersDir>/correctedMultiplesHits``.

    args -- namespace providing ``samplesFile`` (object with a ``name``
            attribute), ``clustersDir`` and ``multipleFastaDir``.
    pool -- object exposing ``map(func, iterable)``.
    """
    # Local import: keeps this block self-contained.
    import shutil

    repsDir = "Reps"
    repsFasta = "allReps.fasta"  # fasta file for all cluster representatives
    repsClustersDir = "Clusters"  # directory containing output of clustering
    correctedResultsDir = "correctedMultiplesHits"
    logging.debug("resolveMultipleHits: Started process for resolving multiple hits")

    # Close the samples file deterministically instead of leaking the handle.
    with open(args.samplesFile.name, 'r') as samplesHandle:
        samples = [line.rstrip() for line in samplesHandle]

    sampleClustersDir = os.path.join(args.clustersDir, "clusters")
    makeDirOrdie(sampleClustersDir)

    pool.map(runInstance, [
        ProgramRunner("CLUSTER_COMMAND", [
            os.path.join(args.multipleFastaDir, sample + ".fasta"),
            os.path.join(sampleClustersDir, sample),
        ]) for sample in samples
    ])
    logging.debug("resolveMultipleHits: Done clustering of multiple reads")

    # Directory where all cluster representatives will be set up
    makeDirOrdie(os.path.join(args.clustersDir, repsDir))
    allRepsPath = os.path.join(args.clustersDir, repsDir, repsFasta)

    # Stream every per-sample file into the combined file chunk by chunk, so
    # large cluster files are never held in memory as a whole.  The ``with``
    # blocks close both handles even when a copy fails.
    with open(allRepsPath, 'w') as allRepsFasta:
        for sample in samples:
            with open(os.path.join(sampleClustersDir, sample)) as infile:
                shutil.copyfileobj(infile, allRepsFasta)
    logging.debug("resolveMultipleHits: clustering concatenated all reps")

    # Now cluster all the representatives
    makeDirOrdie(os.path.join(args.clustersDir, repsDir, repsClustersDir))
    runInstance(ProgramRunner("CLUSTER_COMMAND", [
        allRepsPath,
        os.path.join(args.clustersDir, repsDir, repsClustersDir, repsFasta),
    ]))
    logging.debug("resolveMultipleHits: done clustering the reps %s" % allRepsPath)

    # NOTE(review): these paths are hard-coded and ignore ``args`` -- they only
    # line up with the directories created above when the caller uses the
    # matching "data/..." layout.  Confirm whether they should be derived from
    # args.samplesFile / args.clustersDir instead.
    cdHitParser = CD_HitParser(
        "data/samples.ids",
        "data/resolveMultiples/Reps/Clusters/allReps.fasta.clstr",
        "data/resolveMultiples/clusters/",
        "data/blastResults/MULTIPLE/",
    )

    makeDirOrdie(os.path.join(args.clustersDir, correctedResultsDir))
    cdHitParser.run(os.path.join(args.clustersDir, correctedResultsDir))

    logging.debug("resolveMultipleHits: Finished resolving multiple hits")
예제 #6
0
def processSubtype(args, pool):
    """Run the subtyping stage: blast each sample, parse, tabulate, extract.

    Steps, in order:
      1. read the sample names, one per line, from ``args.samplesFile.name``;
      2. run "BLAST_COMMAND" for every sample, from
         ``<args.hitsDir>/<sample>.fasta`` against the module-level
         ``blast_db`` to ``<args.blastOutDir>/<sample>.out``;
      3. create ``args.blastResults`` and its per-category sub-directories,
         then run a BlastParser on every blast output;
      4. call generateSubtypeCounts for the UNIQUE, PERFECT and SHORTNEW
         categories;
      5. call extractSeqsFromHits to write the MULTIPLE hits as fasta under
         ``<args.blastResults>/MULTIPLE/fasta``.

    args -- namespace providing ``samplesFile`` (object with a ``name``
            attribute), ``blastOutDir``, ``hitsDir``, ``blastResults`` and
            ``fastaFilesDir``.
    pool -- object exposing ``map(func, iterable)``; also forwarded to
            extractSeqsFromHits.
    """
    logging.debug("SUBTYPE: Running blast for subtyping")

    # Close the samples file deterministically instead of leaking the handle.
    with open(args.samplesFile.name, 'r') as samplesHandle:
        samples = [line.rstrip() for line in samplesHandle]

    makeDirOrdie(args.blastOutDir)
    pool.map(runInstance, [
        ProgramRunner("BLAST_COMMAND", [
            os.path.join(args.hitsDir, sample + ".fasta"),
            blast_db,
            os.path.join(args.blastOutDir, sample + ".out"),
        ]) for sample in samples
    ])
    logging.debug("SUBTYPE: Done running blast for subtyping")

    logging.debug("SUBTYPE: Parsing blast output files")
    makeDirOrdie(args.blastResults)
    # One result sub-directory per category, created in the original order.
    for category in ("PERFECT", "UNIQUE", "MULTIPLE", "SHORT", "NEW", "SHORTNEW"):
        makeDirOrdie(os.path.join(args.blastResults, category))

    pool.map(runInstance, [
        BlastParser(os.path.join(args.blastOutDir, sample + ".out"), args.blastResults)
        for sample in samples
    ])
    logging.debug("SUBTYPE: Done parsing blast output files")

    logging.debug("SUBTYPE:Generating formatted output")
    generateSubtypeCounts(args.samplesFile.name, args.blastResults, "UNIQUE")
    generateSubtypeCounts(args.samplesFile.name, args.blastResults, "PERFECT")
    # SHORTNEW has a different meaning: it does not show the distribution of
    # subtype hits, but the closest subtype these short new sequences are
    # hitting.  The hits are not significant enough to assign the sequences
    # to those subtypes.
    generateSubtypeCounts(args.samplesFile.name, args.blastResults, "SHORTNEW")
    logging.debug("SUBTYPE:Done Generating formatted output")

    logging.debug("SUBTYPE:Extracting multiple hits")
    # The three values below are "%s" path templates, not regular expressions.
    splitFastaRegEx = os.path.join(args.fastaFilesDir, "%s" + ".fasta")
    inFileRegEx = os.path.join(args.blastResults, "MULTIPLE", "%s" + ".out")
    makeDirOrdie(os.path.join(args.blastResults, "MULTIPLE", "fasta"))
    outputFastaRegEx = os.path.join(args.blastResults, "MULTIPLE", "fasta", "%s" + ".fasta")
    extractSeqsFromHits(splitFastaRegEx, inFileRegEx, outputFastaRegEx, 0, args.samplesFile.name, pool)
    logging.debug("SUBTYPE:Done Extracting multiple hits")
    logging.debug("SUBTYPE:Completed")
예제 #7
0
파일: Seanome.py 프로젝트: mahdi-b/Seanome
def maskGenome(args, pool):
    """Find repeats in ``args.input`` and mask them.

    ``name`` below is the base name of ``args.input`` without its last
    extension; every intermediate file is written relative to the current
    directory using that name.  ProgramRunner commands run in this order:
      1. "build_lmer_table": ``args.input`` -> ``<name>.freqs``;
      2. "repeatScout": ``args.input`` + ``<name>.freqs`` ->
         ``<name>_repeats.fa``;
      3. repeats shorter than ``args.min_length`` are dropped and the rest is
         written to ``<name>_repeats_filtered.fa``;
      4. if at least one repeat remains: "bowtie-build", "bowtie-align"
         (-> ``<name>_repeats_filtered.sam``) and finally maskFile.

    args -- namespace providing ``input`` and ``min_length``; it is also
            passed on to maskFile.
    pool -- unused here.
    """
    # Single-argument parenthesised print: same output as the Python 2
    # print statement.
    print(args.input)
    name = os.path.splitext(os.path.basename(args.input))[0]
    repeatsPath = name + "_repeats.fa"
    filteredPath = name + "_repeats_filtered.fa"
    freqsPath = name + ".freqs"

    # Can eventually be parallelized using the pool of threads
    logging.debug("Starting the masking of input %s" % args.input)
    logging.debug("Computing frequencies for finding repeats" )
    ProgramRunner("build_lmer_table", [args.input, freqsPath]).run()

    logging.debug("Finding repeats in the %s" % args.input)
    ProgramRunner("repeatScout", [args.input, repeatsPath, freqsPath]).run()

    # Filter the repeats that do not pass the requirements;
    # for now, this only consists in dropping short reads < N
    logging.debug("Filetering repeats")
    filterReads = [
        read for read in SeqIO.parse(repeatsPath, 'fasta')
        if len(read.seq) >= args.min_length
    ]
    # Close (and therefore flush) the filtered file explicitly before the
    # external aligner reads it, rather than relying on garbage collection.
    with open(filteredPath, 'w') as filteredHandle:
        SeqIO.write(filterReads, filteredHandle, 'fasta')

    if filterReads:
        samPath = name + "_repeats_filtered.sam"
        ProgramRunner("bowtie-build", [args.input, name]).run()
        ProgramRunner("bowtie-align", [name, filteredPath, samPath]).run()
        maskFile(args, samPath)
    else:
        logging.debug("No repeats found is %s" % args.input)
    logging.debug("Done masking input %s" % args.input)
예제 #8
0
def processClades(args, pool=Pool(processes=1)):
    """Run the clade stage: split by sample, hmmscan, parse, tabulate, extract.

    All output directories are created next to ``args.inFile.name``:
    ``fasta`` (per-sample split), ``hmmer_output``, ``hmmer_parsedOutput``
    and ``hmmer_hits``.

    Steps, in order:
      1. Helpers.splitFileBySample splits the input into per-sample fasta
         files and returns their names;
      2. "HMMER_COMMAND" runs for every split file against the module-level
         ``hmmer_db``;
      3. a CladeParser runs for every sample listed in
         ``args.samplesFile.name`` with threshold ``args.evalue``;
      4. makeCladeDistribTable and generateCladeBreakdown build the
         formatted output;
      5. extractSeqsFromHits writes the "HIT" sequences to ``hmmer_hits``.

    args -- namespace providing ``inFile`` and ``samplesFile`` (objects with
            a ``name`` attribute) and ``evalue``.
    pool -- object exposing ``map(func, iterable)``.

    NOTE(review): the default ``Pool(processes=1)`` is built once, when this
    ``def`` statement executes, and is shared by every call that omits
    ``pool``.  It is kept unchanged for interface compatibility.
    """
    logging.debug('CLADE:Processing caldes for: %s ' % args.inFile.name)

    # Put all the directories at the same level as the inFile.
    baseDir = os.path.dirname(args.inFile.name)

    # Split fasta file
    fastaFilesDir = os.path.join(baseDir, "fasta")
    fastaList = Helpers.splitFileBySample(args.inFile.name, args.samplesFile.name, fastaFilesDir)

    # Running HMMscan
    hmmerOutputDir = os.path.join(baseDir, "hmmer_output")
    makeDirOrdie(hmmerOutputDir)
    logging.debug('CLADE: Starting hmmscans for %s files ' % len(fastaList))
    pool.map(runInstance, [
        ProgramRunner("HMMER_COMMAND", [
            hmmer_db,
            os.path.join(fastaFilesDir, x),
            # Output is named after the text before the first "." of x.
            os.path.join(hmmerOutputDir, x.split(".")[0]),
        ]) for x in fastaList
    ])
    logging.debug('CLADE: Done with hmmscans')

    # Parse HMMscan
    parsedHmmerOutputDir = os.path.join(baseDir, "hmmer_parsedOutput")
    makeDirOrdie(parsedHmmerOutputDir)
    logging.debug('CLADE:Parsing Hmmer outputs for %s files ' % len(fastaList))

    # Making dirs in hmmer_parsedOutput with the sample names
    pool.map(makeDirOrdie, [os.path.join(parsedHmmerOutputDir, x.split(".")[0]) for x in fastaList])

    logging.debug('CLADE: Starting parsing for  for %s files ' % len(fastaList))
    # Close the samples file deterministically instead of leaking the handle.
    with open(args.samplesFile.name, 'r') as samplesHandle:
        samples = [line.rstrip() for line in samplesHandle]

    # Diagnostic output kept from the original.  The two lines that show the
    # first sample's paths are skipped for an empty samples file, where
    # indexing samples[0] would raise IndexError.
    print(samples)
    if samples:
        print(os.path.join(hmmerOutputDir, samples[0] + ".out"))
        print(os.path.join(parsedHmmerOutputDir, samples[0]))
    print(args.evalue)

    pool.map(runInstance, [
        CladeParser(os.path.join(hmmerOutputDir, sample + ".out"),
                    os.path.join(parsedHmmerOutputDir, sample),
                    args.evalue)
        for sample in samples
    ])
    logging.debug('CLADE:Done Parsing Hmmer outputs for %s files ' % len(fastaList))

    # Generate tables and pie-charts
    logging.debug("CLADE:Generating formatted output")
    makeCladeDistribTable(args.samplesFile.name, parsedHmmerOutputDir)
    generateCladeBreakdown(args.samplesFile.name, parsedHmmerOutputDir, "HIT", 4)
    logging.debug("CLADE:Done Generating formatted output")

    # Build the "%s" path templates (not regular expressions) used to locate
    # the per-sample files.
    splitFastaRegEx = os.path.join(fastaFilesDir, "%s" + ".fasta")
    inFileRegEx = os.path.join(parsedHmmerOutputDir, "%s", "HIT")
    hitsDir = os.path.join(baseDir, "hmmer_hits")
    makeDirOrdie(hitsDir)
    outputFastaRegEx = os.path.join(hitsDir, "%s" + ".fasta")

    extractSeqsFromHits(splitFastaRegEx, inFileRegEx, outputFastaRegEx, 0, args.samplesFile.name, pool)

    logging.debug('Done with Clade run')
예제 #9
0
파일: Seanome.py 프로젝트: mahdi-b/Seanome
def maskGenome(args, pool):
    """Locate repeats in ``args.input`` and mask them.

    Intermediate files are written relative to the current directory and
    are prefixed with the base name of ``args.input`` (last extension
    removed).  The pipeline is:
      1. "build_lmer_table" writes ``<name>.freqs``;
      2. "repeatScout" writes ``<name>_repeats.fa``;
      3. repeats with fewer than ``args.min_length`` residues are dropped;
         the remainder goes to ``<name>_repeats_filtered.fa``;
      4. when at least one repeat remains, "bowtie-build" and "bowtie-align"
         run (producing ``<name>_repeats_filtered.sam``) and maskFile is
         called on that SAM file; otherwise only a debug message is logged.

    args -- namespace providing ``input`` and ``min_length``; it is also
            passed on to maskFile.
    pool -- unused here.
    """
    # Single-argument parenthesised print: same output as the Python 2
    # print statement.
    print(args.input)
    name = os.path.splitext(os.path.basename(args.input))[0]
    freqs_file = name + ".freqs"
    repeats_file = name + "_repeats.fa"
    filtered_file = name + "_repeats_filtered.fa"

    # Can eventually be parallelized using the pool of threads
    logging.debug("Starting the masking of input %s" % args.input)
    logging.debug("Computing frequencies for finding repeats")
    prog = ProgramRunner("build_lmer_table", [args.input, freqs_file])
    prog.run()

    logging.debug("Finding repeats in the %s" % args.input)
    prog = ProgramRunner("repeatScout",
                         [args.input, repeats_file, freqs_file])
    prog.run()

    # Filter the repeats that do not pass the requirements;
    # for now, this only consists in dropping short reads < N
    logging.debug("Filetering repeats")
    filterReads = [
        read for read in SeqIO.parse(repeats_file, 'fasta')
        if len(read.seq) >= args.min_length
    ]
    # Write through a context manager so the file is closed and flushed
    # before the external aligner opens it, instead of relying on garbage
    # collection to close an anonymous handle.
    with open(filtered_file, 'w') as filtered_handle:
        SeqIO.write(filterReads, filtered_handle, 'fasta')

    if filterReads:
        sam_file = name + "_repeats_filtered.sam"
        prog = ProgramRunner("bowtie-build", [args.input, name])
        prog.run()
        prog = ProgramRunner("bowtie-align", [name, filtered_file, sam_file])
        prog.run()
        maskFile(args, sam_file)
    else:
        logging.debug("No repeats found is %s" % args.input)
    logging.debug("Done masking input %s" % args.input)