Ejemplo n.º 1
0
def build_profile(input,output_directory):
    """Run TIPP on every marker-gene bin of ``input`` and pool the classifications.

    The fragments in ``input`` are binned with ``bin_to_markers``; for each
    gene ``run_tipp.py`` is launched through the shell, and every gene that
    produced a classification file has its classification written
    individually and merged into a pooled result.  The pooled result is
    written to ``<output_directory>/markers/all.classification.0`` and handed
    to ``write_abundance``.

    Side effects: creates a temporary directory under ``options().tempdir``
    and rebinds the module globals ``taxon_map``, ``level_map`` and
    ``key_map`` from the rpsB reference package taxonomy.

    Args:
        input: path of the fragment file.
        output_directory: directory that receives the ``markers/`` outputs.
    """
    global taxon_map, level_map, key_map, levels
    opts = options()
    reference_path = opts.reference.path

    def refpkg_file(marker, filename):
        # os.path.join also works when the reference path has no trailing
        # separator, which plain string concatenation silently got wrong.
        return os.path.join(reference_path, 'refpkg/%s.refpkg/%s' % (marker, filename))

    temp_dir = tempfile.mkdtemp(dir=opts.tempdir)
    binned_fragments = bin_to_markers(input, temp_dir)

    # The taxonomy is loaded from the rpsB reference package.
    (taxon_map, level_map, key_map) = load_taxonomy(refpkg_file('rpsB', 'all_taxon.taxonomy'))

    # All classifications are pooled here.
    classifications = {}

    # NOTE(review): the command is a shell string built from unquoted paths;
    # paths containing spaces or shell metacharacters will break it.
    command_template = ('run_tipp.py -c %s --cpu %s -m %s -f %s -t %s -adt %s -a %s -r %s '
                        '-tx %s -txm %s -at %0.2f -pt %0.2f -A %d -P %d -p %s -o %s -d %s')

    for (gene, frags) in binned_fragments.items():
        # Size of this marker's reference set.
        with open(refpkg_file(gene, 'sate.size'), 'r') as f:
            total_taxa = int(f.readline().strip())

        # A decomposition larger than the reference set is replaced by half of it.
        decomp_size = opts.alignment_size
        if decomp_size > total_taxa:
            decomp_size = int(total_taxa / 2)

        # Never request more CPUs than there are fragments.
        cpus = opts.cpu
        if len(frags) < cpus:
            cpus = len(frags)

        os.system(command_template % (
            opts.config_file.name,
            cpus,
            opts.molecule,
            temp_dir + "/%s.frags.fas.fixed" % gene,
            refpkg_file(gene, 'sate.taxonomy'),
            refpkg_file(gene, 'sate.tree'),
            refpkg_file(gene, 'sate.fasta'),
            refpkg_file(gene, 'sate.taxonomy.RAxML_info'),
            refpkg_file(gene, 'all_taxon.taxonomy'),
            refpkg_file(gene, 'species.mapping'),
            opts.alignment_threshold,
            opts.placement_threshold,
            decomp_size,
            total_taxa,
            temp_dir + "/temp_file",
            "tipp_%s" % gene,
            output_directory + "/markers/"))

        classification_file = output_directory + "/markers/tipp_%s_classification.txt" % gene
        if not os.path.exists(classification_file):
            # No classification file was produced for this gene; skip it.
            continue

        gene_classification = generate_classification(classification_file, 0)

        # Write the per-gene classification and add it to the pool.
        write_classification(gene_classification, output_directory + "/markers/tipp_%s.classification.0" % gene)
        classifications.update(gene_classification)

    remove_unclassified_level(classifications)
    write_classification(classifications, output_directory + "/markers/all.classification.0")
    write_abundance(classifications, output_directory)
Ejemplo n.º 2
0
def hmmer_search(input, hmmer, output):
    """Run the configured hmmsearch executable on ``input`` with profile ``hmmer``.

    The command line is ``<hmmsearch> --noali -E 10000 --cpu <cpu> -o <output>
    <hmmer> <input>`` and is executed through the shell.  Nothing is returned
    and the exit status of the command is not inspected.
    """
    opts = options()
    executable = opts.hmmsearch.path
    arguments = (executable, opts.cpu, output, hmmer, input)
    command = "%s --noali -E 10000 --cpu %d -o %s %s %s" % arguments
    os.system(command)
Ejemplo n.º 3
0
 def generate_backbone(self):
     """Split the input sequences into a random backbone and a query set.

     A random sample of ``options().backbone_size`` sequences (defaulting to
     ``min(100, 20% of the input)`` when unset) becomes the backbone; the
     remaining sequences become the query set.  Both are written to
     temporary FASTA files, a ``SateAlignJob`` is run on the backbone, and
     the global options are updated so that ``alignment_file`` and
     ``tree_file`` are open handles on the ``sate.fasta`` / ``sate.fasttree``
     files in the output directory and ``fragment_file`` is the query file.
     """
     _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
     sequences = MutableAlignment()
     sequences.read_file_object(self.options.sequence_file)
     if options().backbone_size is None:
         options().backbone_size = min(100, int(.20 * sequences.get_num_taxa()))
         _LOG.info("Backbone size set to: %d" % (options().backbone_size))

     # random.sample() needs a real sequence: on Python 3 a keys() view is
     # rejected, so materialise the names first.
     backbone_names = random.sample(list(sequences.keys()), options().backbone_size)
     backbone_sequences = sequences.get_hard_sub_alignment(backbone_names)
     # Whatever is left in `sequences` after this loop is the query set.
     for name in list(backbone_sequences.keys()):
         sequences.pop(name)

     _LOG.info("Writing query and backbone set. ")
     query = get_temp_file("query", "backbone", ".fas")
     backbone = get_temp_file("backbone", "backbone", ".fas")
     _write_fasta(sequences, query)
     _write_fasta(backbone_sequences, backbone)

     _LOG.info("Generating sate backbone alignment and tree. ")
     satealignJob = SateAlignJob()
     # The job is given 'protein' where the options say 'amino'.
     moleculeType = options().molecule
     if options().molecule == 'amino':
         moleculeType = 'protein'
     satealignJob.setup(backbone, options().backbone_size, self.options.outdir, moleculeType, options().cpu)
     satealignJob.run()
     satealignJob.read_results()

     options().placement_size = self.options.backbone_size
     # These handles are stored on the options object and deliberately left open.
     options().alignment_file = open(self.options.outdir + "/sate.fasta")
     options().tree_file = open(self.options.outdir + "/sate.fasttree")
     _LOG.info("Backbone alignment written to %s.\nBackbone tree written to %s" % (options().alignment_file, options().tree_file))
     options().fragment_file = query
Ejemplo n.º 4
0
 def check_options(self, supply=[]):
     """Check the taxonomy options, then delegate to ``ExhaustiveAlgorithm``.

     When a reference package name is configured it is loaded first (which
     fills in the option values checked below).  A label is appended to
     ``supply`` for each of the taxonomy file and the taxonomy name mapping
     file that is still ``None``; the resulting list is passed on to
     ``ExhaustiveAlgorithm.check_options``.
     """
     opts = options()
     package_name = opts.reference_pkg
     if package_name is not None:
         package_dir = os.path.join(opts.reference.path, 'refpkg/%s.refpkg/' % package_name)
         self.load_reference(package_dir)

     # Evaluated after load_reference so freshly loaded values are seen.
     required = (
         (opts.taxonomy_file, "taxonomy file"),
         (opts.taxonomy_name_mapping_file, "taxonomy name mapping file"),
     )
     missing = [label for (value, label) in required if value is None]
     # Concatenation builds a new list, so the shared default is never mutated.
     ExhaustiveAlgorithm.check_options(self, supply + missing)
Ejemplo n.º 5
0
def main():
    """Parse the options and build a profile from the fragment file.

    The parser is augmented, the parsed options are installed as the
    ``sepp.config`` singleton, ``alignment_size`` falls back to 100 when it
    was not given, and ``build_profile`` is called with the fragment file
    name and the output directory.
    """
    augment_parser()
    sepp.config._options_singelton = sepp.config._parse_options()

    opts = options()
    if opts.alignment_size is None:
        opts.alignment_size = 100

    build_profile(opts.fragment_file.name, opts.outdir)
Ejemplo n.º 6
0
    def check_options(self):
        """Require a backbone tree and alignment, then run the parent checks.

        Sets ``options().info_file`` to a dummy value, logs an error and
        exits with status -1 when the tree or alignment file is missing,
        reads the alignment file once into a ``MutableAlignment``, and
        returns the result of ``ExhaustiveAlgorithm.check_options``.
        """
        # Placeholder value; presumably the parent checks expect info_file
        # to be set -- TODO confirm.
        options().info_file = "A_dummy_value"

        if options().tree_file is None or options().alignment_file is None:
            _LOG.error("Specify the backbone alignment and tree and query sequences")
            exit(-1)

        # The parsed alignment is not used afterwards; the handle is closed
        # as soon as the read has finished instead of being leaked.
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as alignment_handle:
            sequences.read_file_object(alignment_handle)
        return ExhaustiveAlgorithm.check_options(self)
Ejemplo n.º 7
0
 def load_reference(self, reference_pkg):
     """Populate the global options from a reference package directory.

     ``reference_pkg`` is used as a plain string prefix, so it must end with
     a path separator.  ``CONTENTS.json`` in that directory is parsed and
     its ``files`` section decides which files are opened for the taxonomy
     name mapping (``seq_info``), taxonomy (``taxonomy``), alignment
     (``aln_fasta``) and tree (``tree``) options; ``info_file`` is set to
     the path of the ``tree_stats`` entry.
     """
     # The context manager closes the handle even if json.load raises.
     with open(reference_pkg + 'CONTENTS.json') as contents:
         result = json.load(contents)

     # These handles are stored on the options object and stay open.
     options().taxonomy_name_mapping_file = open(reference_pkg + result['files']['seq_info'])
     options().taxonomy_file = open(reference_pkg + result['files']['taxonomy'])
     options().alignment_file = open(reference_pkg + result['files']['aln_fasta'])
     options().tree_file = open(reference_pkg + result['files']['tree'])
     options().info_file = reference_pkg + result['files']['tree_stats']
Ejemplo n.º 8
0
def hmmer_to_markers(input, temp_dir):
    """Assign each fragment to its best-scoring marker gene and orientation.

    Every fragment and its reversed counterpart (named ``<name>_rev``) is
    written to ``<temp_dir>/frags.fas`` and searched against the profile of
    each gene in the active gene set.  For each fragment the highest score
    decides both the gene and the orientation.  One
    ``<temp_dir>/<gene>.frags.fas.fixed`` file is written per gene.

    Args:
        input: path of the fragment file.
        temp_dir: directory for intermediate files.

    Returns:
        dict mapping gene name to a dict of ``{fragment name: sequence}``;
        fragments without any hit are left out.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = dict([(name + '_rev', reverse_sequence(seq))
                    for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best [score, gene, direction] seen so far for each fragment.
    frag_scores = dict([(name, [-10000, 'NA', 'NA'])
                        for name in fragments.keys()])
    gene_set = marker_genes
    align_name = 'sate'
    if options().genes == 'cogs':
        gene_set = cog_genes
        align_name = 'pasta'
    for gene in gene_set:
        # Run the HMMER search.  The profile name needs '%s': the previous
        # '%.profile' was an invalid format specifier and raised ValueError.
        search_output = temp_dir + "/%s.out" % gene
        hmmer_search(
            frag_file,
            os.path.join(
                options().__getattribute__('reference').path,
                'refpkg/%s.refpkg/%s.profile' % (gene, align_name)),
            search_output)
        results = read_hmmsearch_results(search_output)

        # Select the best direction for each fragment.
        for name, value in results.items():
            bitscore = value[1]
            direction = 'forward'
            true_name = name
            # Only a trailing '_rev' on a name that is not itself an input
            # fragment marks a reversed copy; a '_rev' elsewhere in the name
            # must not be stripped.
            if name not in frag_scores and name.endswith('_rev'):
                true_name = name[:-len('_rev')]
                direction = 'reverse'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Bin the fragments by their winning gene.
    genes = {}
    for name, val in frag_scores.items():
        if val[1] not in genes:
            genes[val[1]] = {}
        if val[2] == 'forward':
            genes[val[1]][name] = fragments[name]
        else:
            genes[val[1]][name] = reverse_sequence(fragments[name])
    # Fragments that never scored were collected under "NA"; drop them.
    genes.pop("NA", None)
    for gene, seq in genes.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(seq, gene_file + ".fixed")
    return genes
Ejemplo n.º 9
0
 def __init__(self):
     """Initialise the algorithm from the ``exhaustive`` option group.

     The hmmsearch E-value limit and the filters flag are hard-coded here;
     the decomposition strategy and the minimum subset size come from the
     options.
     """
     AbstractAlgorithm.__init__(self)
     self.place_nomatch_fragments = False
     # Hardcoded E-Lim for hmmsearch.  TODO: what to do with this
     self.elim = 99999999
     self.filters = False
     exhaustive_opts = options().exhaustive
     self.strategy = exhaustive_opts.strategy
     self.minsubsetsize = int(exhaustive_opts.minsubsetsize)
     # Temp fix for now.
     self.molecule = self.options.molecule
Ejemplo n.º 10
0
def hmmer_to_markers(input, temp_dir):
    """Bin fragments to marker genes by their best hmmsearch score.

    The fragments and their reversed copies (suffix ``_rev``) are written to
    ``<temp_dir>/frags.fas`` and searched against every gene of the active
    gene set.  Each fragment goes to the gene and orientation with the
    highest score, and a ``<temp_dir>/<gene>.frags.fas.fixed`` file is
    written for every gene that received fragments.

    Args:
        input: path of the fragment file.
        temp_dir: directory for intermediate files.

    Returns:
        dict of ``{gene: {fragment name: sequence}}`` without unscored
        fragments.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = dict([(name + "_rev", reverse_sequence(seq)) for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best [score, gene, direction] per fragment; "NA" means no hit so far.
    frag_scores = dict([(name, [-10000, "NA", "NA"]) for name in fragments.keys()])
    gene_set = marker_genes
    align_name = "sate"
    if options().genes == "cogs":
        gene_set = cog_genes
        align_name = "pasta"
    suffix = "_rev"
    for gene in gene_set:
        # Run the HMMER search.  "%s.profile" replaces the invalid
        # "%.profile" specifier, which raised ValueError when formatted.
        profile = os.path.join(
            options().__getattribute__("reference").path, "refpkg/%s.refpkg/%s.profile" % (gene, align_name)
        )
        search_output = temp_dir + "/%s.out" % gene
        hmmer_search(frag_file, profile, search_output)
        results = read_hmmsearch_results(search_output)

        # Select the best direction for each fragment.
        for name in results.keys():
            bitscore = results[name][1]
            direction = "forward"
            true_name = name
            # A hit belongs to a reversed copy only if its name is not an
            # input fragment and ends with the suffix; only that trailing
            # suffix is removed.
            if name not in frag_scores and name.endswith(suffix):
                true_name = name[: -len(suffix)]
                direction = "reverse"
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Bin the fragments by their winning gene.
    genes = {}
    for name in frag_scores.keys():
        best_gene = frag_scores[name][1]
        if best_gene not in genes:
            genes[best_gene] = {}
        if frag_scores[name][2] == "forward":
            genes[best_gene][name] = fragments[name]
        else:
            genes[best_gene][name] = reverse_sequence(fragments[name])
    # Unscored fragments ended up under "NA"; they are not returned.
    genes.pop("NA", None)
    for gene in genes.keys():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(genes[gene], gene_file + ".fixed")
    return genes
Ejemplo n.º 11
0
    def testConfigFile(self):
        """Check that command-line and config-file options are merged."""
        # Reset the cached options so this test is independent of the others.
        config._options_singelton = None
        # Disable the main config path for this test.
        config.main_config_path = self.fp_config

        config_path = get_data_path("configs/test.config")
        sys.argv = [sys.argv[0]] + [
            "-A", "2",
            "-c", config_path,
            "--outdir", "dir_form_commandline",
        ]

        parsed = options()

        assert parsed.alignment_size == 2, \
            "Commandline option -A not read properly"

        config_file = parsed.config_file
        assert isinstance(config_file, filetypes) and \
            config_file.name.endswith("data/configs/test.config"), \
            "Commandline option -c not read properly"

        pplacer = parsed.pplacer
        assert (pplacer is not None and pplacer.path == "pplacer"), \
            "config file options not read properly"

        assert parsed.placement_size == 10, \
            "Config file option placementSize not read properly"

        outdir = parsed.outdir
        assert outdir.endswith("dir_form_commandline"), \
            "Config file value outdir is not properly overwritten:%s " % \
            outdir

        assert parsed.tempdir is not None, \
            "Default value not properly set for tempfile attribute"
Ejemplo n.º 12
0
 def testCpuCount(self):
     """Check that the ``-x`` command-line option sets ``options().cpu``."""
     # Reset the cached options so this test is independent of the others.
     config._options_singelton = None
     back = config.main_config_path
     # Disable the main config path for this test.
     config.main_config_path = os.path.expanduser("~/.sepp/main.config.notexistentfile")
     try:
         sys.argv = [sys.argv[0], "-x", "7"]

         assert options().cpu == 7, "Commandline option -x not read properly"

         # Parenthesised form prints the same on Python 2 and parses on Python 3.
         print(options())
     finally:
         # Restore the path even when the assertion fails, so a failure
         # here cannot leak into later tests.
         config.main_config_path = back
Ejemplo n.º 13
0
def blast_fragments(input, output):
    '''Run the configured BLAST executable on ``input`` and write to ``output``.

    The database is ``blast/<genes>/alignment.fasta.db`` under the reference
    path.  The command is executed through the shell; nothing is returned
    and the exit status is not inspected.'''
    opts = options()
    executable = opts.blast.path
    database = os.path.join(
        opts.reference.path,
        "blast/%s/alignment.fasta.db" % opts.genes)
    template = ('%s -db %s -outfmt 6 -query %s -out %s -num_threads %d '
                '-max_target_seqs 1 ')
    os.system(template % (executable, database, input, output, opts.cpu))
Ejemplo n.º 14
0
def blast_fragments(input, output):
    """Search the fragments in ``input`` against the BLAST marker database.

    Builds one shell command from the configured BLAST executable, the
    ``blast/<genes>/alignment.fasta.db`` database under the reference path,
    and the configured thread count, then runs it.  Results go to
    ``output``; the function itself returns nothing.
    """
    settings = options()
    blast_executable = settings.blast.path
    database_path = os.path.join(settings.reference.path, "blast/%s/alignment.fasta.db" % settings.genes)
    command_parts = (blast_executable, database_path, input, output, settings.cpu)
    command = "%s -db %s -outfmt 6 -query %s -out %s -num_threads %d -max_target_seqs 1 " % command_parts
    os.system(command)
Ejemplo n.º 15
0
 def __init__(self, **kwargs):
     """Set up the job and copy the relevant settings from the global options.

     Raises:
         Exception: when ``options().hmmsearch.filters`` is neither "true"
             nor "false" (case-insensitive).
     """
     self.job_type = 'jsonmerger'
     ExternalSeppJob.__init__(self, self.job_type, **kwargs)

     self.out_file = None
     self.distribution = False
     self.taxonomy = None
     self.mapping = None
     self.threshold = None
     self.classification_file = None

     opts = options()
     self.elim = float(opts.hmmsearch.elim)

     # Map the textual flag to a boolean; anything else becomes None and is
     # rejected just below.
     filters_flag = opts.hmmsearch.filters.upper()
     self.filters = {"TRUE": True, "FALSE": False}.get(filters_flag)
     if self.filters is None:
         raise Exception(
             "Expecting true/false for options().hmmsearch.filters")

     exhaustive_opts = opts.exhaustive
     self.strategy = exhaustive_opts.strategy
     self.minsubsetsize = int(exhaustive_opts.minsubsetsize)
     self.alignment_threshold = float(opts.alignment_threshold)
     self.molecule = opts.molecule
     self.placer = exhaustive_opts.__dict__['placer'].lower()
     self.cutoff = 0
Ejemplo n.º 16
0
    def run(self):
        """Execute the algorithm from option checking to final output.

        When the checkpoint manager is recovering, the root problem is
        restored from the checkpoint; otherwise options are checked and the
        subproblems and jobs are built.  The jobs are then connected, the
        first-level jobs enqueued, and the job pool awaited.  If any job
        failed the process exits with status 1.  Otherwise the results are
        merged and written, and timing information is logged.
        """
        checkpoint_manager = options().checkpoint
        assert isinstance(checkpoint_manager, CheckPointManager)

        t = time.time()

        if checkpoint_manager.is_recovering:
            checkpoint_manager.restore_checkpoint()
            self.root_problem = \
                checkpoint_manager.checkpoint_state.root_problem
            self.check_outputprefix()
        else:
            # Check input arguments.
            self.check_options()

            # Build the problem structure.
            self.root_problem = self.build_subproblems()

            # Build jobs.
            self.build_jobs()

        # Connect jobs into a DAG.
        self.connect_jobs()

        # Queue up first level jobs (i.e. those with no dependency).
        # Once these run, they should automatically enqueue the rest of the
        # DAG through joins and callbacks.
        self.enqueue_firstlevel_job()

        # Start the checkpointing (has any effect only in checkpointing mode).
        checkpoint_manager.start_checkpointing(self.root_problem)

        # Wait for all jobs to finish.
        if (not JobPool().wait_for_all_jobs()):
            # No exception is being handled here, so log with error() rather
            # than exception(), which would append an empty traceback.
            _LOG.error(
                "There have been errors in executed jobs. Terminating.")
            sys.exit(1)

        # Terminate the job pool and release memory.
        JobPool().terminate()

        # Pause checkpointing.
        checkpoint_manager.pause_checkpointing()
        # checkpoint_manager.force_checkpoint()

        # Merge results into final outputs.
        self.merge_results()

        # Output final results.
        self.output_results()

        # Stop checkpointing.
        checkpoint_manager.stop_checkpointing()

        _LOG.info("Current execution Finished in %d seconds"
                  % (time.time() - t))
        _LOG.info(
            "All checkpointed executions Finished in %d cumulative time" %
            (checkpoint_manager.get_total_time()))
Ejemplo n.º 17
0
 def __init__(self):
     """Create an algorithm with no root problem and no results yet.

     The global options object is cached on the instance as
     ``self.options`` for ease of access.
     """
     self.root_problem, self.results = None, None
     self.options = options()
     self.outchecked = False
Ejemplo n.º 18
0
 def start_checkpointing(self, root_problem):
     """Record the root problem in the checkpoint state and save a checkpoint.

     Does nothing unless ``self.is_checkpointing`` is true.  The cumulative
     time is initialised to 0 when it has not been set yet.
     """
     if not self.is_checkpointing:
         return

     _LOG.info("Checkpoint every %d seconds" % options().checkpoint_interval)
     state = self.checkpoint_state
     state.root_problem = root_problem
     state.temp_root = get_root_temp_dir()
     if state.cumulative_time is None:
         state.cumulative_time = 0
     save_checkpoint(self)
Ejemplo n.º 19
0
def bin_blast_results(input):
    """Group BLAST query names by the marker gene of their hit.

    ``input`` is a tab-separated BLAST result file; column 0 is the query
    name and column 1 the subject name.  The subject is translated to a
    gene through the ``blast/<genes>/seq2marker.tab`` mapping under the
    reference path.

    Returns:
        dict mapping gene to the list of query names, in file order.
    """
    opts = options()
    mapping_path = os.path.join(
        opts.reference.path,
        'blast/%s/seq2marker.tab' % opts.genes)
    gene_mapping = read_mapping(mapping_path)

    genes = {}
    with open(input) as blast_hits:
        for hit in blast_hits:
            columns = hit.split('\t')
            marker = gene_mapping[columns[1]][1]
            genes.setdefault(marker, []).append(columns[0])
    return genes
Ejemplo n.º 20
0
 def __init__(self):
     """Create an algorithm with no root problem and no results yet."""
     self.root_problem, self.results = None, None
     # Cached for ease of access.
     self.options = options()
Ejemplo n.º 21
0
def bin_to_markers(input,temp_dir):
    """Bin the fragments to marker genes and fix their orientation.

    When ``options().gene`` is unset the fragments are BLASTed against the
    marker dataset and binned by their hit; otherwise all of them go to
    that single gene.  For each gene the fragments and their reversed
    copies (suffix ``_rev``) are searched with hmmsearch against the gene's
    ``sate.profile``, and a fragment is replaced by its reversed copy when
    that copy scores higher.  The result is written per gene to
    ``<temp_dir>/<gene>.frags.fas.fixed``.

    Args:
        input: path of the fragment file.
        temp_dir: directory for intermediate files.

    Returns:
        dict of ``{gene: {fragment name: sequence}}``.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    if options().gene is None:
        # First blast sequences against all markers.
        blast_results = temp_dir + "/blast.out"
        # Parenthesised so the line is valid on both Python 2 and Python 3.
        print("Blasting fragments against marker dataset\n")
        blast_fragments(input, blast_results)

        # Next bin the blast hits to the best gene.
        gene_binning = bin_blast_results(blast_results)
    else:
        gene_binning = {options().gene: list(fragments.keys())}

    # Now figure out direction of fragments.
    binned_fragments = dict([(gene, dict([(seq_name, fragments[seq_name]) for seq_name in gene_binning[gene]])) for gene in gene_binning])

    for (gene, frags) in binned_fragments.items():
        # Add reverse complement sequence.
        frags_rev = dict([(name + '_rev', reverse_sequence(seq)) for (name, seq) in frags.items()])
        gene_frags = MutableAlignment()
        gene_frags.set_alignment(frags)
        gene_frags.set_alignment(frags_rev)
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_frags, gene_file)

        # Now run HMMER search.  os.path.join also works when the reference
        # path has no trailing separator.
        profile = os.path.join(options().__getattribute__('reference').path,
                               'refpkg/%s.refpkg/sate.profile' % gene)
        search_output = temp_dir + "/%s.out" % gene
        hmmer_search(gene_file, profile, search_output)
        results = read_hmmsearch_results(search_output)

        # Now select best direction for each frag; a missing hit counts as
        # a score of -10000.
        for key in frags:
            forward_score = -10000
            backward_score = -10000
            if key in results:
                forward_score = results[key][1]
            if key + "_rev" in results:
                backward_score = results[key + "_rev"][1]
            if backward_score > forward_score:
                frags[key] = gene_frags[key + "_rev"]

        # Now write to file.
        _write_fasta(frags, gene_file + ".fixed")
        binned_fragments[gene] = frags
    return binned_fragments
Ejemplo n.º 22
0
    def testCpuCount(self):
        """Check that the ``-x`` command-line option sets ``options().cpu``.

        The JobPool is terminated and re-initialised around the assertion,
        and the options are re-parsed at the end, so the statement order
        matters here.
        """
        # Just to make different test cases independent of each other.
        config._options_singelton = None
        # Disable main config path for this test
        config.main_config_path = self.fp_config
        # Re-create the pool with the CPU count that is passed via -x below.
        JobPool().terminate()
        JobPool().__init__(7)
        sys.argv = [sys.argv[0], "-x", "7"]

        assert options().cpu == 7, "Commandline option -x not read properly"

        # clean up after test:
        # 1) the JobPool CPU counts needs to be reset to the default
        # 2) the command line arguments must be restored
        # NOTE(review): this clean-up is skipped when the assertion above
        # fails -- confirm whether that is acceptable.
        JobPool().terminate()
        JobPool().__init__(cpu_count())
        sys.argv = [sys.argv[0], "-x", str(cpu_count())]
        # Drop the cached options and parse again with the restored argv.
        config._options_singelton = None
        options()
Ejemplo n.º 23
0
 def figureout_fragment_subset(self):
     ''' Figure out which fragment should go to which subproblem.

     The hmmsearch scores of every fragment are collected per alignment
     subproblem and converted to relative weights.  The best-weighted
     subproblems are then selected for the fragment, which is stored in
     each of them under the name ``<fragment>_<subproblem label>_<weight>``.
     Fragments without any score only trigger a warning.  The work is done
     at most once per root problem (guarded by an annotation).
     '''
     # We need to keep and check the following flag because of
     # checkpointing scenarios (join already done before!)
     if "fragments.distribution.done" in self.root_problem.annotations:
         return
     bitscores = dict([(name, []) for name in list(self.root_problem.fragments.keys())])
     for fragment_chunk_problem in self.root_problem.iter_leaves():
         align_problem = fragment_chunk_problem.get_parent()
         assert isinstance(align_problem, SeppProblem)
         # For each subproblem start with an empty set of fragments, and add
         # to them as we encounter new best hits for that subproblem.
         if align_problem.fragments is None:
             align_problem.fragments = MutableAlignment()
         search_res = fragment_chunk_problem.get_job_result_by_name("hmmsearch")
         for key in list(search_res.keys()):
             # Keep a list of all hits, and their bit scores.
             bitscores[key].append((search_res[key][1], align_problem))

     for frag, tuplelist in bitscores.items():
         # TODO: what to do with those that are not? For now, only output warning message
         # TODO:  Need to double check and fix the math
         # Warn only for fragments that really have no score; the warning
         # used to be emitted for every fragment.
         if len(tuplelist) == 0:
             _LOG.warning("Fragment %s is not scored against any subset" % str(frag))
             continue
         # Convert bit scores to weights scaled to 1000000; the exponent is
         # capped at 1022 so that math.pow cannot overflow.
         denum = sum(math.pow(2, min(x[0], 1022)) for x in tuplelist)
         tuplelist = [((math.pow(2, min(x[0], 1022)) / denum * 1000000), x[1]) for x in tuplelist]
         # Sort subsets by their weight, highest first.
         tuplelist.sort(reverse=True, key=lambda x: x[0])
         # Find enough subsets to reach the threshold: the reduce walks the
         # sorted weights, accumulating them until the running total reaches
         # the alignment threshold, and yields the index at which to cut the
         # list.  At least one subset is always kept.
         selected = tuplelist[0:max(1,
             reduce(lambda x, y: (x[0], None) if x[1] is None else
                                 (y[0], x[1] + y[1]) if x[1] < int(1000000 * self.alignment_threshold) else
                                 (y[0], None),
                    enumerate([x[0] for x in tuplelist]))[0])]

         # Renormalize the selected list to add up to 1.
         renorm = 0
         for (prob, align_problem) in selected:
             renorm = renorm + prob / 1000000
         renorm = 1 / renorm

         _LOG.debug("Fragment %s assigned to %d subsets" % (frag, len(selected)))
         # Rename the fragment and assign it to the respective subsets.
         for (prob, align_problem) in selected:
             postfix = prob * renorm if options().exhaustive.weight_placement_by_alignment.lower() == "true" else 1000000
             frag_rename = "%s_%s_%d" % (frag, align_problem.label, postfix)
             align_problem.fragments[frag_rename] = self.root_problem.fragments[frag]

     self.root_problem.annotations["fragments.distribution.done"] = 1
Ejemplo n.º 24
0
    def testConfigFile(self):
        """Check that command-line and config-file options are merged."""
        # Reset the cached options so this test is independent of the others.
        config._options_singelton = None
        back = config.main_config_path
        # Disable the main config path for this test.
        config.main_config_path = os.path.expanduser("~/.sepp/main.config.notexistentfile")
        try:
            sys.argv = [sys.argv[0], "-A", "2", "-c", "data/configs/test.config", "--outdir", "dir_form_commandline"]

            assert options().alignment_size == 2, "Commandline option -A not read properly"

            # NOTE(review): `file` is the Python 2 built-in file type; this
            # check needs a replacement before the test can run on Python 3.
            assert isinstance(options().config_file, file) and options().config_file.name == "data/configs/test.config", "Commandline option -c not read properly"

            assert (options().pplacer is not None
                    and options().pplacer.path == "pplacer"), "config file options not read properly"

            assert options().placement_size == 10, "Config file option placementSize not read properly"

            assert options().outdir.endswith("dir_form_commandline"), "Config file value outdir is not properly overwritten:%s " % options().outdir

            assert options().tempdir is not None, "Default value not properly set for tempfile attribute"

            # Parenthesised form prints the same on Python 2 and parses on Python 3.
            print(options())
        finally:
            # Restore the path even when an assertion fails, so a failure
            # here cannot leak into later tests.
            config.main_config_path = back
Ejemplo n.º 25
0
 def check_options(self, supply=[]):
     '''
     Check that the required input options have been provided.

     A label is added to ``supply`` for each of the tree file, alignment
     file and fragment file option that is ``None``.  Callers (typically
     overriding methods) may pass labels of further missing options in
     ``supply``.

     Raises ValueError listing all missing items when the resulting list is
     not empty; otherwise calls ``self.check_outputprefix()``.
     '''
     opts = options()
     expected = (
         (opts.tree_file, "tree file"),
         (opts.alignment_file, "alignment file"),
         (opts.fragment_file, "fragment file"),
     )
     # Concatenation builds a new list, so the shared default is never mutated.
     supply = supply + [label for (value, label) in expected if value is None]
     if supply:
         raise ValueError(
             "Failed to supply: %s\nRun with -h option to see a list of options."
             % " , ".join(supply))
     self.check_outputprefix()
Ejemplo n.º 26
0
def blast_to_markers(input, temp_dir):
    """Bin the fragments to marker genes and choose their best orientation.

    With ``options().gene`` set, every fragment goes to that gene.
    Otherwise the fragments are binned from BLAST results: the file named
    by ``options().blast_file`` when given, else a fresh BLAST run written
    to ``<temp_dir>/blast.out``.  For each gene the fragments and their
    reversed copies (suffix ``_rev``) are searched with hmmsearch, and a
    fragment is replaced by its reversed copy when that copy scores
    higher.  The result is written to ``<temp_dir>/<gene>.frags.fas.fixed``.

    Returns:
        dict of ``{gene: {fragment name: sequence}}``.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    single_gene = options().gene
    if single_gene is not None:
        gene_binning = {single_gene: list(fragments.keys())}
    else:
        # Use the supplied BLAST results, or blast against all markers.
        blast_results = options().blast_file
        if blast_results is None:
            blast_results = temp_dir + "/blast.out"
            print("Blasting fragments against marker dataset\n")
            blast_fragments(input, blast_results)
        # Bin the blast hits to the best gene.
        gene_binning = bin_blast_results(blast_results)

    binned_fragments = {
        gene: {seq_name: fragments[seq_name] for seq_name in gene_binning[gene]}
        for gene in gene_binning
    }

    print("Finding best orientation of reads\n")
    align_name = 'pasta' if options().genes == 'cogs' else 'sate'
    missing_score = -10000

    for (gene, frags) in binned_fragments.items():
        # Write the fragments together with their reversed copies.
        reversed_frags = {name + '_rev': reverse_sequence(seq)
                          for (name, seq) in frags.items()}
        gene_frags = MutableAlignment()
        gene_frags.set_alignment(frags)
        gene_frags.set_alignment(reversed_frags)
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_frags, gene_file)

        # Search both orientations against the gene's HMM.
        hmm_path = os.path.join(options().reference.path,
                                'refpkg/%s.refpkg/%s.hmm' % (gene, align_name))
        search_output = temp_dir + "/%s.out" % gene
        hmmer_search(gene_file, hmm_path, search_output)
        results = read_hmmsearch_results(search_output)

        # Keep the reversed copy wherever it scores strictly higher.
        for key in frags:
            reversed_key = key + "_rev"
            forward_score = results[key][1] if key in results else missing_score
            backward_score = (results[reversed_key][1]
                              if reversed_key in results else missing_score)
            if backward_score > forward_score:
                frags[key] = gene_frags[reversed_key]

        _write_fasta(frags, gene_file + ".fixed")
        binned_fragments[gene] = frags
    return binned_fragments
Ejemplo n.º 27
0
 def figureout_fragment_subset(self):
     ''' Figure out which fragment should go to which subproblem.

     The hmmsearch scores of every fragment are collected per alignment
     subproblem and converted to relative weights.  The best-weighted
     subproblems are then selected for the fragment, which is stored in
     each of them under the name ``<fragment>_<subproblem label>_<weight>``.
     Fragments without any score only trigger a warning.  The work is done
     at most once per root problem (guarded by an annotation).
     '''
     # We need to keep and check the following flag because of
     # checkpointing scenarios (join already done before!)
     # `in` replaces dict.has_key(), which no longer exists on Python 3.
     if "fragments.distribution.done" in self.root_problem.annotations:
         return
     bitscores = dict([(name, []) for name in self.root_problem.fragments.keys()])
     for fragment_chunk_problem in self.root_problem.iter_leaves():
         align_problem = fragment_chunk_problem.get_parent()
         assert isinstance(align_problem, SeppProblem)
         # For each subproblem start with an empty set of fragments, and add
         # to them as we encounter new best hits for that subproblem.
         if align_problem.fragments is None:
             align_problem.fragments = MutableAlignment()
         search_res = fragment_chunk_problem.get_job_result_by_name("hmmsearch")
         for key in search_res.keys():
             # Keep a list of all hits, and their bit scores.
             bitscores[key].append((search_res[key][1], align_problem))

     # items() works on both Python 2 and Python 3 (iteritems() does not).
     for frag, tuplelist in bitscores.items():
         # TODO: what to do with those that are not? For now, only output warning message
         # TODO:  Need to double check and fix the math
         if len(tuplelist) == 0:
             _LOG.warning("Fragment %s is not scored against any subset" % str(frag))
             continue
         # Convert bit scores to weights scaled to 1000000; the exponent is
         # capped at 1022 so that math.pow cannot overflow.
         denum = sum(math.pow(2, min(x[0], 1022)) for x in tuplelist)
         tuplelist = [((math.pow(2, min(x[0], 1022)) / denum * 1000000), x[1]) for x in tuplelist]
         # Sort subsets by their weight only, highest first; without the key
         # a tie would fall through to comparing the problem objects.
         tuplelist.sort(reverse=True, key=lambda x: x[0])
         # Find enough subsets to reach the threshold: the reduce walks the
         # sorted weights, accumulating them until the running total reaches
         # the alignment threshold, and yields the index at which to cut the
         # list.  At least one subset is always kept.
         selected = tuplelist[0:max(1,
             reduce(lambda x, y: (x[0], None) if x[1] is None else
                                 (y[0], x[1] + y[1]) if x[1] < int(1000000 * self.alignment_threshold) else
                                 (y[0], None),
                    enumerate([x[0] for x in tuplelist]))[0])]

         # Renormalize the selected list to add up to 1.
         renorm = 0
         for (prob, align_problem) in selected:
             renorm = renorm + prob / 1000000
         renorm = 1 / renorm

         _LOG.debug("Fragment %s assigned to %d subsets" % (frag, len(selected)))
         # Rename the fragment and assign it to the respective subsets.
         for (prob, align_problem) in selected:
             postfix = prob * renorm if options().exhaustive.weight_placement_by_alignment.lower() == "true" else 1000000
             frag_rename = "%s_%s_%d" % (frag, align_problem.label, postfix)
             align_problem.fragments[frag_rename] = self.root_problem.fragments[frag]

     self.root_problem.annotations["fragments.distribution.done"] = 1
Ejemplo n.º 28
0
    def testConfigFileMissingFile(self):
        """Check that file options from the command line and config file are read."""
        # Reset the cached options so this test is independent of the others.
        config._options_singelton = None
        back = config.main_config_path
        # Disable the main config path for this test.
        config.main_config_path = os.path.expanduser("~/.sepp/main.config.notexistentfile")
        try:
            sys.argv = [sys.argv[0], "-c", "data/configs/test2.config", "-f", "data/simulated/test.fas"]

            # NOTE(review): `file` is the Python 2 built-in file type; these
            # checks need a replacement before the test can run on Python 3.
            assert isinstance(options().config_file, file) and options().config_file.name == "data/configs/test2.config", "Commandline option -c not read properly"

            assert isinstance(options().alignment_file, file) and options().alignment_file.name == "data/simulated/test.small.fas", "Config file option alignment not read properly"

            assert isinstance(options().fragment_file, file) and options().fragment_file.name == "data/simulated/test.fas", "Command-line option -f alignment not read properly"

            # Parenthesised form prints the same on Python 2 and parses on Python 3.
            print(options())
        finally:
            # Restore the path even when an assertion fails, so a failure
            # here cannot leak into later tests.
            config.main_config_path = back
Ejemplo n.º 29
0
def bin_blast_results(input):
  """Group BLAST query ids by the marker gene their hit maps to.

  Each line of the file at ``input`` is split on tabs; column 0 is taken as
  the query id and column 1 as the subject id.  The subject id is looked up
  in the seq2marker mapping and element 1 of that entry is used as the gene.

  Returns a dict mapping gene -> list of query ids in file order.
  """
  mapping_file = os.path.join(
      options().__getattribute__('reference').path,
      'blast/%s/seq2marker.tab' % options().genes)
  gene_mapping = read_mapping(mapping_file)

  genes = {}
  with open(input) as blast_hits:
    for hit in blast_hits:
      columns = hit.split('\t')
      marker = gene_mapping[columns[1]][1]
      # Create the bucket on first sight of a gene, then add the read.
      genes.setdefault(marker, []).append(columns[0])
  return genes
Ejemplo n.º 30
0
def bin_blast_results(input):
    """Keep, for every query read, the BLAST hit with the largest coverage.

    Lines of the file at ``input`` are split on tabs.  Columns 0 and 1 are
    the query and subject ids; columns 6-11 are parsed as the integers
    qstart, qend, qlen, sstart, send and slen.  Query coverage is
    ``abs(qend - qstart) + 1``.  A hit is considered only when its coverage
    is at least ``options().blast_threshold``, and it replaces a previously
    stored hit for the same read only when its coverage is strictly larger.

    Returns a dict mapping query id -> dict with the keys ``sseqid``,
    ``gene``, ``qstart``, ``qend``, ``qlen``, ``sstart``, ``send``,
    ``slen`` and ``qcov``.
    """
    global refpkg

    # Subject sequence id -> marker information (element 1 is the gene).
    gene_mapping = read_mapping(refpkg["blast"]["seq-to-marker-map"])

    best_hits = {}

    with open(input) as blast_output:
        # Original note: BLAST output contains reads sorted in ascending
        # order by bitscore.
        for record in blast_output:
            columns = record.split('\t')

            qseqid = columns[0]
            sseqid = columns[1]
            # Columns 2-5 (pident, length, mismatch, gapopen) and 12-13
            # (evalue, bitscore) are not used.
            (qstart, qend, qlen,
             sstart, send, slen) = [int(columns[i]) for i in range(6, 12)]
            qcov = abs(qend - qstart) + 1

            # Hits below the coverage threshold are ignored entirely.
            if not qcov >= options().blast_threshold:
                continue

            entry = best_hits.get(qseqid)
            if entry is None:
                entry = best_hits[qseqid] = {}
            elif not entry["qcov"] < qcov:
                # The stored hit already covers at least as much.
                continue

            entry.update({
                "sseqid": sseqid,
                "gene": gene_mapping[sseqid][1],
                "qstart": qstart,
                "qend": qend,
                "qlen": qlen,
                "sstart": sstart,
                "send": send,
                "slen": slen,
                "qcov": qcov,
            })

    return best_hits
Ejemplo n.º 31
0
    def run(self):
        """Run the algorithm end to end, resuming from a checkpoint if asked.

        When the checkpoint manager is recovering, the root problem is
        restored from the checkpoint; otherwise options are checked and the
        subproblems and jobs are built from scratch.  The jobs are then
        connected, the first-level jobs are enqueued, and the method waits
        for the job pool to drain before merging and writing the results.

        Exits the process with status 1 if the job pool reports errors.
        """
        checkpoint_manager = options().checkpoint
        assert isinstance(checkpoint_manager, CheckPointManager)

        t = time.time()

        if checkpoint_manager.is_recovering:
            checkpoint_manager.restore_checkpoint()
            self.root_problem = \
                checkpoint_manager.checkpoint_state.root_problem
            self.check_outputprefix()
        else:
            # Check input arguments.
            self.check_options()
            # Build the problem structure.
            self.root_problem = self.build_subproblems()
            # Build jobs.
            self.build_jobs()
        # Connect jobs into a DAG.
        self.connect_jobs()
        # Queue up first level jobs (i.e. those with no dependency).
        # Once these run, they should automatically enqueue the rest of the
        # DAG through joins and callbacks.
        self.enqueue_firstlevel_job()
        # Start the checkpointing (has any effect only in checkpointing
        # mode).
        checkpoint_manager.start_checkpointing(self.root_problem)
        # Wait for all jobs to finish.
        if not JobPool().wait_for_all_jobs():
            # No exception is being handled here, so use error() rather
            # than exception(); the latter would append an empty traceback.
            _LOG.error(
                "There have been errors in executed jobs. Terminating.")
            sys.exit(1)
        # Terminate the job pool and release memory.
        JobPool().terminate()
        # Pause checkpointing while results are merged and written.
        checkpoint_manager.pause_checkpointing()
        # checkpoint_manager.force_checkpoint()
        # Merge results into final outputs.
        self.merge_results()
        # Output final results.
        self.output_results()
        # Stop checkpointing.
        checkpoint_manager.stop_checkpointing()

        _LOG.info("Current execution Finished in %d seconds" %
                  (time.time() - t))
        _LOG.info(
            "All checkpointed executions Finished in %d cumulative time" %
            (checkpoint_manager.get_total_time()))
Ejemplo n.º 32
0
 def check_options(self, supply=None):
     '''
     Check the input values stored in config.options to make sure every
     necessary argument is provided.

     By default a tree file, an alignment file and a fragment file are
     required.  Subclasses can pass the names of further missing inputs in
     ``supply``; they are reported together with the ones found here.
     Override if different requirements apply.

     Raises ValueError listing every missing input.  When nothing is
     missing, the output prefix is checked via check_outputprefix().
     '''
     # Work on a copy so the caller's list is never modified.
     missing = list(supply) if supply else []
     if options().tree_file is None:
         missing.append("tree file")
     if options().alignment_file is None:
         missing.append("alignment file")
     if options().fragment_file is None:
         missing.append("fragment file")
     if missing:
         raise ValueError("Failed to supply: %s\nRun with -h option to see a list of options." % " , ".join(missing))

     # NOTE: the former "raxml file" test ran after the raise above and its
     # result was never used, so it had no effect and has been dropped.
     self.check_outputprefix()
Ejemplo n.º 33
0
def blast_to_markers(input, temp_dir):
    """Bin fragments to marker genes and orient each one with HMMER.

    The fragments are read from ``input``.  Unless ``options().gene`` is
    set (in which case every fragment goes to that gene), the fragments are
    binned with bin_blast_results() using either a fresh BLAST run written
    to ``temp_dir`` or the file named by ``options().blast_file``.

    For each gene, the fragments and their reverse-sequence copies (suffix
    ``_rev``) are written to ``<temp_dir>/<gene>.frags.fas`` and searched
    against the gene's HMM.  A fragment is replaced by its ``_rev`` copy
    when that copy scores strictly higher.  The oriented fragments are
    written to ``<temp_dir>/<gene>.frags.fas.fixed``.

    Returns a dict mapping gene -> {sequence name: oriented sequence}.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    if options().gene is None:
        # First blast sequences against all markers
        blast_results = temp_dir + "/blast.out"
        if options().blast_file is None:
            print("Blasting fragments against marker dataset\n")
            blast_fragments(input, blast_results)
        else:
            blast_results = options().blast_file
        # Next bin the blast hits to the best gene
        gene_binning = bin_blast_results(blast_results)
    else:
        gene_binning = {options().gene: fragments.keys()}

    # Now figure out direction of fragments
    binned_fragments = {
        gene: {seq_name: fragments[seq_name] for seq_name in seq_names}
        for (gene, seq_names) in gene_binning.items()
    }
    print("Finding best orientation of reads\n")
    align_name = "pasta" if options().genes == "cogs" else "sate"

    # Score used when hmmsearch reports nothing for a sequence.
    no_hit_score = -10000

    for (gene, frags) in binned_fragments.items():
        # Add reverse complement sequence
        frags_rev = {name + "_rev": reverse_sequence(seq)
                     for (name, seq) in frags.items()}
        gene_frags = MutableAlignment()
        gene_frags.set_alignment(frags)
        gene_frags.set_alignment(frags_rev)
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_frags, gene_file)

        # Now run HMMER search
        search_output = temp_dir + "/%s.out" % gene
        hmmer_search(
            gene_file,
            os.path.join(options().__getattribute__("reference").path,
                         "refpkg/%s.refpkg/%s.hmm" % (gene, align_name)),
            search_output,
        )
        results = read_hmmsearch_results(search_output)

        # Now select best direction for each frag.  Only values of existing
        # keys are replaced, so iterating over frags here is safe.
        for key in frags:
            rev_key = key + "_rev"
            forward_score = (results[key][1] if key in results
                             else no_hit_score)
            backward_score = (results[rev_key][1] if rev_key in results
                              else no_hit_score)
            if backward_score > forward_score:
                frags[key] = gene_frags[rev_key]

        # Now write to file.  frags is the very dict stored in
        # binned_fragments[gene], so the in-place edits are already there.
        _write_fasta(frags, gene_file + ".fixed")
    return binned_fragments
Ejemplo n.º 34
0
 def __init__(self, **kwargs):
     """Create a 'jsonmerger' job and copy its settings from options().

     Raises Exception when options().hmmsearch.filters is neither
     "true" nor "false" (case-insensitive).
     """
     self.job_type = 'jsonmerger'
     ExternalSeppJob.__init__(self, self.job_type, **kwargs)

     # Not known at construction time; left unset here.
     self.out_file = None
     self.taxonomy = None
     self.mapping = None
     self.threshold = None
     self.classification_file = None

     hmmsearch_opts = options().hmmsearch
     self.elim = float(hmmsearch_opts.elim)
     # Translate the textual flag; anything unrecognised maps to None.
     self.filters = {"TRUE": True, "FALSE": False}.get(
         hmmsearch_opts.filters.upper())
     if self.filters is None:
         raise Exception(
             "Expecting true/false for options().hmmsearch.filters")

     exhaustive_opts = options().exhaustive
     self.strategy = exhaustive_opts.strategy
     self.minsubsetsize = int(exhaustive_opts.minsubsetsize)
     self.alignment_threshold = float(options().alignment_threshold)
     # Temp fix for now,
     self.molecule = options().molecule
     self.placer = exhaustive_opts.__dict__['placer'].lower()
Ejemplo n.º 35
0
def save_checkpoint(checkpoint_manager):
    '''
    This is the callback function that is called periodically to save the
    current state of the system.

    Does nothing unless ``checkpoint_manager.is_checkpointing`` is true.
    Otherwise the checkpoint state is pickled (gzip-compressed) to a new
    temp file, a "<path>, <timestamp>" line is appended to the file at
    ``checkpoint_manager.checkpoint_path``, the file named on the previous
    last line (if any) is removed, and a new daemon timer is scheduled to
    call this function again after ``options().checkpoint_interval``.

    The ``saving`` flag and the interpreter recursion limit are restored
    even when writing the checkpoint fails.
    '''
    # Note: this module is not bullet proof in terms of race conditions.
    if not checkpoint_manager.is_checkpointing:
        return

    # checkpoint_manager.lock.acquire()
    checkpoint_manager.saving = True
    try:
        newTmpDest = get_temp_file("dump", "checkpoints")
        _LOG.info("Checkpoint is being updated: %s" % newTmpDest)
        # The last line of the index file names the previous dump.
        with open(checkpoint_manager.checkpoint_path) as index:
            entries = index.readlines()
        oldTmpFile = entries[-1].split(",")[0] if entries else None

        checkpoint_manager.update_time()

        # Pickling may recurse deeply; raise the limit only temporarily.
        currenlimit = sys.getrecursionlimit()
        sys.setrecursionlimit(100000)
        try:
            with gzip.GzipFile(newTmpDest, 'wb') as picklefile:
                pickle.dump(
                    checkpoint_manager.checkpoint_state, picklefile, 2)
        finally:
            sys.setrecursionlimit(currenlimit)

        with open(checkpoint_manager.checkpoint_path, "a") as index:
            index.write("%s, %s\n" % (newTmpDest, datetime.datetime.now()))
        if oldTmpFile is not None:
            os.remove(oldTmpFile)
        _LOG.info("Checkpoint Saved to: %s and linked in %s." %
                  (newTmpDest, checkpoint_manager.checkpoint_path))
    finally:
        # Never leave the flag stuck at True if something above failed.
        checkpoint_manager.saving = False
    # checkpoint_manager.lock.release()

    # Schedule the next checkpoint.
    checkpoint_manager.timer = threading.Timer(
        options().checkpoint_interval,
        save_checkpoint,
        args=[checkpoint_manager])
    checkpoint_manager.timer.daemon = True
    checkpoint_manager.timer.start()
Ejemplo n.º 36
0
def save_checkpoint(checkpoint_manager):
    '''
    This is the callback function that is called periodically to save the
    current state of the system.

    Does nothing unless ``checkpoint_manager.is_checkpointing`` is true.
    Otherwise the checkpoint state is pickled (gzip-compressed) to a new
    temp file, a "<path>, <timestamp>" line is appended to the file at
    ``checkpoint_manager.checkpoint_path``, the file named on the previous
    last line (if any) is removed, and a new daemon timer is scheduled to
    call this function again after ``options().checkpoint_interval``.

    The ``saving`` flag and the interpreter recursion limit are restored
    even when writing the checkpoint fails.
    '''
    # Note: this module is not bullet proof in terms of race conditions.
    if not checkpoint_manager.is_checkpointing:
        return

    # checkpoint_manager.lock.acquire()
    checkpoint_manager.saving = True
    try:
        newTmpDest = get_temp_file("dump", "checkpoints")
        _LOG.info("Checkpoint is being updated: %s" % newTmpDest)
        # The last line of the index file names the previous dump.
        with open(checkpoint_manager.checkpoint_path) as index:
            entries = index.readlines()
        oldTmpFile = entries[-1].split(",")[0] if entries else None

        checkpoint_manager.update_time()

        # Pickling may recurse deeply; raise the limit only temporarily.
        currenlimit = sys.getrecursionlimit()
        sys.setrecursionlimit(100000)
        try:
            with gzip.GzipFile(newTmpDest, 'wb') as picklefile:
                pickle.dump(
                    checkpoint_manager.checkpoint_state, picklefile, 2)
        finally:
            sys.setrecursionlimit(currenlimit)

        with open(checkpoint_manager.checkpoint_path, "a") as index:
            index.write("%s, %s\n" % (newTmpDest, datetime.datetime.now()))
        if oldTmpFile is not None:
            os.remove(oldTmpFile)
        _LOG.info("Checkpoint Saved to: %s and linked in %s." % (
            newTmpDest, checkpoint_manager.checkpoint_path))
    finally:
        # Never leave the flag stuck at True if something above failed.
        checkpoint_manager.saving = False
    # checkpoint_manager.lock.release()

    # Schedule the next checkpoint.
    checkpoint_manager.timer = threading.Timer(
        options().checkpoint_interval, save_checkpoint,
        args=[checkpoint_manager])
    checkpoint_manager.timer.daemon = True
    checkpoint_manager.timer.start()
Ejemplo n.º 37
0
    def testConfigFileMissingFile(self):
        """Check option parsing when the main config file does not exist.

        Asserts that ``-c`` and ``-f`` are read from the command line and
        that the alignment file is picked up from the ``-c`` config file.
        The main config path is overridden only for the duration of the
        test and is restored even when an assertion fails, so the override
        cannot leak into other test cases.
        """
        # Reset the cached options so test cases stay independent.
        config._options_singelton = None
        back = config.main_config_path
        # Disable the main config file for this test.
        config.main_config_path = os.path.expanduser(
            "~/.sepp/main.config.notexistentfile")
        try:
            sys.argv = [
                sys.argv[0], "-c", "data/configs/test2.config", "-f",
                "data/simulated/test.fas"
            ]

            assert isinstance(options().config_file, file) and \
                options().config_file.name == "data/configs/test2.config", \
                "Commandline option -c not read properly"

            assert isinstance(options().alignment_file, file) and \
                options().alignment_file.name == \
                "data/simulated/test.small.fas", \
                "Config file option alignment not read properly"

            assert isinstance(options().fragment_file, file) and \
                options().fragment_file.name == "data/simulated/test.fas", \
                "Command-line option -f alignment not read properly"

            # Single-argument call form prints the same text whether print
            # is a statement or a function.
            print(options())
        finally:
            config.main_config_path = back
Ejemplo n.º 38
0
    def testConfigFileMissingFile(self):
        """Check that -c, -a and -f end up as open files on options().

        Each option must be an instance of ``filetypes`` whose name ends
        with the expected data path.
        """
        # Reset the cached options so test cases stay independent.
        config._options_singelton = None
        # Override the main config path for this test (the original note
        # describes this as disabling the main config).
        config.main_config_path = self.fp_config

        cli_args = [
            "-c", get_data_path("configs/test2.config"),
            "-f", get_data_path("simulated/test.fas"),
            "-a", get_data_path("simulated/test.small.fas"),
        ]
        sys.argv = [sys.argv[0]] + cli_args

        # (option attribute, expected path suffix, failure message)
        expectations = (
            ("config_file", "data/configs/test2.config",
             "Commandline option -c not read properly"),
            ("alignment_file", "data/simulated/test.small.fas",
             "Config file option alignment not read properly"),
            ("fragment_file", "data/simulated/test.fas",
             "Command-line option -f alignment not read properly"),
        )
        for option_name, expected_suffix, failure_message in expectations:
            handle = getattr(options(), option_name)
            assert isinstance(handle, filetypes) and \
                handle.name.endswith(expected_suffix), failure_message
Ejemplo n.º 39
0
    def testConfigFileMissingFile(self):
        """Check that -c, -a and -f end up as open files on options().

        Each option must be an instance of ``filetypes`` whose name ends
        with the expected data path.
        """
        # Reset the cached options so test cases stay independent.
        config._options_singelton = None
        # Override the main config path for this test (the original note
        # describes this as disabling the main config).
        config.main_config_path = self.fp_config

        cli_args = [
            "-c", get_data_path("configs/test2.config"),
            "-f", get_data_path("simulated/test.fas"),
            "-a", get_data_path("simulated/test.small.fas"),
        ]
        sys.argv = [sys.argv[0]] + cli_args

        # (option attribute, expected path suffix, failure message)
        expectations = (
            ("config_file", "data/configs/test2.config",
             "Commandline option -c not read properly"),
            ("alignment_file", "data/simulated/test.small.fas",
             "Config file option alignment not read properly"),
            ("fragment_file", "data/simulated/test.fas",
             "Command-line option -f alignment not read properly"),
        )
        for option_name, expected_suffix, failure_message in expectations:
            handle = getattr(options(), option_name)
            assert isinstance(handle, filetypes) and \
                handle.name.endswith(expected_suffix), failure_message
Ejemplo n.º 40
0
    def generate_backbone(self):
        """Build a backbone alignment and tree from the input sequences.

        Steps, in order:

        * read and degap the sequences from ``self.options.sequence_file``;
        * if ``median_full_length`` is set, set aside as fragments every
          sequence whose length falls outside
          ``median * (1 -/+ backbone_threshold)``; a value of -1 means the
          median is computed from the input lengths;
        * draw a random sample of ``backbone_size`` remaining sequences,
          write it to a temp file and run a PASTA alignment job on it;
        * copy the resulting alignment and tree to the output files and
          store them, opened, in ``options().alignment_file`` and
          ``options().tree_file``;
        * write the remaining sequences plus the fragments to a query file
          and store its path in ``options().fragment_file``.

        If no query sequences remain, the backbone alignment is written as
        the final result and the process exits with status 0.
        """
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if (options().median_full_length is not None):
            if (options().median_full_length == -1):
                seq_lengths = sorted(
                    [len(seq) for seq in list(sequences.values())])
                count = len(seq_lengths)
                mid = count // 2
                if count % 2:
                    # Odd count: the median is the single middle element.
                    options().median_full_length = seq_lengths[mid]
                else:
                    # Even count: average the two middle elements.
                    options().median_full_length = (
                        seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0

            (min_length, max_length) = (
                int(options().median_full_length * (
                    1 - options().backbone_threshold)),
                int(options().median_full_length*(
                    1 + options().backbone_threshold)))
            frag_names = [
                name for name in sequences
                if len(sequences[name]) > max_length or
                len(sequences[name]) < min_length]
            if (len(frag_names) > 0):
                _LOG.info(
                    "Detected %d fragmentary sequences" % len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                # Fragments must not be candidates for the backbone.
                for name in list(fragments.keys()):
                    sequences.pop(name)
        if (options().backbone_size is None):
            options().backbone_size = min(1000, int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        if (options().backbone_size > len(list(sequences.keys()))):
            options().backbone_size = len(list(sequences.keys()))
        # Sample from a sorted list of names, not from dict order.
        sample = sorted(random.sample(
            sorted(list(sequences.keys())), options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
        # Whatever is left in ``sequences`` becomes the query set.
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if (options().molecule == 'amino'):
            # The job is given 'protein' where the options say 'amino'.
            moleculeType = 'protein'
        pastaalignJob.setup(backbone, options().backbone_size,
                            moleculeType, options().cpu)
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()

        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

        options().placement_size = self.options.backbone_size
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s"
            % (options().alignment_file, options().tree_file))
        sequences.set_alignment(fragments)
        if (len(sequences) == 0):
            sequences = MutableAlignment()
            sequences.read_file_object(open(self.options.alignment_file.name))
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align.  Final alignment saved as %s"
                % self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)
Ejemplo n.º 41
0
    def testMainConfigFile(self):
        """Check that pplacer, hmmalign and hmmsearch options are usable.

        Each tool's path is pointed at a file shipped with the code base
        and must then exist on disk.
        """
        # Reset the cached options so test cases stay independent.
        config._options_singelton = None

        sys.argv = [sys.argv[0], "-c", get_data_path("configs/test2.config")]

        # (tool option name, failure message template)
        checks = (
            ("pplacer",
             "main config file options not read properly,"
             "or nonexistent binaries: pplacer = %s"),
            ("hmmalign",
             "main config file options not read properly, or nonexistent "
             "binaries: hmmalign = %s"),
            ("hmmsearch",
             "main config file options not read properly, or nonexistent "
             "binaries: hmmsearch = %s"),
        )
        for tool, failure_template in checks:
            # Set the tool's filepath to a file shipped with the code base.
            getattr(options(), tool).path = get_data_path(
                "../../../tools/bundled/Darwin/%s" % tool)

            assert (getattr(options(), tool) is not None and
                    os.path.exists(getattr(options(), tool).path)), \
                failure_template % getattr(options(), tool).path
Ejemplo n.º 42
0
def build_profile(input, output_directory):
    """Bin the input fragments, run TIPP per marker and write the profile.

    The fragments in ``input`` are binned to marker genes with BLAST or
    HMMER depending on ``options().bin``.  For every gene that received
    fragments, ``run_tipp.py`` is invoked through the shell; when it
    produced a classification file, that file is parsed with
    ``options().placement_threshold``, written out per gene and pooled.
    Finally the pooled classification and the abundance profile are
    written below ``output_directory``; with ``options().dist`` set, the
    distribution is generated as well.

    Returns None.  Returns early, writing nothing, when no fragment could
    be binned.
    """
    global taxon_map, level_map, key_map, levels
    temp_dir = tempfile.mkdtemp(dir=options().__getattribute__('tempdir'))
    if (options().bin == 'blast'):
        binned_fragments = blast_to_markers(input, temp_dir)
    else:
        binned_fragments = hmmer_to_markers(input, temp_dir)

    if binned_fragments:
        print("Finished binning")
    else:
        print("Unable to bin any fragments!\n")
        return

    reference_path = options().__getattribute__('reference').path

    def refpkg_path(gene, filename):
        # Path of ``filename`` inside the reference package of ``gene``.
        return os.path.join(reference_path,
                            'refpkg/%s.refpkg/%s' % (gene, filename))

    # load up taxonomy for 30 marker genes
    taxonomy_gene = 'rpsB' if options().genes == 'markers' else 'COG0012'
    (taxon_map, level_map, key_map) = load_taxonomy(
        refpkg_path(taxonomy_gene, 'all_taxon.taxonomy'))

    # all classifications stored here
    classifications = {}
    classification_files = []
    # Now run TIPP on each fragment
    gene_name = 'pasta' if options().genes == 'cogs' else 'sate'
    for (gene, frags) in binned_fragments.items():
        # Get size of each marker
        with open(refpkg_path(gene, '%s.size' % gene_name), 'r') as f:
            total_taxa = int(f.readline().strip())
        decomp_size = options().alignment_size
        if (decomp_size > total_taxa):
            decomp_size = int(total_taxa / 2)
        # No point in asking for more CPUs than there are fragments.
        cpus = min(options().cpu, len(frags))

        # Optional flags are collected separately and joined with spaces so
        # that consecutive flags can never run together (e.g. "-D-F 100").
        extra_flags = []
        if options().dist is True:
            extra_flags.append('-D')
        if options().max_chunk_size is not None:
            extra_flags.append('-F %d' % options().max_chunk_size)
        if options().cutoff != 0:
            extra_flags.append('-C %f' % options().cutoff)

        # Build the command once so the printed and executed text agree.
        # The placement threshold passed to run_tipp.py is 0; the real
        # threshold is applied below in generate_classification().
        command = (
            'run_tipp.py -c %s --cpu %s -m %s -f %s -t %s -adt %s -a %s -r %s'
            ' -tx %s -txm %s -at %0.2f -pt %0.2f -A %d -P %d -p %s -o %s -d '
            '%s %s') % (
            options().config_file.name, cpus, options().molecule,
            temp_dir + "/%s.frags.fas.fixed" % gene,
            refpkg_path(gene, '%s.taxonomy' % gene_name),
            refpkg_path(gene, '%s.tree' % gene_name),
            refpkg_path(gene, '%s.fasta' % gene_name),
            refpkg_path(gene, '%s.taxonomy.RAxML_info' % gene_name),
            refpkg_path(gene, 'all_taxon.taxonomy'),
            refpkg_path(gene, 'species.mapping'),
            options().alignment_threshold, 0, decomp_size, total_taxa,
            temp_dir + "/temp_file", "tipp_%s" % gene,
            output_directory + "/markers/", ' '.join(extra_flags))
        print('Cmd:\n' + command)
        os.system(command)

        classification_file = (
            output_directory + "/markers/tipp_%s_classification.txt" % gene)
        if not os.path.exists(classification_file):
            # run_tipp.py produced nothing for this gene; skip it.
            continue

        gene_classification = generate_classification(
            classification_file, options().placement_threshold)
        classification_files.append(classification_file)
        # Now write individual classification and also pool classifications
        write_classification(
            gene_classification,
            output_directory + "/markers/tipp_%s.classification" % gene)
        classifications.update(gene_classification)
    remove_unclassified_level(classifications)
    write_classification(classifications,
                         output_directory + "/markers/all.classification")
    write_abundance(classifications, output_directory)

    if (options().dist is True):
        distribution(classification_files, output_directory)
Ejemplo n.º 43
0
    def testConfigFile(self):
        """Check that command-line and config-file options are combined.

        Command-line values (-A, -c, --outdir) must be honoured, values
        only present in the config file (pplacer path, placement size) must
        be read, and tempdir must get a default.  The main config path is
        overridden only for the duration of the test and is restored even
        when an assertion fails, so the override cannot leak into other
        test cases.
        """
        # Reset the cached options so test cases stay independent.
        config._options_singelton = None
        back = config.main_config_path
        # Disable the main config file for this test.
        config.main_config_path = os.path.expanduser(
            "~/.sepp/main.config.notexistentfile")
        try:
            sys.argv = [
                sys.argv[0], "-A", "2", "-c", "data/configs/test.config",
                "--outdir", "dir_form_commandline"
            ]

            assert options().alignment_size == 2, \
                "Commandline option -A not read properly"

            assert isinstance(options().config_file, file) and \
                options().config_file.name == "data/configs/test.config", \
                "Commandline option -c not read properly"

            assert (options().pplacer is not None and
                    options().pplacer.path == "pplacer"), \
                "config file options not read properly"

            assert options().placement_size == 10, \
                "Config file option placementSize not read properly"

            assert options().outdir.endswith("dir_form_commandline"), \
                "Config file value outdir is not properly overwritten:%s " % \
                options().outdir

            assert options().tempdir is not None, \
                "Default value not properly set for tempfile attribute"

            # Single-argument call form prints the same text whether print
            # is a statement or a function.
            print(options())
        finally:
            config.main_config_path = back
Ejemplo n.º 44
0
 def check_options(self, supply=[]):
     """Add "raxml file" to the missing inputs when no info file is set,
     then run the AbstractAlgorithm checks with the resulting list."""
     missing = supply
     if options().info_file is None:
         # Builds a new list; the caller's list is left untouched.
         missing = missing + ["raxml file"]
     AbstractAlgorithm.check_options(self, missing)
Ejemplo n.º 45
0
    def check_options(self):
        """Validate the input combination and derive the backbone settings.

        Two combinations are accepted: tree + alignment + sequence file
        (the sequence file is then used as the fragment file), or a
        sequence file alone (the backbone is then generated from it).  Any
        other combination logs an error and exits with status -1.

        ``backbone_size`` defaults to the number of taxa in the backbone
        alignment and must equal it; ``placement_size`` defaults to
        ``backbone_size``.

        Returns whatever ExhaustiveAlgorithm.check_options returns.
        """
        # Local import: the bare exit() used previously is injected by the
        # site module for interactive use and is not guaranteed to exist.
        import sys

        options().info_file = "A_dummy_value"

        # Check to see if tree/alignment/fragment file provided, if not,
        # generate it from sequence file
        have_tree = options().tree_file is not None
        have_alignment = options().alignment_file is not None
        have_sequences = options().sequence_file is not None
        if have_tree and have_alignment and have_sequences:
            options().fragment_file = options().sequence_file
        elif not have_tree and not have_alignment and have_sequences:
            self.generate_backbone()
        else:
            _LOG.error("Either specify the backbone alignment and tree and query sequences or only the query sequences.  Any other combination is invalid")
            sys.exit(-1)

        sequences = MutableAlignment()
        # Count the taxa while the handle is open, then close it.
        with open(self.options.alignment_file.name) as backbone_file:
            sequences.read_file_object(backbone_file)
            backbone_size = sequences.get_num_taxa()
        if options().backbone_size is None:
            options().backbone_size = backbone_size
        assert options().backbone_size == backbone_size, ("Backbone parameter needs to match actual size of backbone; backbone parameter:%s backbone_size:%s"
                %(options().backbone_size, backbone_size))
        if options().placement_size is None:
            options().placement_size = options().backbone_size
        return ExhaustiveAlgorithm.check_options(self)
Ejemplo n.º 46
0
 def check_options(self, supply=[]):
     """Add the taxonomy inputs that are not set to the missing inputs,
     then run the ExhaustiveAlgorithm checks with the resulting list."""
     required = (
         (options().taxonomy_file, "taxonomy file"),
         (options().taxonomy_name_mapping_file,
          "taxonomy name mapping file"),
     )
     for value, label in required:
         if value is None:
             # Builds a new list; the caller's list is left untouched.
             supply = supply + [label]
     ExhaustiveAlgorithm.check_options(self, supply)
Ejemplo n.º 47
0
def build_profile(input, output_directory):
    """Bin reads to marker genes, run TIPP per marker and write abundances.

    The input is passed through ``to_fasta``, binned to marker genes with
    HMMER or BLAST (selected by ``options().bin``), and ``run_tipp.py`` is
    launched once for every marker that received fragments.  Each marker's
    classification is thresholded with ``options().placement_threshold``,
    written under ``<output_directory>/markers/``, pooled, and finally
    handed to ``write_abundance``.

    Args:
        input: Query read file; converted by ``to_fasta`` before binning.
        output_directory: Directory that receives ``markers/`` outputs and
            the abundance files.

    Returns:
        None.  Returns early (after writing a note to
        ``abundance.phylum.csv``) when no fragment could be binned, and
        also when a marker's alignment-decomposition tree differs from its
        placement tree while the placement size is below the taxon count.

    Side effects:
        Rebinds the module globals ``taxon_map``, ``level_map`` and
        ``key_map``; creates a temporary directory; runs external commands.
    """
    global taxon_map, level_map, key_map, levels, refpkg

    temp_dir = tempfile.mkdtemp(dir=options().__getattribute__('tempdir'))

    # New option to allow fastq files as input
    input = to_fasta(input, temp_dir)

    if options().bin == "hmmer":
        binned_fragments = hmmer_to_markers(input, temp_dir)
    else:
        binned_fragments = blast_to_markers(input, temp_dir)

    # Discard markers that received no fragments
    for gene in refpkg["genes"]:
        try:
            if binned_fragments[gene]["nfrags"] == 0:
                del binned_fragments[gene]
        except KeyError:
            pass

    # Optionally restrict the analysis to the user-selected markers
    if options().gene is not None:
        keep = set(options().gene.split(','))
        for gene in refpkg["genes"]:
            if gene not in keep:
                try:
                    del binned_fragments[gene]
                    print("Removed reads that hit %s" % gene)
                except KeyError:
                    pass

    if binned_fragments:
        print("Finished binning")
    else:
        print("Unable to bin any fragments!\n")
        with open(output_directory + "/abundance.phylum.csv", 'w') as f:
            f.write("Unable to create an abundance profile, because"
                    " none of the input sequences mapped to the"
                    " marker gene(s).")
        return

    # Load up taxonomy for marker genes
    (taxon_map, level_map, key_map) = \
        load_taxonomy(refpkg["taxonomy"]["taxonomy"])

    # Store all classifications here
    classifications = {}
    classification_files = []

    # Run TIPP on each fragment
    for gene in binned_fragments:
        # Set placement subset size to equal the size of each marker
        with open(refpkg[gene]["size"], 'r') as f:
            total_taxa = int(f.readline().strip())
        default_subset_size = int(total_taxa * 0.10)

        # Set alignment size and placement size
        alignment_size = options().alignment_size
        placement_size = options().placement_size

        if alignment_size is None:
            if placement_size is None:
                alignment_size = default_subset_size
            else:
                alignment_size = placement_size

        if placement_size is None:
            placement_size = 10000  # Needs to be large

        # Neither subset may exceed the number of taxa in the marker
        if alignment_size > total_taxa:
            alignment_size = total_taxa

        if placement_size > total_taxa:
            placement_size = total_taxa

        if alignment_size != placement_size:
            if placement_size < total_taxa:
                sys.exit("Alignment decomposition tree can be different from"
                         " placement tree only if the placement subset size"
                         " is set to the number of taxa")
        if (refpkg[gene]["alignment-decomposition-tree"] ==
                refpkg[gene]["placement-tree"]) or \
                (placement_size == total_taxa):
            pass
        else:
            print("Alignment decomposition tree can be different from"
                  " placement tree only if the placement subset size"
                  " is set to the number of taxa"
                  " (note: marker %s has %d taxa)" % (gene, total_taxa))
            return

        # Set number of CPUS (never more than there are fragments)
        cpus = options().cpu
        if binned_fragments[gene]["nfrags"] < cpus:
            cpus = binned_fragments[gene]["nfrags"]

        # Set extra arguments.  They are collected in a list and joined
        # with spaces so that consecutive flags can never run together
        # (the previous string concatenation produced "-D-F <n>").
        extra_args = []
        if options().dist is True:
            extra_args.append("-D")
        if options().max_chunk_size is not None:
            extra_args.append("-F %d" % options().max_chunk_size)
        if options().cutoff != 0:
            extra_args.append("-C %f" % options().cutoff)
        extra = " ".join(extra_args)

        cmd = "run_tipp.py " \
            + " -c " + tipp_config_path \
            + " --cpu " + "%d" % cpus \
            + " -m " + options().molecule \
            + " -f " + binned_fragments[gene]["file"] \
            + " -t " + refpkg[gene]["placement-tree"] \
            + " -adt " + refpkg[gene]["alignment-decomposition-tree"] \
            + " -a " + refpkg[gene]["alignment"] \
            + " -r " + refpkg[gene]["raxml-info-for-placement-tree"] \
            + " -tx " + refpkg["taxonomy"]["taxonomy"] \
            + " -txm " + refpkg[gene]["seq-to-taxid-map"] \
            + " -at " + "%0.2f" % options().alignment_threshold \
            + " -pt 0.0" \
            + " -A " + "%d" % alignment_size \
            + " -P " + "%d" % placement_size \
            + " -p " + temp_dir + "/temp_file" \
            + " -o tipp_" + gene \
            + " -d " + output_directory + "/markers/ " \
            + extra

        print(cmd)
        os.system(cmd)

        tipp_output = output_directory + "/markers/tipp_" + gene \
            + "_classification.txt"

        # No classification file means TIPP produced nothing for this marker
        if not os.path.exists(tipp_output):
            continue

        classification_files.append(tipp_output)

        # Apply placement threshold to classification data
        gene_classification_output = output_directory \
            + "/markers/tipp_" + gene + "_classification_" \
            + "%0.2f" % options().placement_threshold + ".txt"

        gene_classification = generate_classification(
            tipp_output,
            options().placement_threshold)

        write_classification(gene_classification, gene_classification_output)

        # Pool classification
        classifications.update(gene_classification)

    remove_unclassified_level(classifications)
    write_classification(classifications,
                         output_directory + "/markers/all.classification")
    write_abundance(classifications, output_directory)

    if options().dist is True:
        distribution(classification_files, output_directory)
Ejemplo n.º 48
0
    def generate_backbone(self):
        """Pick a backbone from the input sequences and align it with PASTA.

        Steps, in order:

        * read and degap the input sequences;
        * when a median full length and/or a full-length range is
          configured, move sequences outside the allowed length window into
          the fragment set;
        * randomly sample ``options().backbone_size`` of the remaining
          sequences as the backbone (default: at most 1000);
        * run ``PastaAlignJob`` on the backbone and copy its alignment and
          tree to the output location;
        * point ``options().alignment_file`` / ``tree_file`` at the copies;
        * write all non-backbone sequences as the query (fragment) file, or
          output the backbone alignment and exit with status 0 when there
          is nothing left to align.
        """
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if (options().median_full_length is not None
                or options().full_length_range is not None):
            if (options().median_full_length == -1):
                # -1 requests the median of the observed sequence lengths:
                # the middle element for an odd count, the mean of the two
                # middle elements for an even count.
                seq_lengths = sorted(len(seq) for seq in sequences.values())
                count = len(seq_lengths)
                mid = count // 2
                if count % 2:
                    options().median_full_length = seq_lengths[mid]
                else:
                    options().median_full_length = (
                        seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0
            if options().full_length_range is not None:
                # An explicit range takes precedence over the median window
                L = sorted(int(x) for x in options().full_length_range.split())
                min_length = L[0]
                max_length = L[1]
            else:
                (min_length,
                 max_length) = (int(options().median_full_length *
                                    (1 - options().backbone_threshold)),
                                int(options().median_full_length *
                                    (1 + options().backbone_threshold)))
            _LOG.info(
                "Full length sequences are set to be from %d to %d character long"
                % (min_length, max_length))
            frag_names = [
                name for name in sequences if len(sequences[name]) > max_length
                or len(sequences[name]) < min_length
            ]
            if (len(frag_names) > 0):
                _LOG.info("Detected %d fragmentary sequences" %
                          len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                for name in list(fragments.keys()):
                    sequences.pop(name)
        if (options().backbone_size is None):
            options().backbone_size = min(1000, int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        if (options().backbone_size > len(list(sequences.keys()))):
            options().backbone_size = len(list(sequences.keys()))
        # Sample from a sorted name list so the draw depends only on the
        # random state, not on the container's iteration order.
        sample = sorted(
            random.sample(sorted(list(sequences.keys())),
                          options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if (options().molecule == 'amino'):
            moleculeType = 'protein'
        pastaalignJob.setup(backbone,
                            options().backbone_size, moleculeType,
                            options().cpu, **vars(options().pasta))
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()

        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

        options().placement_size = self.options.backbone_size
        # These handles are stored on the options object for later stages,
        # so they are intentionally left open here.
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s" %
            (options().alignment_file, options().tree_file))
        # Add the length-filtered fragments back to the non-backbone set
        sequences.set_alignment(fragments)
        if (len(sequences) == 0):
            # Nothing to align: the backbone alignment is the final result
            sequences = MutableAlignment()
            sequences.read_file_object(open(self.options.alignment_file.name))
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align.  Final alignment saved as %s" %
                self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)
Ejemplo n.º 49
0
    def check_options(self):
        """Validate inputs, sizes and backtranslation settings.

        After checking the output prefix, one of two input combinations is
        required: tree + alignment + sequence file (the sequence file is
        then used as the fragment file) or the sequence file alone (a
        backbone is generated).  Anything else logs an error and exits with
        status -1.  The backbone size must equal the number of taxa in the
        backbone alignment, the placement size defaults to the backbone
        size, and backtranslation is rejected unless the molecule type is
        ``"amino"``.

        Returns whatever ``ExhaustiveAlgorithm.check_options`` returns.
        """
        self.check_outputprefix()
        # Placeholder value; the parent checks expect an info file to be set.
        options().info_file = "A_dummy_value"

        has_tree = options().tree_file is not None
        has_alignment = options().alignment_file is not None
        has_sequences = options().sequence_file is not None

        if has_tree and has_alignment and has_sequences:
            options().fragment_file = options().sequence_file
        elif has_sequences and not (has_tree or has_alignment):
            self.generate_backbone()
        else:
            _LOG.error(
                "Either specify the backbone alignment and tree and query "
                "sequences or only the query sequences.  Any other "
                "combination is invalid")
            exit(-1)

        # Count the taxa actually present in the backbone alignment.
        backbone = MutableAlignment()
        backbone.read_file_object(open(self.options.alignment_file.name))
        actual_size = backbone.get_num_taxa()

        if options().backbone_size is None:
            options().backbone_size = actual_size
        assert options().backbone_size == actual_size, (
            "Backbone parameter needs to match actual size of backbone; "
            "backbone parameter:%s backbone_size:%s"
            % (options().backbone_size, actual_size))

        if options().placement_size is None:
            options().placement_size = options().backbone_size

        wants_backtranslation = options().backtranslation_sequence_file
        if wants_backtranslation and options().molecule != "amino":
            _LOG.error("Backtranslation can be performed only when "
                       "input sequences are amino acid. ")
            exit(-1)

        return ExhaustiveAlgorithm.check_options(self)
Ejemplo n.º 50
0
    def testMainConfigFile(self):
        """Check that the main config file yields usable external binaries.

        For each of ``pplacer``, ``hmmalign`` and ``hmmsearch`` the option
        must be set and its ``path`` must exist on disk.  Each assertion
        checks and reports the tool it is actually about.
        """
        # Reset the cached options so that test cases stay independent of
        # each other.
        config._options_singelton = None

        sys.argv = [sys.argv[0]]

        for tool in ("pplacer", "hmmalign", "hmmsearch"):
            binary = getattr(options(), tool)
            # getattr(..., None) keeps the failure an AssertionError even
            # when the option itself is None.
            assert binary is not None and os.path.exists(binary.path), (
                "main config file options not read properly, "
                "or nonexistent binaries: %s = %s"
                % (tool, getattr(binary, "path", None)))

        print(options())
Ejemplo n.º 51
0
 def generate_backbone(self):
     """Pick a backbone from the input sequences and align it with PASTA.

     When ``options().median_full_length`` is set, sequences whose length
     falls outside the ``backbone_threshold`` window around the median are
     moved to the fragment set.  ``options().backbone_size`` of the
     remaining sequences (default: at most 1000) are sampled at random,
     written to a temporary FASTA file and aligned with ``PastaAlignJob``.
     The resulting ``pasta.fasta`` / ``pasta.fasttree`` in the output
     directory become the backbone alignment and tree.  All non-backbone
     sequences are written as the query file; if there are none, the
     backbone alignment is copied to the final alignment and the process
     exits with status 0.
     """
     _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
     sequences = MutableAlignment()
     sequences.read_file_object(self.options.sequence_file)
     fragments = MutableAlignment()
     if options().median_full_length is not None:
         if options().median_full_length == -1:
             # -1 requests the median of the observed sequence lengths:
             # the middle element for an odd count, the mean of the two
             # middle elements for an even count.  Floor division keeps
             # the index an int on Python 3 as well.
             seq_lengths = sorted(len(seq) for seq in sequences.values())
             count = len(seq_lengths)
             mid = count // 2
             if count % 2:
                 options().median_full_length = seq_lengths[mid]
             else:
                 options().median_full_length = (
                     seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0

         min_length = int(options().median_full_length *
                          (1 - options().backbone_threshold))
         max_length = int(options().median_full_length *
                          (1 + options().backbone_threshold))
         frag_names = [name for name in sequences
                       if len(sequences[name]) > max_length
                       or len(sequences[name]) < min_length]
         if len(frag_names) > 0:
             fragments = sequences.get_hard_sub_alignment(frag_names)
             for name in list(fragments.keys()):
                 sequences.pop(name)
     if options().backbone_size is None:
         options().backbone_size = min(1000, int(sequences.get_num_taxa()))
         _LOG.info("Backbone size set to: %d" % (options().backbone_size))
     if options().backbone_size > len(sequences.keys()):
         options().backbone_size = len(sequences.keys())
     # random.sample() needs a real sequence (a keys view is rejected on
     # Python 3.11+), hence the explicit list().
     backbone_sequences = sequences.get_hard_sub_alignment(
         random.sample(list(sequences.keys()), options().backbone_size))
     for name in list(backbone_sequences.keys()):
         sequences.pop(name)

     _LOG.info("Writing backbone set. ")
     backbone = get_temp_file("backbone", "backbone", ".fas")
     _write_fasta(backbone_sequences, backbone)

     _LOG.info("Generating pasta backbone alignment and tree. ")
     pastaalignJob = PastaAlignJob()
     moleculeType = options().molecule
     if options().molecule == 'amino':
         moleculeType = 'protein'
     pastaalignJob.setup(backbone, options().backbone_size,
                         self.options.outdir, moleculeType, options().cpu)
     pastaalignJob.run()
     pastaalignJob.read_results()

     options().placement_size = self.options.backbone_size
     # These handles are stored on the options object for later stages,
     # so they are intentionally left open here.
     options().alignment_file = open(self.options.outdir + "/pasta.fasta")
     options().tree_file = open(self.options.outdir + "/pasta.fasttree")
     _LOG.info("Backbone alignment written to %s.\nBackbone tree written to %s" % (options().alignment_file, options().tree_file))
     # Add the length-filtered fragments back to the non-backbone set
     sequences.set_alignment(fragments)
     if len(sequences) == 0:
         # Nothing to align: the backbone alignment is the final result
         _LOG.info("No query sequences to align.  Final alignment saved as %s" % self.get_output_filename("alignment.fasta"))
         shutil.copyfile(self.options.outdir + "/pasta.fasta",
                         self.get_output_filename("alignment.fasta"))
         sys.exit(0)
     else:
         query = get_temp_file("query", "backbone", ".fas")
         options().fragment_file = query
         _write_fasta(sequences, query)
Ejemplo n.º 52
0
    def check_options(self):
        """Validate inputs and settle backbone, placement and alignment sizes.

        One of two input combinations is required: tree + alignment +
        sequence file (the sequence file is then used as the fragment file)
        or the sequence file alone (a backbone is generated).  Anything
        else logs an error and exits with status -1.  The backbone size
        must equal the number of taxa in the backbone alignment and the
        placement size defaults to the backbone size.

        When no alignment subset size is given it is derived here: 10 for
        amino-acid data; otherwise 10, doubled repeatedly while the doubled
        value stays below the number of taxa, but only if the backbone's
        average p-distance exceeds 0.60.

        Returns whatever ``ExhaustiveAlgorithm.check_options`` returns.
        """
        # Placeholder value; the parent checks expect an info file to be set.
        options().info_file = "A_dummy_value"

        # Check to see if tree/alignment/fragment file provided, if not,
        # generate it from sequence file
        if (options().tree_file is not None
                and options().alignment_file is not None
                and options().sequence_file is not None):
            options().fragment_file = options().sequence_file
        elif (options().tree_file is None
              and options().alignment_file is None
              and options().sequence_file is not None):
            self.generate_backbone()
        else:
            _LOG.error(
                "Either specify the backbone alignment and tree and query sequences or only the query sequences.  Any other combination is invalid"
            )
            exit(-1)
        sequences = MutableAlignment()
        sequences.read_file_object(open(self.options.alignment_file.name))
        backbone_size = sequences.get_num_taxa()
        if options().backbone_size is None:
            options().backbone_size = backbone_size
        assert options().backbone_size == backbone_size, (
            "Backbone parameter needs to match actual size of backbone; backbone parameter:%s backbone_size:%s"
            % (options().backbone_size, backbone_size))
        if options().placement_size is None:
            options().placement_size = options().backbone_size
        if options().alignment_size is None:
            _LOG.info(
                "Alignment subset size not given.  Calculating subset size. ")
            alignment = MutableAlignment()
            alignment.read_file_object(open(self.options.alignment_file.name))
            if options().molecule == 'amino':
                _LOG.warning(
                    "Automated alignment subset selection not implemented for protein alignment.  Setting to 10."
                )
                options().alignment_size = 10
            else:
                (averagep, maxp) = alignment.get_p_distance()
                align_size = 10
                if averagep > .60:
                    # Divergent backbone: grow the subset size by doubling
                    # while it stays below the number of taxa.
                    while align_size * 2 < alignment.get_num_taxa():
                        align_size = align_size * 2
                # "%0.2f" prints two decimals; the former "%f0.2" printed
                # six decimals followed by a literal "0.2".
                _LOG.info(
                    "Average p-distance of backbone is %0.2f.  Alignment subset size set to %d. "
                    % (averagep, align_size))
                options().alignment_size = align_size
        return ExhaustiveAlgorithm.check_options(self)
Ejemplo n.º 53
0
def blast_fragments(input, output):
    """Run the configured BLAST binary on ``input`` and write hits to ``output``.

    The database is ``blast/<options().genes>/alignment.fasta.db`` under the
    reference path; results use tabular format 6 with one target sequence
    per query and ``options().cpu`` threads.  The command is executed
    through the shell and nothing is returned.
    """
    blast_binary = options().__getattribute__('blast').path
    database = os.path.join(
        options().__getattribute__('reference').path,
        "blast/%s/alignment.fasta.db" % options().genes)
    template = '%s -db %s -outfmt 6 -query %s -out %s -num_threads %d -max_target_seqs 1 '
    command = template % (blast_binary, database, input, output,
                          options().cpu)
    os.system(command)
Ejemplo n.º 54
0
    def generate_backbone(self):
        """Sample a backbone from the input sequences and align it with SATe.

        ``options().backbone_size`` sequences (default: the smaller of 100
        and 20% of the input) are drawn at random as the backbone; all
        remaining sequences become the query set.  Both sets are written to
        temporary FASTA files, ``SateAlignJob`` is run on the backbone, and
        ``sate.fasta`` / ``sate.fasttree`` in the output directory are
        installed as the backbone alignment and tree.
        """
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        if options().backbone_size is None:
            options().backbone_size = min(100,
                                          int(.20 * sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        # random.sample() needs a real sequence (a keys view is rejected on
        # Python 3.11+), hence the explicit list().
        backbone_sequences = sequences.get_hard_sub_alignment(
            random.sample(list(sequences.keys()),
                          options().backbone_size))
        # Whatever is not in the backbone stays behind as the query set
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing query and backbone set. ")
        query = get_temp_file("query", "backbone", ".fas")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(sequences, query)
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating sate backbone alignment and tree. ")
        satealignJob = SateAlignJob()
        moleculeType = options().molecule
        if options().molecule == 'amino':
            moleculeType = 'protein'
        satealignJob.setup(backbone,
                           options().backbone_size, self.options.outdir,
                           moleculeType,
                           options().cpu)
        satealignJob.run()
        satealignJob.read_results()

        options().placement_size = self.options.backbone_size
        # These handles are stored on the options object for later stages,
        # so they are intentionally left open here.
        options().alignment_file = open(self.options.outdir + "/sate.fasta")
        options().tree_file = open(self.options.outdir + "/sate.fasttree")
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s" %
            (options().alignment_file, options().tree_file))
        options().fragment_file = query
Ejemplo n.º 55
0
def build_profile(input, output_directory):
    """Bin reads to marker genes, run TIPP per marker and write abundances.

    Fragments are binned with ``bin_to_markers``; for every marker
    ``run_tipp.py`` is launched against that marker's reference package.
    Markers whose classification file exists are classified with a
    threshold of 0, written individually, pooled into
    ``markers/all.classification.0`` and passed to ``write_abundance``.

    Rebinds the module globals ``taxon_map``, ``level_map`` and
    ``key_map``.  Returns None.
    """
    global taxon_map, level_map, key_map, levels
    temp_dir = tempfile.mkdtemp(dir=options().__getattribute__('tempdir'))
    binned_fragments = bin_to_markers(input, temp_dir)

    # Load the taxonomy for the 30 marker genes (read from the rpsB package)
    taxonomy_path = (options().__getattribute__('reference').path +
                     'refpkg/rpsB.refpkg/all_taxon.taxonomy')
    taxon_map, level_map, key_map = load_taxonomy(taxonomy_path)

    # Pooled classifications of every marker
    classifications = {}

    command_template = 'run_tipp.py -c %s --cpu %s -m %s -f %s -t %s -adt %s -a %s -r %s -tx %s -txm %s -at %0.2f -pt %0.2f -A %d -P %d -p %s -o %s -d %s'

    # Run TIPP once per marker
    for gene, frags in binned_fragments.items():
        reference_root = options().__getattribute__('reference').path

        # The marker size caps the decomposition and sets the placement size
        size_file = reference_root + 'refpkg/%s.refpkg/sate.size' % gene
        with open(size_file, 'r') as handle:
            total_taxa = int(handle.readline().strip())

        decomp_size = options().alignment_size
        if decomp_size > total_taxa:
            decomp_size = int(total_taxa / 2)

        # Never ask for more CPUs than there are fragments
        cpus = min(options().cpu, len(frags.keys()))

        arguments = (
            options().config_file.name,
            cpus,
            options().molecule,
            temp_dir + "/%s.frags.fas.fixed" % gene,
            reference_root + 'refpkg/%s.refpkg/sate.taxonomy' % gene,
            reference_root + 'refpkg/%s.refpkg/sate.tree' % gene,
            reference_root + 'refpkg/%s.refpkg/sate.fasta' % gene,
            reference_root +
            'refpkg/%s.refpkg/sate.taxonomy.RAxML_info' % gene,
            reference_root + 'refpkg/%s.refpkg/all_taxon.taxonomy' % gene,
            reference_root + 'refpkg/%s.refpkg/species.mapping' % gene,
            options().alignment_threshold,
            options().placement_threshold,
            decomp_size,
            total_taxa,
            temp_dir + "/temp_file",
            "tipp_%s" % gene,
            output_directory + "/markers/",
        )
        os.system(command_template % arguments)

        tipp_output = (output_directory +
                       "/markers/tipp_%s_classification.txt" % gene)
        if not os.path.exists(tipp_output):
            # TIPP produced nothing for this marker
            continue

        gene_classification = generate_classification(tipp_output, 0)

        # Write the per-marker classification, then pool it
        write_classification(
            gene_classification,
            output_directory + "/markers/tipp_%s.classification.0" % gene)
        classifications.update(gene_classification)

    remove_unclassified_level(classifications)
    write_classification(classifications,
                         output_directory + "/markers/all.classification.0")
    write_abundance(classifications, output_directory)
Ejemplo n.º 56
0
def hmmer_search(input, hmmer, output):
    """Run the configured hmmsearch binary and write its report to ``output``.

    ``hmmer`` and ``input`` are passed, in that order, as the two
    positional arguments of the command.  Alignments are suppressed
    (``--noali``), the E-value cutoff is 10000 and ``options().cpu`` CPUs
    are requested.  The command is executed through the shell and nothing
    is returned.
    """
    hmmsearch_binary = options().__getattribute__('hmmsearch').path
    command = '%s --noali -E 10000 --cpu %d -o %s %s %s' % (
        hmmsearch_binary, options().cpu, output, hmmer, input)
    os.system(command)
Ejemplo n.º 57
0
    def build_subproblems(self):
        """Build the hierarchy of subproblems and return its root.

        The backbone tree is decomposed into placement subsets, each
        placement subset is decomposed further into alignment subsets, and
        every alignment subset gets one child problem per fragment chunk
        file.  A ``SeppProblem`` is created at every level.

        Returns:
            ``self.root_problem`` after the hierarchy has been built.

        Raises:
            AssertionError: if the tree or alignment has an unexpected
                type, or if either decomposition yields no subsets.
        """
        (alignment, tree) = self.read_alignment_and_tree()

        # Distances are only computed when the distance option is not 1.
        if options().distance != 1:
            self.compute_distances(alignment)

        assert isinstance(tree, PhylogeneticTree)
        assert isinstance(alignment, MutableAlignment)

        tree.get_tree().resolve_polytomies()
        # Label edges with numbers so that we could assemble things back
        # at the end
        tree.lable_edges()
        ''' Make sure size values are set, and are meaningful. '''
        self.check_and_set_sizes(alignment.get_num_taxa())

        self._create_root_problem(tree, alignment)
        ''' Decompose the tree based on placement subsets'''
        # The decomposition runs on a new PhylogeneticTree built from
        # Tree(tree.den_tree).  The minimum subset size is the placement
        # size divided by the configured factor.
        placement_tree_map = PhylogeneticTree(Tree(
            tree.den_tree)).decompose_tree(
                self.options.placement_size,
                strategy=self.strategy,
                minSize=self.options.placement_size /
                int(self.options.exhaustive.placementminsubsetsizefacotr),
                tree_map={},
                pdistance=1,
                decomp_strategy=self.decomp_strategy,
                distances=self.distances,
                maxDiam=None)
        # NOTE(review): the message reports self.minsubsetsize, whereas the
        # minSize passed above is derived from the placement size - confirm
        # which value is meant to be shown.
        assert len(placement_tree_map) > 0, (
            "Tree could not be decomposed"
            " given the following settings; strategy:%s minsubsetsize:%s"
            " placement_size:%s" %
            (self.strategy, self.minsubsetsize, self.options.placement_size))
        _LOG.info("Breaking into %d placement subsets." %
                  len(placement_tree_map))
        ''' For placement subsets create a placement subproblem,
            and decompose further'''
        for (p_key, p_tree) in placement_tree_map.items():
            assert isinstance(p_tree, PhylogeneticTree)
            # Presumably the second constructor argument is the parent
            # problem the new one is attached to - verify in SeppProblem.
            placement_problem = SeppProblem(p_tree.leaf_node_names(),
                                            self.root_problem)
            placement_problem.subtree = p_tree
            placement_problem.label = "P_%s" % str(p_key)
            _LOG.debug(
                "Placement subset %s has %d nodes" %
                (placement_problem.label, len(p_tree.leaf_node_names())))
            ''' Further decompose to alignment subsets '''
            alignment_tree_map = PhylogeneticTree(Tree(
                p_tree.den_tree)).decompose_tree(
                    self.options.alignment_size,
                    strategy=self.strategy,
                    minSize=self.minsubsetsize,
                    tree_map={},
                    decomp_strategy=self.options.decomp_strategy,
                    pdistance=options().distance,
                    distances=self.distances,
                    maxDiam=self.options.maxDiam)
            assert len(alignment_tree_map) > 0, (
                "Tree could not be decomposed"
                " given the following settings; strategy:%s"
                " minsubsetsize:%s alignmet_size:%s" %
                (self.strategy, self.minsubsetsize,
                 self.options.alignment_size))

            _LOG.debug("Placement subset %s has %d alignment subsets: %s" %
                       (placement_problem.label, len(alignment_tree_map),
                        str(sorted(alignment_tree_map.keys()))))
            _LOG.debug("Placement subset %s has %d taxa:" %
                       (placement_problem.label,
                        sum([
                            len(a_tree.leaf_node_names())
                            for a_tree in alignment_tree_map.values()
                        ])))
            # One alignment problem per alignment subset, created with the
            # placement problem as its second constructor argument.
            for (a_key, a_tree) in alignment_tree_map.items():
                assert isinstance(a_tree, PhylogeneticTree)
                self.modify_tree(a_tree)
                alignment_problem = SeppProblem(a_tree.leaf_node_names(),
                                                placement_problem)
                alignment_problem.subtree = a_tree
                alignment_problem.label = "A_%s_%s" % (str(p_key), str(a_key))

        _LOG.info("Breaking into %d alignment subsets." %
                  (len(list(self.root_problem.iter_leaves()))))
        ''' Divide fragments into chunks, to help achieve better parallelism'''
        fragment_chunk_files = self.create_fragment_files()
        self.root_problem.fragment_chunks = len(fragment_chunk_files)
        # Pair every current leaf problem with every fragment chunk; the
        # chunk problem reuses the leaf's taxa and subtree and is labelled
        # "FC_..." with the chunk index appended.
        for alignment_problem in self.root_problem.iter_leaves():
            for afc in range(0, self.root_problem.fragment_chunks):
                frag_chunk_problem = SeppProblem(alignment_problem.taxa,
                                                 alignment_problem)
                frag_chunk_problem.subtree = alignment_problem.subtree
                frag_chunk_problem.label = alignment_problem.label.replace(
                    "A_", "FC_") + "_" + str(afc)
                frag_chunk_problem.fragments = fragment_chunk_files[afc]

        _LOG.info("Breaking each alignment subset into %d fragment chunks." %
                  self.root_problem.fragment_chunks)
        _LOG.debug("Subproblem structure: %s" % str(self.root_problem))
        return self.root_problem
Ejemplo n.º 58
0
def blast_to_markers(input, temp_dir):
    """
    Bin the sequences of a FASTA file to marker genes using BLAST hits.

    Function based on:
    https://github.com/shahnidhi/tipp2_scripts/blob/master/get_marker_assignment.py

    The first line of ``input`` is sanity-checked, the fragments are
    BLASTed against the marker dataset (unless ``options().blast_file``
    already names a results file), and every sequence with a hit is
    written -- optionally trimmed and/or passed through
    ``reverse_sequence`` -- to ``<temp_dir>/<gene>.frags.fas.fixed``.
    A per-fragment summary is written to ``<temp_dir>/blast-binned.out``.

    Args:
        input: Path to the query FASTA file.
        temp_dir: Directory that receives the BLAST output, the per-gene
            fragment files and the binning summary.

    Returns:
        dict mapping each gene in ``refpkg["genes"]`` to a dict with keys
        ``"file"`` (output path), ``"fptr"`` (the, by then closed, file
        object) and ``"nfrags"`` (number of fragments written).

    Raises:
        SystemExit: if the first line of ``input`` does not start with
            ``'>'`` (including an empty file) or contains a space.
    """
    global refpkg

    # Handle input
    with open(input, 'r') as fp:
        line = fp.readline()
    # startswith() rather than line[0]: an empty file yields '' and must be
    # reported as a format problem instead of raising IndexError.
    if not line.startswith('>'):
        sys.exit("%s is not a FASTA file; please reformat for BLAST." % input)
    if len(line.split(" ")) > 1:
        sys.exit("%s contains spaces; please reformat for BLAST." % input)

    # First blast sequences against all markers
    blast_results = temp_dir + "/blast.out"
    if options().blast_file is None:
        print("Blasting fragments against marker dataset\n")
        blast_fragments(input, blast_results)
    else:
        blast_results = options().blast_file

    # Next bin the blast hits to the best gene
    hitinfo = bin_blast_results(blast_results)

    binned_fragments = {}
    summary = None
    try:
        for gene in refpkg["genes"]:
            frag_path = temp_dir + '/' + gene + ".frags.fas.fixed"
            binned_fragments[gene] = {
                "file": frag_path,
                "fptr": open(frag_path, 'w'),
                "nfrags": 0,
            }

        summary = open(temp_dir + "/blast-binned.out", 'w')
        summary.write("qseqid,sseqid,marker,trim_qstart,trim_qend,qlen\n")

        # NOTE: only FASTA input is read here; there is no FASTQ branch.
        for header, seq in fasta_iter(input):
            try:
                hit = hitinfo[header]
                gene = hit["gene"]
            except KeyError:
                # No usable BLAST hit for this sequence: leave it unbinned.
                continue

            sseqid = hit["sseqid"]
            qstart = hit["qstart"]
            qend = hit["qend"]
            qlen = hit["qlen"]
            sstart = hit["sstart"]
            send = hit["send"]
            slen = hit["slen"]

            trim_qstart = 0
            trim_qend = qlen
            if not options().no_trim:
                # Unaligned query residues on either side of the hit
                # (presumably 1-based inclusive BLAST coordinates -- confirm
                # against bin_blast_results).
                extra_qstart = qstart - 1
                extra_qend = qlen - qend

                # Unaligned subject residues facing the same query ends;
                # sstart > send means the hit is on the reverse strand.
                if sstart < send:
                    extra_sstart = sstart - 1
                    extra_send = slen - send
                else:
                    extra_sstart = slen - sstart
                    extra_send = send - 1

                # Trim a query overhang only when it is more than twice as
                # long as the subject sequence left on that side.
                if extra_qstart > 2 * extra_sstart:
                    trim_qstart = qstart - 1
                    seq = seq[trim_qstart:]

                # NOTE(review): trim_qend is an index into the original
                # query, but seq may already have been shortened at its
                # start just above, so this slice does not necessarily end
                # at qend -- confirm this is the intended behaviour.
                if extra_qend > 2 * extra_send:
                    trim_qend = qend
                    seq = seq[:trim_qend]

            if sstart > send:
                seq = reverse_sequence(seq)

            out = binned_fragments[gene]
            out["fptr"].write('>' + header + '\n')
            out["fptr"].write(seq + '\n')
            out["nfrags"] += 1
            summary.write(header + ',' + sseqid + ',' + gene + ',' +
                          str(trim_qstart + 1) + ',' + str(trim_qend) + ',' +
                          str(qlen) + '\n')
    finally:
        # Always release the handles, even if binning fails part-way.
        for entry in binned_fragments.values():
            entry["fptr"].close()
        if summary is not None:
            summary.close()

    return binned_fragments