def build_profile(input, output_directory):
    """Bin fragments per marker gene, run TIPP on each bin, and pool results.

    ``input`` is the path of the fragment file and ``output_directory`` the
    directory under which ``markers/`` outputs are written.  For every marker
    returned by ``bin_to_markers`` the external ``run_tipp.py`` script is
    invoked; markers that produce no classification file are skipped.  The
    pooled classifications are finally written out together with the
    abundance profile.
    """
    global taxon_map, level_map, key_map, levels

    temp_dir = tempfile.mkdtemp(dir=options().__getattribute__('tempdir'))
    binned_fragments = bin_to_markers(input, temp_dir)

    # Load up taxonomy for 30 marker genes
    refpkg_root = options().__getattribute__('reference').path + 'refpkg/'
    (taxon_map, level_map, key_map) = load_taxonomy(
        refpkg_root + 'rpsB.refpkg/all_taxon.taxonomy')

    markers_dir = output_directory + "/markers/"

    # All classifications are pooled here
    classifications = {}

    # Now run TIPP on each marker's fragments
    for gene, frags in binned_fragments.items():
        pkg = refpkg_root + '%s.refpkg/' % gene

        # Size of the marker's reference set
        with open(pkg + 'sate.size', 'r') as size_file:
            total_taxa = int(size_file.readline().strip())

        # Never decompose into subsets larger than the reference itself
        decomp_size = options().alignment_size
        if decomp_size > total_taxa:
            decomp_size = int(total_taxa / 2)

        # No point in using more CPUs than there are fragments
        cpus = min(options().cpu, len(frags.keys()))

        tipp_args = (
            options().config_file.name,
            cpus,
            options().molecule,
            temp_dir + "/%s.frags.fas.fixed" % gene,
            pkg + 'sate.taxonomy',
            pkg + 'sate.tree',
            pkg + 'sate.fasta',
            pkg + 'sate.taxonomy.RAxML_info',
            pkg + 'all_taxon.taxonomy',
            pkg + 'species.mapping',
            options().alignment_threshold,
            options().placement_threshold,
            decomp_size,
            total_taxa,
            temp_dir + "/temp_file",
            "tipp_%s" % gene,
            output_directory + "/markers/",
        )
        os.system('run_tipp.py -c %s --cpu %s -m %s -f %s -t %s -adt %s -a %s -r %s -tx %s -txm %s -at %0.2f -pt %0.2f -A %d -P %d -p %s -o %s -d %s' % tipp_args)

        classification_path = markers_dir + "tipp_%s_classification.txt" % gene
        if not os.path.exists(classification_path):
            continue

        gene_classification = generate_classification(classification_path, 0)

        # Write the per-marker classification and also pool it
        write_classification(
            gene_classification,
            markers_dir + "tipp_%s.classification.0" % gene)
        classifications.update(gene_classification)

    remove_unclassified_level(classifications)
    write_classification(classifications, markers_dir + "all.classification.0")
    write_abundance(classifications, output_directory)
def hmmer_search(input, hmmer, output):
    """Run the configured hmmsearch binary on ``input`` with profile ``hmmer``.

    The search report is written to ``output`` (``-o``); alignments are
    suppressed (``--noali``) and the E-value cutoff is 10000.  Nothing is
    returned and the exit status of the command is not checked.
    """
    command = "%s --noali -E 10000 --cpu %d -o %s %s %s" % (
        options().__getattribute__("hmmsearch").path,
        options().cpu,
        output,
        hmmer,
        input,
    )
    os.system(command)
def generate_backbone(self):
    """Split the input sequences into a random backbone and a query set.

    Reads ``self.options.sequence_file``, samples ``backbone_size`` sequences
    (default: ``min(100, 20%)`` of the input) as the backbone, writes backbone
    and remaining query sequences to temp files, runs a ``SateAlignJob`` on
    the backbone, and finally points the global options at the resulting
    alignment, tree and query fragment file.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)

    if options().backbone_size is None:
        options().backbone_size = min(100, int(.20 * sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))

    # random.sample needs a real sequence: a dict keys view is rejected on
    # Python 3.11+, so materialise the names first.
    backbone_names = random.sample(list(sequences.keys()),
                                   options().backbone_size)
    backbone_sequences = sequences.get_hard_sub_alignment(backbone_names)

    # Whatever is not in the backbone becomes the query set.
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing query and backbone set. ")
    query = get_temp_file("query", "backbone", ".fas")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(sequences, query)
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating sate backbone alignment and tree. ")
    satealignJob = SateAlignJob()
    # The align job is given 'protein' where the options say 'amino'.
    moleculeType = options().molecule
    if options().molecule == 'amino':
        moleculeType = 'protein'
    satealignJob.setup(backbone, options().backbone_size,
                       self.options.outdir, moleculeType, options().cpu)
    satealignJob.run()
    satealignJob.read_results()

    options().placement_size = self.options.backbone_size
    options().alignment_file = open(self.options.outdir + "/sate.fasta")
    options().tree_file = open(self.options.outdir + "/sate.fasttree")
    _LOG.info("Backbone alignment written to %s.\nBackbone tree written to %s" % (options().alignment_file, options().tree_file))
    options().fragment_file = query
def check_options(self, supply=[]):
    """Validate the taxonomy-related options, then defer to the base class.

    When a reference package name is configured it is loaded first (which
    populates the taxonomy options).  Any taxonomy option still missing is
    appended to ``supply`` before ``ExhaustiveAlgorithm.check_options`` runs.
    """
    pkg_name = options().reference_pkg
    if pkg_name is not None:
        self.load_reference(
            os.path.join(options().reference.path,
                         'refpkg/%s.refpkg/' % pkg_name))

    # (label reported to the user, option attribute that must be set)
    required = (
        ("taxonomy file", "taxonomy_file"),
        ("taxonomy name mapping file", "taxonomy_name_mapping_file"),
    )
    for label, attribute in required:
        if getattr(options(), attribute) is None:
            # Build a new list so the shared default argument is not mutated.
            supply = supply + [label]

    ExhaustiveAlgorithm.check_options(self, supply)
def main():
    """Parse the command line and build the abundance profile.

    The alignment size falls back to 100 when it was not given.
    """
    augment_parser()
    sepp.config._options_singelton = sepp.config._parse_options()

    opts = options()
    if opts.alignment_size is None:
        opts.alignment_size = 100

    build_profile(opts.fragment_file.name, opts.outdir)
def check_options(self):
    """Check that a backbone alignment and tree are given and are readable.

    ``info_file`` is set to a dummy value before the checks run.  The process
    exits with status -1 when the tree or alignment is missing.  The alignment
    is parsed once as a sanity check; the parsed result is discarded.

    Returns whatever ``ExhaustiveAlgorithm.check_options`` returns.
    """
    options().info_file = "A_dummy_value"

    if options().tree_file is None or options().alignment_file is None:
        _LOG.error("Specify the backbone alignment and tree and query sequences")
        exit(-1)

    # Parse the alignment to fail early on unreadable input; close the handle
    # afterwards instead of leaking it.
    # NOTE(review): assumes read_file_object consumes the file eagerly.
    sequences = MutableAlignment()
    with open(self.options.alignment_file.name) as alignment_handle:
        sequences.read_file_object(alignment_handle)

    return ExhaustiveAlgorithm.check_options(self)
def load_reference(self, reference_pkg):
    """Populate the global options from a reference package directory.

    ``reference_pkg`` is used as a string prefix (callers pass a path ending
    in a separator).  ``CONTENTS.json`` inside it names the files; those are
    opened and stored on the options, except ``info_file`` which is stored as
    a path.  The opened handles are intentionally left open for later readers.
    """
    # Close the manifest even if it is not valid JSON.
    with open(reference_pkg + 'CONTENTS.json') as contents:
        result = json.load(contents)

    files = result['files']
    options().taxonomy_name_mapping_file = open(reference_pkg + files['seq_info'])
    options().taxonomy_file = open(reference_pkg + files['taxonomy'])
    options().alignment_file = open(reference_pkg + files['aln_fasta'])
    options().tree_file = open(reference_pkg + files['tree'])
    options().info_file = reference_pkg + files['tree_stats']
def hmmer_to_markers(input, temp_dir):
    """Assign each fragment to its best-scoring marker gene using HMMER.

    Every fragment and its reverse (named ``<name>_rev``) is searched against
    each marker's profile; a fragment goes to the marker/direction with the
    highest bit score.  One ``<gene>.frags.fas.fixed`` file is written per
    marker into ``temp_dir``.

    Returns a dict ``{gene: {fragment name: sequence}}``; fragments with no
    hit at all are dropped.
    """
    global marker_genes

    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = dict([(name + '_rev', reverse_sequence(seq))
                    for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best hit seen so far per fragment: [bitscore, gene, direction]
    frag_scores = dict([(name, [-10000, 'NA', 'NA'])
                        for name in fragments.keys()])

    gene_set = marker_genes
    align_name = 'sate'
    if (options().genes == 'cogs'):
        gene_set = cog_genes
        align_name = 'pasta'

    for gene in gene_set:
        # Now run HMMER search.  The profile is '<align_name>.profile'; the
        # previous '%.profile' was an invalid format spec and raised
        # ValueError before any search could run.
        hmmer_search(
            frag_file,
            os.path.join(
                options().__getattribute__('reference').path,
                'refpkg/%s.refpkg/%s.profile' % (gene, align_name)),
            temp_dir + "/%s.out" % gene)
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        # Now select best direction for each frag
        for name, value in results.items():
            bitscore = value[1]
            direction = 'forward'
            true_name = name
            # Only a trailing '_rev' that we added ourselves marks a reverse
            # hit; names that merely contain '_rev' are left untouched.
            if name not in frag_scores and name.endswith('_rev'):
                true_name = name[:-len('_rev')]
                direction = 'reverse'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments by their best gene
    genes = dict([])
    for name, val in frag_scores.items():
        if (val[1] not in genes):
            genes[val[1]] = {}
        if (val[2] == 'forward'):
            genes[val[1]][name] = fragments[name]
        else:
            genes[val[1]][name] = reverse_sequence(fragments[name])

    # 'NA' collects the fragments that never scored against any marker
    genes.pop("NA", None)

    for gene, seq in genes.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(seq, gene_file + ".fixed")
    return genes
def __init__(self):
    """Set up the exhaustive algorithm from the global options."""
    AbstractAlgorithm.__init__(self)
    self.place_nomatch_fragments = False

    # Hardcoded E-Lim for hmmsearch
    # TODO: what to do with this
    self.elim = 99999999
    self.filters = False

    exhaustive_opts = options().exhaustive
    self.strategy = exhaustive_opts.strategy
    self.minsubsetsize = int(exhaustive_opts.minsubsetsize)

    # Temp fix for now
    self.molecule = self.options.molecule
def hmmer_to_markers(input, temp_dir):
    """Assign each fragment to its best-scoring marker gene using HMMER.

    Every fragment and its reverse (named ``<name>_rev``) is searched against
    each marker's profile; a fragment goes to the marker/direction with the
    highest bit score.  One ``<gene>.frags.fas.fixed`` file is written per
    marker into ``temp_dir``.

    Returns a dict ``{gene: {fragment name: sequence}}``; fragments with no
    hit at all are dropped.
    """
    global marker_genes

    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = dict([(name + "_rev", reverse_sequence(seq)) for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best hit seen so far per fragment: [bitscore, gene, direction]
    frag_scores = dict([(name, [-10000, "NA", "NA"]) for name in fragments.keys()])

    gene_set = marker_genes
    align_name = "sate"
    if options().genes == "cogs":
        gene_set = cog_genes
        align_name = "pasta"

    suffix = "_rev"
    for gene in gene_set:
        search_output = temp_dir + "/%s.out" % gene
        # Now run HMMER search.  "%s.profile" replaces the invalid
        # "%.profile" spec, which raised ValueError at formatting time.
        hmmer_search(
            frag_file,
            os.path.join(
                options().__getattribute__("reference").path,
                "refpkg/%s.refpkg/%s.profile" % (gene, align_name)
            ),
            search_output,
        )
        results = read_hmmsearch_results(search_output)

        # Now select best direction for each frag
        for name in results.keys():
            bitscore = results[name][1]
            if name in frag_scores or not name.endswith(suffix):
                true_name, direction = name, "forward"
            else:
                # Strip only the trailing marker we appended ourselves, so
                # names containing "_rev" elsewhere are not mangled.
                true_name, direction = name[:-len(suffix)], "reverse"
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments by their best gene
    genes = dict([])
    for name in frag_scores.keys():
        best_gene = frag_scores[name][1]
        if best_gene not in genes:
            genes[best_gene] = {}
        if frag_scores[name][2] == "forward":
            genes[best_gene][name] = fragments[name]
        else:
            genes[best_gene][name] = reverse_sequence(fragments[name])

    # "NA" collects the fragments that never scored against any marker
    genes.pop("NA", None)

    for gene in genes.keys():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(genes[gene], gene_file + ".fixed")
    return genes
def testConfigFile(self):
    """Command-line values and config-file values are both picked up."""
    # Reset the cached options so test cases stay independent of each other.
    config._options_singelton = None
    # Disable main config path for this test
    config.main_config_path = self.fp_config

    sys.argv = [
        sys.argv[0],
        "-A", "2",
        "-c", get_data_path("configs/test.config"),
        "--outdir", "dir_form_commandline",
    ]
    opts = options()

    assert opts.alignment_size == 2, \
        "Commandline option -A not read properly"

    config_file = opts.config_file
    assert isinstance(config_file, filetypes) and \
        config_file.name.endswith("data/configs/test.config"), \
        "Commandline option -c not read properly"

    assert (opts.pplacer is not None and
            opts.pplacer.path == "pplacer"), \
        "config file options not read properly"

    assert opts.placement_size == 10, \
        "Config file option placementSize not read properly"

    assert opts.outdir.endswith("dir_form_commandline"), \
        "Config file value outdir is not properly overwritten:%s " % \
        opts.outdir

    assert opts.tempdir is not None, \
        "Default value not properly set for tempfile attribute"
def testCpuCount(self):
    """The ``-x`` command-line flag must end up in ``options().cpu``."""
    # Reset the cached options so test cases stay independent of each other.
    config._options_singelton = None
    back = config.main_config_path
    # Disable main config path for this test
    config.main_config_path = os.path.expanduser("~/.sepp/main.config.notexistentfile")
    try:
        sys.argv = [sys.argv[0], "-x", "7"]
        assert options().cpu == 7, "Commandline option -x not read properly"
        # print() with a single argument behaves the same on Python 2 and 3.
        print(options())
    finally:
        # Restore the config path even when the assertion fails.
        config.main_config_path = back
def blast_fragments(input, output):
    '''Run the configured BLAST binary on ``input`` against the marker
    database of the selected gene set, writing tabular (``-outfmt 6``)
    hits to ``output``.  At most one target is kept per query.'''
    blast_binary = options().__getattribute__('blast').path
    database = os.path.join(
        options().__getattribute__('reference').path,
        "blast/%s/alignment.fasta.db" % options().genes)
    command = (
        '%s -db %s -outfmt 6 -query %s -out %s -num_threads %d '
        '-max_target_seqs 1 ') % (
            blast_binary, database, input, output, options().cpu)
    os.system(command)
def blast_fragments(input, output):
    """Search the fragments in ``input`` against the marker BLAST database.

    Hits are written to ``output`` in tabular format 6, keeping one target
    sequence per query.  The command's exit status is not inspected.
    """
    template = "%s -db %s -outfmt 6 -query %s -out %s -num_threads %d -max_target_seqs 1 "
    arguments = (
        options().__getattribute__("blast").path,
        os.path.join(
            options().__getattribute__("reference").path,
            "blast/%s/alignment.fasta.db" % options().genes,
        ),
        input,
        output,
        options().cpu,
    )
    os.system(template % arguments)
def __init__(self, **kwargs):
    """Create a 'jsonmerger' job configured from the global options.

    Raises ``Exception`` when ``options().hmmsearch.filters`` is neither
    "true" nor "false" (case-insensitive).
    """
    self.job_type = 'jsonmerger'
    ExternalSeppJob.__init__(self, self.job_type, **kwargs)

    self.out_file = None
    self.distribution = False
    for unset in ("taxonomy", "mapping", "threshold", "classification_file"):
        setattr(self, unset, None)

    self.elim = float(options().hmmsearch.elim)

    # Anything other than TRUE/FALSE maps to None and is rejected below.
    self.filters = {"TRUE": True, "FALSE": False}.get(
        options().hmmsearch.filters.upper())
    if self.filters is None:
        raise Exception(
            "Expecting true/false for options().hmmsearch.filters")

    exhaustive_opts = options().exhaustive
    self.strategy = exhaustive_opts.strategy
    self.minsubsetsize = int(exhaustive_opts.minsubsetsize)
    self.alignment_threshold = float(options().alignment_threshold)
    self.molecule = options().molecule
    self.placer = options().exhaustive.__dict__['placer'].lower()
    self.cutoff = 0
def run(self):
    """Drive the whole pipeline: set up, execute jobs, merge and output.

    When recovering from a checkpoint the problem structure is restored
    instead of rebuilt.  Exits the process with status 1 if any job failed.
    """
    checkpoint_manager = options().checkpoint
    assert isinstance(checkpoint_manager, CheckPointManager)

    t = time.time()

    if checkpoint_manager.is_recovering:
        checkpoint_manager.restore_checkpoint()
        self.root_problem = \
            checkpoint_manager.checkpoint_state.root_problem
        self.check_outputprefix()
    else:
        # check input arguments
        self.check_options()

        # build the problem structure
        self.root_problem = self.build_subproblems()

        # build jobs
        self.build_jobs()

        # connect jobs into a DAG
        self.connect_jobs()

        # Queue up first level jobs (i.e. those with no dependency).
        # Once these run, they should automatically enqueue the rest of the
        # DAG through joins and callbacks
        self.enqueue_firstlevel_job()

    # start the checkpointing (has any effects only in checkpointing mode)
    checkpoint_manager.start_checkpointing(self.root_problem)

    # Wait for all jobs to finish
    if (not JobPool().wait_for_all_jobs()):
        # No exception is being handled here, so log a plain error rather
        # than _LOG.exception, which would append an empty traceback.
        _LOG.error(
            "There have been errors in executed jobs. Terminating.")
        sys.exit(1)

    # terminate the job pool and release memory
    JobPool().terminate()

    # Pause checkpointing
    checkpoint_manager.pause_checkpointing()
    # checkpoint_manager.force_checkpoint()

    # Merge results into final outputs
    self.merge_results()

    # Output final results
    self.output_results()

    # Stop checkpointing
    checkpoint_manager.stop_checkpointing()

    _LOG.info("Current execution Finished in %d seconds"
              % (time.time() - t))
    _LOG.info(
        "All checkpointed executions Finished in %d cumulative time" %
        (checkpoint_manager.get_total_time()))
def __init__(self):
    '''
    Start with no problem and no results; cache the global options on the
    instance for ease of access.
    '''
    self.root_problem = self.results = None
    self.options = options()
    self.outchecked = False
def start_checkpointing(self, root_problem):
    """Record the root problem in the checkpoint state and save it once.

    Does nothing unless checkpointing is enabled.  ``save_checkpoint`` is
    called at the end with this manager.
    """
    if not self.is_checkpointing:
        return

    _LOG.info("Checkpoint every %d seconds" % options().checkpoint_interval)

    state = self.checkpoint_state
    state.root_problem = root_problem
    state.temp_root = get_root_temp_dir()
    # First run (nothing restored): start the accumulated time at zero.
    if state.cumulative_time is None:
        state.cumulative_time = 0

    save_checkpoint(self)
def bin_blast_results(input):
    """Group BLAST query names by the marker gene of their hit.

    ``input`` is a tab-separated BLAST result file: column 0 is the query
    name, column 1 the subject name, which is looked up in the
    ``seq2marker.tab`` mapping of the selected gene set.

    Returns a dict ``{gene: [query name, ...]}`` in file order.
    """
    mapping_path = os.path.join(
        options().__getattribute__('reference').path,
        'blast/%s/seq2marker.tab' % options().genes)
    gene_mapping = read_mapping(mapping_path)

    genes = {}
    with open(input) as blast_hits:
        for hit in blast_hits:
            columns = hit.split('\t')
            marker = gene_mapping[columns[1]][1]
            genes.setdefault(marker, []).append(columns[0])
    return genes
def __init__(self):
    '''
    Initialise with no root problem and no results yet.
    '''
    self.root_problem, self.results = None, None
    # Keep a handle on the global options for ease of access.
    self.options = options()
def bin_to_markers(input, temp_dir):
    """Bin fragments to marker genes and fix their orientation.

    Without ``options().gene`` the fragments are BLASTed against all markers
    and binned by hit; otherwise all fragments go to that single gene.  For
    each gene, every fragment and its reverse (``<name>_rev``) is scored with
    HMMER and the better-scoring direction is kept.  The oriented fragments
    are written to ``<temp_dir>/<gene>.frags.fas.fixed``.

    Returns a dict ``{gene: {fragment name: oriented sequence}}``.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    if options().gene is None:
        # First blast sequences against all markers
        blast_results = temp_dir + "/blast.out"
        # print() with one argument is valid on both Python 2 and 3.
        print("Blasting fragments against marker dataset\n")
        blast_fragments(input, blast_results)

        # Next bin the blast hits to the best gene
        gene_binning = bin_blast_results(blast_results)
    else:
        # Store a real list, not a live keys view.
        gene_binning = {options().gene: list(fragments.keys())}

    # Now figure out direction of fragments
    binned_fragments = dict([
        (gene, dict([(seq_name, fragments[seq_name])
                     for seq_name in gene_binning[gene]]))
        for gene in gene_binning])

    for (gene, frags) in binned_fragments.items():
        # Add reverse complement sequence
        frags_rev = dict([(name + '_rev', reverse_sequence(seq))
                          for (name, seq) in frags.items()])
        gene_frags = MutableAlignment()
        gene_frags.set_alignment(frags)
        gene_frags.set_alignment(frags_rev)
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_frags, gene_file)

        # Now run HMMER search
        hmmer_search(
            gene_file,
            options().__getattribute__('reference').path +
            'refpkg/%s.refpkg/sate.profile' % gene,
            temp_dir + "/%s.out" % gene)
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        # Now select best direction for each frag; a missing hit counts as
        # a very low score so the other direction wins.
        for key in frags:
            forward_score = -10000
            backward_score = -10000
            if key in results:
                forward_score = results[key][1]
            if key + "_rev" in results:
                backward_score = results[key + "_rev"][1]
            if backward_score > forward_score:
                frags[key] = gene_frags[key + "_rev"]

        # Now write to file
        _write_fasta(frags, gene_file + ".fixed")
        binned_fragments[gene] = frags
    return binned_fragments
def testCpuCount(self):
    """The ``-x`` command-line flag must end up in ``options().cpu``."""
    # Reset the cached options so test cases stay independent of each other.
    config._options_singelton = None
    # Disable main config path for this test
    config.main_config_path = self.fp_config

    JobPool().terminate()
    JobPool().__init__(7)
    sys.argv = [sys.argv[0], "-x", "7"]
    try:
        assert options().cpu == 7, "Commandline option -x not read properly"
    finally:
        # Clean up even when the assertion fails, so later tests do not
        # inherit a 7-CPU pool or stale options:
        # 1) the JobPool CPU count is reset to the default
        # 2) the command line arguments are restored
        JobPool().terminate()
        JobPool().__init__(cpu_count())
        sys.argv = [sys.argv[0], "-x", str(cpu_count())]
        config._options_singelton = None
        options()
def figureout_fragment_subset(self):
    ''' Figure out which fragment should go to which subproblem.

    Collects every hmmsearch bit score per fragment, converts the scores to
    (scaled) probabilities, keeps the top-ranked subsets selected by the
    alignment threshold, and registers the fragment (renamed with subset
    label and weight) in each selected alignment subproblem.
    '''
    # We need to keep and check the following flag because of checkpointing
    # scenarios (join already done before!)
    if "fragments.distribution.done" in self.root_problem.annotations:
        return

    bitscores = dict([(name, []) for name in
                      list(self.root_problem.fragments.keys())])
    for fragment_chunk_problem in self.root_problem.iter_leaves():
        align_problem = fragment_chunk_problem.get_parent()
        assert isinstance(align_problem, SeppProblem)
        # For each subproblem start with an empty set of fragments, and add
        # to them as we encounter new best hits for that subproblem
        if align_problem.fragments is None:
            align_problem.fragments = MutableAlignment()
        search_res = fragment_chunk_problem.get_job_result_by_name("hmmsearch")
        for key in list(search_res.keys()):
            # keep a list of all hits, and their bit scores
            bitscores[key].append((search_res[key][1], align_problem))

    for frag, tuplelist in bitscores.items():
        # TODO: what to do with those that are not? For now, only output
        # warning message
        # TODO: Need to double check and fix the math
        # Warn only for fragments that really have no hits (previously the
        # warning was also emitted unconditionally for every fragment).
        if len(tuplelist) == 0:
            _LOG.warning("Fragment %s is not scored against any subset" %str(frag))
            continue

        # convert bit scores to probabilities (scaled by 1e6); exponents are
        # capped at 1022 to stay within float range
        denum = sum(math.pow(2, min(x[0],1022)) for x in tuplelist)
        tuplelist = [((math.pow(2,min(x[0],1022))/denum*1000000),x[1]) for x in tuplelist]

        # Sort subsets by their probability
        tuplelist.sort(reverse=True, key = lambda x: x[0])

        # Find enough subsets to reach the threshold: the fold accumulates
        # probabilities in rank order and stops advancing once the running
        # sum reaches the threshold.
        # NOTE(review): the resulting index never exceeds len(tuplelist) - 1,
        # so with two or more subsets the last-ranked one is never selected
        # -- confirm this is intended.
        selected = tuplelist[ 0 : max(1, reduce(lambda x, y: (x[0],None) if x[1] is None else (y[0],x[1]+y[1]) if x[1] < int(1000000 * self.alignment_threshold) else (y[0],None), enumerate([x[0] for x in tuplelist]))[0]) ]

        # Renormalize the selected list to add up to 1
        renorm = 0
        for (prob,align_problem) in selected:
            renorm = renorm + prob/1000000
        renorm = 1/renorm

        _LOG.debug("Fragment %s assigned to %d subsets" %(frag,len(selected)))

        # Rename the fragment and assign it to the respective subsets
        for (prob,align_problem) in selected:
            postfix = prob*renorm if options().exhaustive.weight_placement_by_alignment.lower() == "true" else 1000000
            frag_rename = "%s_%s_%d" %(frag,align_problem.label,postfix)
            align_problem.fragments[frag_rename] = self.root_problem.fragments[frag]

    self.root_problem.annotations["fragments.distribution.done"] = 1
def testConfigFile(self):
    """Command-line values and config-file values are both picked up."""
    # `file` only exists on Python 2; on Python 3 open() returns an
    # io.IOBase subclass instead.
    try:
        filetypes = file
    except NameError:
        import io
        filetypes = io.IOBase

    # Reset the cached options so test cases stay independent of each other.
    config._options_singelton = None
    back = config.main_config_path
    # Disable main config path for this test
    config.main_config_path = os.path.expanduser("~/.sepp/main.config.notexistentfile")
    try:
        sys.argv = [sys.argv[0], "-A", "2", "-c", "data/configs/test.config",
                    "--outdir", "dir_form_commandline"]

        assert options().alignment_size == 2, "Commandline option -A not read properly"

        assert isinstance(options().config_file, filetypes) and options().config_file.name == "data/configs/test.config", "Commandline option -c not read properly"

        assert (options().pplacer is not None and options().pplacer.path == "pplacer"), "config file options not read properly"

        assert options().placement_size == 10, "Config file option placementSize not read properly"

        assert options().outdir.endswith("dir_form_commandline"), "Config file value outdir is not properly overwritten:%s " %options().outdir

        assert options().tempdir is not None, "Default value not properly set for tempfile attribute"

        # print() with a single argument behaves the same on Python 2 and 3.
        print(options())
    finally:
        # Restore the config path even when an assertion fails.
        config.main_config_path = back
def check_options(self, supply=[]):
    '''
    Verify that every required input option has been provided.

    ``supply`` lists the names of options that subclasses already found to
    be missing.  The tree, alignment and fragment files are checked here;
    if anything is missing a ValueError naming all missing options is
    raised.  Otherwise the output prefix is checked.  Overwrite if required.
    '''
    required = (
        ("tree file", options().tree_file),
        ("alignment file", options().alignment_file),
        ("fragment file", options().fragment_file),
    )
    for label, value in required:
        if value is None:
            # Build a new list so the shared default argument is not mutated.
            supply = supply + [label]

    if supply:
        raise ValueError(
            "Failed to supply: %s\nRun with -h option to see a list of"
            " options." % " , ".join(supply))

    self.check_outputprefix()
def blast_to_markers(input, temp_dir):
    """Bin fragments to marker genes via BLAST and fix their orientation.

    With ``options().gene`` set, all fragments go to that gene.  Otherwise
    BLAST hits are binned per gene, using ``options().blast_file`` when
    given and running BLAST into ``<temp_dir>/blast.out`` when not.  For
    each gene both directions of every fragment are scored with HMMER and
    the better one is kept and written to ``<gene>.frags.fas.fixed``.

    Returns a dict ``{gene: {fragment name: oriented sequence}}``.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    if options().gene is not None:
        gene_binning = {options().gene: list(fragments.keys())}
    else:
        if options().blast_file is not None:
            blast_results = options().blast_file
        else:
            # First blast sequences against all markers
            blast_results = temp_dir + "/blast.out"
            print("Blasting fragments against marker dataset\n")
            blast_fragments(input, blast_results)
        # Next bin the blast hits to the best gene
        gene_binning = bin_blast_results(blast_results)

    # Now figure out direction of fragments
    binned_fragments = {
        gene: {seq_name: fragments[seq_name]
               for seq_name in gene_binning[gene]}
        for gene in gene_binning
    }

    print("Finding best orientation of reads\n")
    align_name = 'pasta' if options().genes == 'cogs' else 'sate'

    for gene, frags in binned_fragments.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        search_output = temp_dir + "/%s.out" % gene

        # Add reverse complement sequence
        frags_rev = {name + '_rev': reverse_sequence(seq)
                     for name, seq in frags.items()}
        gene_frags = MutableAlignment()
        gene_frags.set_alignment(frags)
        gene_frags.set_alignment(frags_rev)
        _write_fasta(gene_frags, gene_file)

        # Now run HMMER search
        profile = os.path.join(
            options().__getattribute__('reference').path,
            'refpkg/%s.refpkg/%s.hmm' % (gene, align_name))
        hmmer_search(gene_file, profile, search_output)
        results = read_hmmsearch_results(search_output)

        # Keep the reverse sequence only when it scores strictly better;
        # a direction without a hit counts as -10000.
        for key in frags:
            rev_key = key + "_rev"
            forward_score = results[key][1] if key in results else -10000
            backward_score = (results[rev_key][1]
                              if rev_key in results else -10000)
            if backward_score > forward_score:
                frags[key] = gene_frags[rev_key]

        # Now write to file
        _write_fasta(frags, gene_file + ".fixed")
        binned_fragments[gene] = frags

    return binned_fragments
def figureout_fragment_subset(self):
    ''' Figure out which fragment should go to which subproblem.

    Collects every hmmsearch bit score per fragment, converts the scores to
    (scaled) probabilities, keeps the top-ranked subsets selected by the
    alignment threshold, and registers the fragment (renamed with subset
    label and weight) in each selected alignment subproblem.
    '''
    # reduce is a builtin only on Python 2; functools has it on both 2 and 3.
    from functools import reduce

    # We need to keep and check the following flag because of checkpointing
    # scenarios (join already done before!)
    # (`in` replaces dict.has_key, which no longer exists on Python 3.)
    if "fragments.distribution.done" in self.root_problem.annotations:
        return

    bitscores = dict([(name, []) for name in self.root_problem.fragments.keys()])
    for fragment_chunk_problem in self.root_problem.iter_leaves():
        align_problem = fragment_chunk_problem.get_parent()
        assert isinstance(align_problem, SeppProblem)
        # For each subproblem start with an empty set of fragments, and add
        # to them as we encounter new best hits for that subproblem
        if align_problem.fragments is None:
            align_problem.fragments = MutableAlignment()
        search_res = fragment_chunk_problem.get_job_result_by_name("hmmsearch")
        for key in search_res.keys():
            # keep a list of all hits, and their bit scores
            bitscores[key].append((search_res[key][1], align_problem))

    # items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for frag, tuplelist in bitscores.items():
        # TODO: what to do with those that are not? For now, only output
        # warning message
        # TODO: Need to double check and fix the math
        if len(tuplelist) == 0:
            _LOG.warning("Fragment %s is not scored against any subset" %str(frag))
            continue

        # convert bit scores to probabilities (scaled by 1e6); exponents are
        # capped at 1022 to stay within float range
        denum = sum(math.pow(2, min(x[0],1022)) for x in tuplelist)
        tuplelist = [((math.pow(2,min(x[0],1022))/denum*1000000),x[1]) for x in tuplelist]

        # Sort subsets by their probability.  Sorting on the probability only
        # avoids comparing problem objects on ties, which raises TypeError on
        # Python 3.
        tuplelist.sort(reverse=True, key=lambda x: x[0])

        # Find enough subsets to reach the threshold: the fold accumulates
        # probabilities in rank order and stops advancing once the running
        # sum reaches the threshold.
        # NOTE(review): the resulting index never exceeds len(tuplelist) - 1,
        # so with two or more subsets the last-ranked one is never selected
        # -- confirm this is intended.
        selected = tuplelist[ 0 : max(1, reduce(lambda x, y: (x[0],None) if x[1] is None else (y[0],x[1]+y[1]) if x[1] < int(1000000 * self.alignment_threshold) else (y[0],None), enumerate([x[0] for x in tuplelist]))[0]) ]

        # Renormalize the selected list to add up to 1
        renorm = 0
        for (prob,align_problem) in selected:
            renorm = renorm + prob/1000000
        renorm = 1/renorm

        _LOG.debug("Fragment %s assigned to %d subsets" %(frag,len(selected)))

        # Rename the fragment and assign it to the respective subsets
        for (prob,align_problem) in selected:
            postfix = prob*renorm if options().exhaustive.weight_placement_by_alignment.lower() == "true" else 1000000
            frag_rename = "%s_%s_%d" %(frag,align_problem.label,postfix)
            align_problem.fragments[frag_rename] = self.root_problem.fragments[frag]

    self.root_problem.annotations["fragments.distribution.done"] = 1
def testConfigFileMissingFile(self):
    """Options given on the command line and in the config file are both
    read when the main config file does not exist."""
    # `file` only exists on Python 2; on Python 3 open() returns an
    # io.IOBase subclass instead.
    try:
        filetypes = file
    except NameError:
        import io
        filetypes = io.IOBase

    # Reset the cached options so test cases stay independent of each other.
    config._options_singelton = None
    back = config.main_config_path
    # Disable main config path for this test
    config.main_config_path = os.path.expanduser("~/.sepp/main.config.notexistentfile")
    try:
        sys.argv = [sys.argv[0], "-c", "data/configs/test2.config",
                    "-f", "data/simulated/test.fas"]

        assert isinstance(options().config_file, filetypes) and options().config_file.name == "data/configs/test2.config", "Commandline option -c not read properly"

        assert isinstance(options().alignment_file, filetypes) and options().alignment_file.name == "data/simulated/test.small.fas", "Config file option alignment not read properly"

        assert isinstance(options().fragment_file, filetypes) and options().fragment_file.name == "data/simulated/test.fas", "Command-line option -f alignment not read properly"

        # print() with a single argument behaves the same on Python 2 and 3.
        print(options())
    finally:
        # Restore the config path even when an assertion fails.
        config.main_config_path = back
def bin_blast_results(input):
    """Collect, per marker gene, the names of the queries that hit it.

    Each line of the tab-separated file ``input`` contributes its query name
    (column 0) to the gene that the subject name (column 1) maps to in the
    ``seq2marker.tab`` file of the selected gene set.
    """
    # Map the blast results to the markers
    reference_root = options().__getattribute__('reference').path
    gene_mapping = read_mapping(
        os.path.join(reference_root,
                     'blast/%s/seq2marker.tab' % options().genes))

    genes = {}
    with open(input) as handle:
        for row in handle:
            fields = row.split('\t')
            marker = gene_mapping[fields[1]][1]
            try:
                genes[marker].append(fields[0])
            except KeyError:
                # First query seen for this marker
                genes[marker] = [fields[0]]
    return genes
def bin_blast_results(input):
    """Keep, for every query, the BLAST hit with the largest query coverage.

    ``input`` is a tab-separated BLAST result file; columns 0/1 are the query
    and subject ids and columns 6-11 are read as integers (query start/end/
    length, subject start/end/length).  Hits covering fewer than
    ``options().blast_threshold`` query positions are ignored.

    Returns ``{qseqid: {"sseqid", "gene", "qstart", "qend", "qlen",
    "sstart", "send", "slen", "qcov"}}``.
    """
    global refpkg

    # Map the blast results to the markers
    gene_mapping = read_mapping(refpkg["blast"]["seq-to-marker-map"])

    hitinfo = {}
    with open(input) as f:
        # BLAST output contains reads sorted in ascending order by bitscore
        for line in f:
            results = line.split('\t')
            qseqid = results[0]
            sseqid = results[1]
            # Unused columns: 2 pident, 3 length, 4 mismatch, 5 gapopen,
            # 12 evalue, 13 bitscore
            qstart = int(results[6])
            qend = int(results[7])
            qlen = int(results[8])
            sstart = int(results[9])
            send = int(results[10])
            slen = int(results[11])

            qcov = abs(qend - qstart) + 1
            if qcov < options().blast_threshold:
                continue

            if qseqid in hitinfo:
                # Only a strictly larger coverage replaces the stored hit.
                if hitinfo[qseqid]["qcov"] >= qcov:
                    continue
            else:
                hitinfo[qseqid] = {}

            record = hitinfo[qseqid]
            record["sseqid"] = sseqid
            record["gene"] = gene_mapping[sseqid][1]
            for field, value in (("qstart", qstart), ("qend", qend),
                                 ("qlen", qlen), ("sstart", sstart),
                                 ("send", send), ("slen", slen),
                                 ("qcov", qcov)):
                record[field] = value
    return hitinfo
def check_options(self, supply=[]):
    '''
    Verify that every required input option has been provided.

    ``supply`` lists the names of options that subclasses already found to
    be missing.  The tree, alignment and fragment files are checked here;
    if anything is missing a ValueError naming all missing options is
    raised.  Otherwise the output prefix is checked.  Overwrite if required.
    '''
    checks = (
        ("tree file", options().tree_file),
        ("alignment file", options().alignment_file),
        ("fragment file", options().fragment_file),
    )
    absent = [label for label, value in checks if value is None]
    if absent:
        # New list: the shared default argument must not be mutated.
        supply = supply + absent

    if len(supply) != 0:
        raise ValueError ("Failed to supply: %s\nRun with -h option to see a list of options." % " , ".join(supply))

    # NOTE(review): this check runs after the raise above and its result is
    # never used, so a missing info file is not reported -- confirm intent.
    if options().info_file is None:
        supply = supply + ["raxml file"]

    self.check_outputprefix()
def blast_to_markers(input, temp_dir):
    """Bin fragments to marker genes via BLAST and fix their orientation.

    With ``options().gene`` set, all fragments go to that gene.  Otherwise
    BLAST hits are binned per gene, using ``options().blast_file`` when
    given and running BLAST into ``<temp_dir>/blast.out`` when not.  For
    each gene both directions of every fragment are scored with HMMER and
    the better one is kept and written to ``<gene>.frags.fas.fixed``.

    Returns a dict ``{gene: {fragment name: oriented sequence}}``.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    if options().gene is None:
        # First blast sequences against all markers
        blast_results = temp_dir + "/blast.out"
        if options().blast_file is None:
            # print() with one argument is valid on both Python 2 and 3.
            print("Blasting fragments against marker dataset\n")
            blast_fragments(input, blast_results)
        else:
            blast_results = options().blast_file

        # Next bin the blast hits to the best gene
        gene_binning = bin_blast_results(blast_results)
    else:
        # Store a real list, not a live keys view.
        gene_binning = {options().gene: list(fragments.keys())}

    # Now figure out direction of fragments
    binned_fragments = dict(
        [(gene, dict([(seq_name, fragments[seq_name]) for seq_name in gene_binning[gene]])) for gene in gene_binning]
    )

    print("Finding best orientation of reads\n")
    align_name = "sate"
    if options().genes == "cogs":
        align_name = "pasta"

    for (gene, frags) in binned_fragments.items():
        # Add reverse complement sequence
        frags_rev = dict([(name + "_rev", reverse_sequence(seq)) for (name, seq) in frags.items()])
        gene_frags = MutableAlignment()
        gene_frags.set_alignment(frags)
        gene_frags.set_alignment(frags_rev)
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_frags, gene_file)

        # Now run HMMER search
        hmmer_search(
            gene_file,
            os.path.join(options().__getattribute__("reference").path, "refpkg/%s.refpkg/%s.hmm" % (gene, align_name)),
            temp_dir + "/%s.out" % gene,
        )
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        # Now select best direction for each frag; a missing hit counts as
        # a very low score so the other direction wins.
        for key in frags:
            forward_score = -10000
            backward_score = -10000
            if key in results:
                forward_score = results[key][1]
            if key + "_rev" in results:
                backward_score = results[key + "_rev"][1]
            if backward_score > forward_score:
                frags[key] = gene_frags[key + "_rev"]

        # Now write to file
        _write_fasta(frags, gene_file + ".fixed")
        binned_fragments[gene] = frags
    return binned_fragments
def __init__(self, **kwargs):
    """Create a 'jsonmerger' job configured from the global options.

    Raises ``Exception`` when ``options().hmmsearch.filters`` is neither
    "true" nor "false" (case-insensitive).
    """
    self.job_type = 'jsonmerger'
    ExternalSeppJob.__init__(self, self.job_type, **kwargs)

    self.out_file = None
    self.taxonomy = None
    self.mapping = None
    self.threshold = None
    self.classification_file = None

    self.elim = float(options().hmmsearch.elim)

    filters_flag = options().hmmsearch.filters.upper()
    if filters_flag == "TRUE":
        self.filters = True
    elif filters_flag == "FALSE":
        self.filters = False
    else:
        self.filters = None
        raise Exception(
            "Expecting true/false for options().hmmsearch.filters")

    self.strategy = options().exhaustive.strategy
    self.minsubsetsize = int(options().exhaustive.minsubsetsize)
    self.alignment_threshold = float(options().alignment_threshold)
    # Temp fix for now
    self.molecule = options().molecule
    self.placer = options().exhaustive.__dict__['placer'].lower()
def save_checkpoint(checkpoint_manager):
    '''
    This is the callback function that is called periodically to save the
    current state of the system.

    The checkpoint state is pickled (gzip-compressed) to a fresh temp file,
    that file is appended to the index at ``checkpoint_path``, the dump named
    on the index's previous last line is removed, and a daemon timer is
    armed to call this function again after ``options().checkpoint_interval``
    seconds.  Does nothing when checkpointing is disabled.
    '''
    # Note: this module is not bullet proof in terms of race conditions.
    # NOTE(review): the original note is incomplete; it appears to concern a
    # crash while the index line is being written -- confirm.
    if checkpoint_manager.is_checkpointing:
        # checkpoint_manager.lock.acquire()
        checkpoint_manager.saving = True
        newTmpDest = get_temp_file("dump", "checkpoints")
        _LOG.info("Checkpoint is being updated: %s" % newTmpDest)

        # Last line of the index names the previous dump (if any).
        with open(checkpoint_manager.checkpoint_path) as index_file:
            index_lines = index_file.readlines()
        oldTmpFile = index_lines[-1].split(",")[0] if index_lines else None

        checkpoint_manager.update_time()

        # The recursion limit is raised only for the duration of the dump
        # and restored even if pickling fails.
        currenlimit = sys.getrecursionlimit()
        sys.setrecursionlimit(100000)
        try:
            with gzip.GzipFile(newTmpDest, 'wb') as picklefile:
                pickle.dump(checkpoint_manager.checkpoint_state,
                            picklefile, 2)
        finally:
            sys.setrecursionlimit(currenlimit)

        with open(checkpoint_manager.checkpoint_path, "a") as index_file:
            index_file.write("%s, %s\n" % (newTmpDest, datetime.datetime.now()))

        if oldTmpFile is not None:
            os.remove(oldTmpFile)
        _LOG.info("Checkpoint Saved to: %s and linked in %s." %
                  (newTmpDest, checkpoint_manager.checkpoint_path))

        checkpoint_manager.saving = False
        # checkpoint_manager.lock.release()
        checkpoint_manager.timer = threading.Timer(
            options().checkpoint_interval, save_checkpoint,
            args=[checkpoint_manager])
        # Thread.setDaemon() is deprecated; the attribute is equivalent.
        checkpoint_manager.timer.daemon = True
        checkpoint_manager.timer.start()
def save_checkpoint(checkpoint_manager):
    '''
    Periodic callback that saves the current state of the system.

    Nothing happens unless ``checkpoint_manager.is_checkpointing`` is true.
    In that case the checkpoint state is pickled (protocol 2) into a fresh
    gzip file, a "<path>, <timestamp>" line for it is appended to the index
    file at ``checkpoint_manager.checkpoint_path``, the dump referenced by
    the former last index line is removed, and a daemon timer is started to
    run this function again after ``options().checkpoint_interval``.
    '''
    # Note: this module is not bullet proof in terms of race conditions.
    if not checkpoint_manager.is_checkpointing:
        return

    # checkpoint_manager.lock.acquire()
    checkpoint_manager.saving = True
    newTmpDest = get_temp_file("dump", "checkpoints")
    _LOG.info("Checkpoint is being updated: %s" % newTmpDest)

    # Read the index with a context manager so the handle is always closed.
    with open(checkpoint_manager.checkpoint_path) as index_file:
        previous_entries = index_file.readlines()
    if previous_entries:
        oldTmpFile = previous_entries[-1].split(",")[0]
    else:
        oldTmpFile = None
    checkpoint_manager.update_time()

    currentlimit = sys.getrecursionlimit()
    sys.setrecursionlimit(100000)
    try:
        picklefile = gzip.GzipFile(newTmpDest, 'wb')
        try:
            pickle.dump(checkpoint_manager.checkpoint_state, picklefile, 2)
        finally:
            # Close the dump even when pickling raises.
            picklefile.close()
    finally:
        # Never leave the process with the raised recursion limit.
        sys.setrecursionlimit(currentlimit)

    with open(checkpoint_manager.checkpoint_path, "a") as index_file:
        index_file.write(
            "%s, %s\n" % (newTmpDest, datetime.datetime.now()))
    if oldTmpFile is not None:
        os.remove(oldTmpFile)
    _LOG.info("Checkpoint Saved to: %s and linked in %s." % (
        newTmpDest, checkpoint_manager.checkpoint_path))
    checkpoint_manager.saving = False
    # checkpoint_manager.lock.release()

    checkpoint_manager.timer = threading.Timer(
        options().checkpoint_interval, save_checkpoint,
        args=[checkpoint_manager])
    # Thread.setDaemon() is deprecated; set the attribute instead.
    checkpoint_manager.timer.daemon = True
    checkpoint_manager.timer.start()
def testConfigFileMissingFile(self):
    """Check option parsing while the main config path points nowhere.

    The main config path is temporarily replaced with a non-existent file,
    the command line supplies ``-c`` and ``-f``, and the resulting
    config/alignment/fragment file options are verified.  The original main
    config path is restored even when an assertion fails, so a failure here
    cannot leak into other test cases.
    """
    # Just to make different test cases independent of each other.
    config._options_singelton = None
    back = config.main_config_path
    # Disable main config path for this test
    config.main_config_path = os.path.expanduser(
        "~/.sepp/main.config.notexistentfile")
    try:
        sys.argv = [
            sys.argv[0], "-c", "data/configs/test2.config",
            "-f", "data/simulated/test.fas"
        ]
        # NOTE: `file` is the Python 2 builtin file type.
        assert isinstance(options().config_file, file) and \
            options().config_file.name == "data/configs/test2.config", \
            "Commandline option -c not read properly"
        assert isinstance(options().alignment_file, file) and \
            options().alignment_file.name == \
            "data/simulated/test.small.fas", \
            "Config file option alignment not read properly"
        assert isinstance(options().fragment_file, file) and \
            options().fragment_file.name == "data/simulated/test.fas", \
            "Command-line option -f alignment not read properly"
        # Parenthesised form prints the same text on Python 2 and 3.
        print(options())
    finally:
        config.main_config_path = back
def testConfigFileMissingFile(self):
    """Verify -c, -a and -f file options after swapping the main config.

    The main config path is set to ``self.fp_config`` and the three file
    options are supplied on the command line; each parsed option must be a
    file object whose name ends with the expected data path.
    """
    # Just to make different test cases independent of each other.
    config._options_singelton = None
    # Disable main config path for this test
    config.main_config_path = self.fp_config

    cli_args = [
        "-c", get_data_path("configs/test2.config"),
        "-f", get_data_path("simulated/test.fas"),
        "-a", get_data_path("simulated/test.small.fas"),
    ]
    sys.argv = [sys.argv[0]] + cli_args

    # (option attribute, expected path suffix, failure message)
    expectations = (
        ("config_file", "data/configs/test2.config",
         "Commandline option -c not read properly"),
        ("alignment_file", "data/simulated/test.small.fas",
         "Config file option alignment not read properly"),
        ("fragment_file", "data/simulated/test.fas",
         "Command-line option -f alignment not read properly"),
    )
    for attribute, path_suffix, failure_message in expectations:
        handle = getattr(options(), attribute)
        assert (isinstance(handle, filetypes)
                and handle.name.endswith(path_suffix)), failure_message
def generate_backbone(self):
    """Build a backbone alignment and tree from the raw input sequences.

    Steps, in order:

    * read and degap the input sequences;
    * if ``median_full_length`` is set, treat sequences whose length falls
      outside ``median * (1 -/+ backbone_threshold)`` as fragments and set
      them aside (a value of -1 means "compute the median from the data");
    * randomly sample ``backbone_size`` of the remaining sequences, write
      them out, and run a PastaAlignJob on them;
    * copy the resulting alignment/tree to the output files and store open
      handles to them in ``options().alignment_file`` / ``tree_file``;
    * write everything that is not in the backbone to a query file stored in
      ``options().fragment_file``.

    If nothing is left to align after the backbone is removed, the backbone
    alignment is output as the final result and the process exits with
    status 0.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)
    sequences.degap()
    fragments = MutableAlignment()

    if options().median_full_length is not None:
        if options().median_full_length == -1:
            seq_lengths = sorted(len(seq) for seq in sequences.values())
            count = len(seq_lengths)
            mid = count // 2
            # Odd count: the middle element.  Even count: mean of the two
            # middle elements.  (The branches used to be swapped, which also
            # indexed past the end for a single sequence.)
            if count % 2:
                options().median_full_length = seq_lengths[mid]
            else:
                options().median_full_length = (
                    seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0

        min_length = int(options().median_full_length *
                         (1 - options().backbone_threshold))
        max_length = int(options().median_full_length *
                         (1 + options().backbone_threshold))
        frag_names = [
            name for name in sequences
            if len(sequences[name]) > max_length
            or len(sequences[name]) < min_length]
        if frag_names:
            _LOG.info(
                "Detected %d fragmentary sequences" % len(frag_names))
            fragments = sequences.get_hard_sub_alignment(frag_names)
            for name in list(fragments.keys()):
                sequences.pop(name)

    if options().backbone_size is None:
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))
    if options().backbone_size > len(list(sequences.keys())):
        options().backbone_size = len(list(sequences.keys()))

    # Sorting before and after sampling keeps the draw reproducible for a
    # given random seed regardless of dictionary ordering.
    sample = sorted(random.sample(
        sorted(list(sequences.keys())), options().backbone_size))
    backbone_sequences = sequences.get_hard_sub_alignment(sample)
    _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating pasta backbone alignment and tree. ")
    pastaalignJob = PastaAlignJob()
    moleculeType = options().molecule
    if options().molecule == 'amino':
        moleculeType = 'protein'
    pastaalignJob.setup(backbone, options().backbone_size,
                        moleculeType, options().cpu)
    pastaalignJob.run()
    (a_file, t_file) = pastaalignJob.read_results()

    shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
    shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

    options().placement_size = self.options.backbone_size
    # These handles are stored on the options object for later stages, so
    # they are intentionally left open here.
    options().alignment_file = open(
        self.get_output_filename("pasta.fasta"))
    options().tree_file = open(self.get_output_filename("pasta.fasttree"))
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))

    sequences.set_alignment(fragments)
    if len(sequences) == 0:
        # Nothing left to place: the backbone alignment is the final result.
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as backbone_alignment:
            sequences.read_file_object(backbone_alignment)
        self.results = ExtendedAlignment(fragment_names=[])
        self.results.set_alignment(sequences)
        _LOG.info(
            "No query sequences to align.  Final alignment saved as %s"
            % self.get_output_filename("alignment.fasta"))
        self.output_results()
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)
def testMainConfigFile(self):
    """Check that pplacer, hmmalign and hmmsearch options point at real files.

    For each tool the configured path is first overridden with a binary
    bundled in the code base, then the option is asserted to be present and
    its path to exist.
    """
    # Just to make different test cases independent of each other.
    config._options_singelton = None
    sys.argv = [sys.argv[0], "-c", get_data_path("configs/test2.config")]

    # (tool name, failure message template); templates kept verbatim.
    tools = (
        ("pplacer",
         "main config file options not read properly,"
         "or nonexistent binaries: pplacer = %s"),
        ("hmmalign",
         "main config file options not read properly, or nonexistent "
         "binaries: hmmalign = %s"),
        ("hmmsearch",
         "main config file options not read properly, or nonexistent "
         "binaries: hmmsearch = %s"),
    )
    for tool, template in tools:
        # set the tool's filepath to a file shipped with the code base
        getattr(options(), tool).path = get_data_path(
            "../../../tools/bundled/Darwin/" + tool)
        tool_options = getattr(options(), tool)
        assert (tool_options is not None
                and os.path.exists(tool_options.path)), \
            template % getattr(options(), tool).path
def build_profile(input, output_directory):
    """Bin reads to marker genes, classify each bin with TIPP, and pool results.

    Reads in ``input`` are binned with BLAST or HMMER (per ``options().bin``).
    For every gene that received reads, ``run_tipp.py`` is invoked through the
    shell; when it produces a classification file, that file is parsed with
    the configured placement threshold, written per gene, and merged into a
    pooled classification.  Finally the pooled classification and the
    abundance profile are written under ``output_directory``.

    Returns None.  Returns early (after printing a message) if no read could
    be binned.  Sets the module globals ``taxon_map``, ``level_map`` and
    ``key_map``.
    """
    global taxon_map, level_map, key_map, levels
    temp_dir = tempfile.mkdtemp(dir=options().__getattribute__('tempdir'))
    if options().bin == 'blast':
        binned_fragments = blast_to_markers(input, temp_dir)
    else:
        binned_fragments = hmmer_to_markers(input, temp_dir)

    if binned_fragments:
        print("Finished binning")
    else:
        print("Unable to bin any fragments!\n")
        return

    # load up taxonomy for 30 marker genes
    if options().genes == 'markers':
        (taxon_map, level_map, key_map) = load_taxonomy(
            os.path.join(options().reference.path,
                         'refpkg/rpsB.refpkg/all_taxon.taxonomy'))
    else:
        (taxon_map, level_map, key_map) = load_taxonomy(
            os.path.join(options().reference.path,
                         'refpkg/COG0012.refpkg/all_taxon.taxonomy'))

    # all classifications stored here
    classifications = {}
    classification_files = []

    # File-name stem used inside each reference package.
    gene_name = 'pasta' if options().genes == 'cogs' else 'sate'

    # Now run TIPP on each fragment
    for (gene, frags) in binned_fragments.items():
        reference_path = options().__getattribute__('reference').path

        def refpkg_file(suffix):
            # Path of a file inside this gene's reference package.
            return os.path.join(
                reference_path, 'refpkg/%s.refpkg/%s' % (gene, suffix))

        # Get size of each marker
        with open(refpkg_file('%s.size' % gene_name), 'r') as f:
            total_taxa = int(f.readline().strip())

        decomp_size = options().alignment_size
        if decomp_size > total_taxa:
            decomp_size = int(total_taxa / 2)

        # Never ask for more CPUs than there are fragments.
        cpus = min(options().cpu, len(frags))

        # Collect optional flags separately and join them with spaces; plain
        # concatenation used to glue "-D" and "-F" together as "-D-F".
        extra_args = []
        if options().dist is True:
            extra_args.append('-D')
        if options().max_chunk_size is not None:
            extra_args.append('-F %d' % options().max_chunk_size)
        if options().cutoff != 0:
            extra_args.append('-C %f' % options().cutoff)
        extra = ' '.join(extra_args)

        # Build the command once so the printed and executed text agree.
        cmd = (
            'run_tipp.py -c %s --cpu %s -m %s -f %s -t %s -adt %s -a %s '
            '-r %s -tx %s -txm %s -at %0.2f -pt %0.2f -A %d -P %d -p %s '
            '-o %s -d %s %s') % (
                options().config_file.name,
                cpus,
                options().molecule,
                temp_dir + "/%s.frags.fas.fixed" % gene,
                refpkg_file('%s.taxonomy' % gene_name),
                refpkg_file('%s.tree' % gene_name),
                refpkg_file('%s.fasta' % gene_name),
                refpkg_file('%s.taxonomy.RAxML_info' % gene_name),
                refpkg_file('all_taxon.taxonomy'),
                refpkg_file('species.mapping'),
                options().alignment_threshold,
                # TIPP runs with placement threshold 0; the configured
                # threshold is applied when the output is parsed below.
                0,
                decomp_size,
                total_taxa,
                temp_dir + "/temp_file",
                "tipp_%s" % gene,
                output_directory + "/markers/",
                extra)
        print('Cmd:\n' + cmd)
        os.system(cmd)

        tipp_output = (output_directory +
                       "/markers/tipp_%s_classification.txt" % gene)
        if not os.path.exists(tipp_output):
            # TIPP produced nothing for this gene; skip it.
            continue

        gene_classification = generate_classification(
            tipp_output, options().placement_threshold)
        classification_files.append(tipp_output)

        # Now write individual classification and also pool classifications
        write_classification(
            gene_classification,
            output_directory + "/markers/tipp_%s.classification" % gene)
        classifications.update(gene_classification)

    remove_unclassified_level(classifications)
    write_classification(classifications,
                         output_directory + "/markers/all.classification")
    write_abundance(classifications, output_directory)

    if options().dist is True:
        distribution(classification_files, output_directory)
def testConfigFile(self):
    """Check that command-line and config-file options are merged correctly.

    The main config path is temporarily pointed at a non-existent file so
    only the ``-c`` config file and the command line contribute.  The
    original path is restored even when an assertion fails, so a failure
    here cannot affect other test cases.
    """
    # Just to make different test cases independent of each other.
    config._options_singelton = None
    back = config.main_config_path
    # Disable main config path for this test
    config.main_config_path = os.path.expanduser(
        "~/.sepp/main.config.notexistentfile")
    try:
        sys.argv = [
            sys.argv[0], "-A", "2", "-c", "data/configs/test.config",
            "--outdir", "dir_form_commandline"
        ]
        assert options().alignment_size == 2, \
            "Commandline option -A not read properly"
        # NOTE: `file` is the Python 2 builtin file type.
        assert isinstance(options().config_file, file) and \
            options().config_file.name == "data/configs/test.config", \
            "Commandline option -c not read properly"
        assert (options().pplacer is not None and
                options().pplacer.path == "pplacer"), \
            "config file options not read properly"
        assert options().placement_size == 10, \
            "Config file option placementSize not read properly"
        assert options().outdir.endswith("dir_form_commandline"), \
            "Config file value outdir is not properly overwritten:%s " % \
            options().outdir
        assert options().tempdir is not None, \
            "Default value not properly set for tempfile attribute"
        # Parenthesised form prints the same text on Python 2 and 3.
        print(options())
    finally:
        config.main_config_path = back
def check_options(self, supply=None):
    """Add "raxml file" to the missing-input list, then defer to the parent.

    Args:
        supply: names of required inputs already found missing.  Defaults to
            a fresh empty list per call (a shared mutable default could be
            modified by the parent and leak between calls).
    """
    if supply is None:
        supply = []
    if options().info_file is None:
        # Concatenate rather than append so the caller's list is untouched.
        supply = supply + ["raxml file"]
    AbstractAlgorithm.check_options(self, supply)
def check_options(self):
    """Validate the input combination and fill in size defaults.

    Accepts either (tree + alignment + sequence file) or a sequence file
    alone; in the latter case the backbone is generated first.  Any other
    combination logs an error and exits with status -1.

    Raises:
        AssertionError: if ``backbone_size`` does not match the number of
            taxa in the backbone alignment.
    """
    options().info_file = "A_dummy_value"

    # Check to see if tree/alignment/fragment file provided, if not,
    # generate it from sequence file
    has_tree = options().tree_file is not None
    has_alignment = options().alignment_file is not None
    has_sequences = options().sequence_file is not None
    if has_tree and has_alignment and has_sequences:
        options().fragment_file = options().sequence_file
    elif not has_tree and not has_alignment and has_sequences:
        self.generate_backbone()
    else:
        _LOG.error("Either specify the backbone alignment and tree and query sequences or only the query sequences.  Any other combination is invalid")
        # Same effect as exit(-1) without relying on the `site` helper.
        raise SystemExit(-1)

    sequences = MutableAlignment()
    # Re-open by name and close the handle as soon as it has been read.
    with open(self.options.alignment_file.name) as alignment_handle:
        sequences.read_file_object(alignment_handle)
    backbone_size = sequences.get_num_taxa()
    if options().backbone_size is None:
        options().backbone_size = backbone_size
    # Explicit raise so the check is not stripped when running with -O.
    if options().backbone_size != backbone_size:
        raise AssertionError(
            "Backbone parameter needs to match actual size of backbone; backbone parameter:%s backbone_size:%s"
            % (options().backbone_size, backbone_size))
    if options().placement_size is None:
        options().placement_size = options().backbone_size
    return ExhaustiveAlgorithm.check_options(self)
def check_options(self, supply=None):
    """Report missing taxonomy inputs, then defer to ExhaustiveAlgorithm.

    Args:
        supply: names of required inputs already found missing.  Defaults to
            a fresh empty list per call (a shared mutable default could be
            modified by the parent and leak between calls).
    """
    if supply is None:
        supply = []
    # Concatenate rather than append so the caller's list is untouched.
    if options().taxonomy_file is None:
        supply = supply + ["taxonomy file"]
    if options().taxonomy_name_mapping_file is None:
        supply = supply + ["taxonomy name mapping file"]
    ExhaustiveAlgorithm.check_options(self, supply)
def build_profile(input, output_directory):
    """Bin reads to marker genes, classify each bin with TIPP, and pool results.

    ``input`` is first converted with ``to_fasta`` and binned with HMMER or
    BLAST (per ``options().bin``).  Genes with zero fragments, and genes not
    listed in ``options().gene`` when that is set, are dropped.  For every
    remaining gene ``run_tipp.py`` is run through the shell; its output is
    parsed with the configured placement threshold, written per gene, and
    merged into a pooled classification that is finally written together
    with the abundance profile.

    Returns None.  Returns early if nothing could be binned (a note is
    written to ``abundance.phylum.csv``) or if a gene's decomposition and
    placement trees differ while the placement size is below the number of
    taxa.  Exits the process if alignment and placement sizes differ while
    the placement size is below the number of taxa.
    """
    global taxon_map, level_map, key_map, levels, refpkg
    temp_dir = tempfile.mkdtemp(dir=options().__getattribute__('tempdir'))

    # New option to allow fastq files as input
    input = to_fasta(input, temp_dir)

    if options().bin == "hmmer":
        binned_fragments = hmmer_to_markers(input, temp_dir)
    else:
        binned_fragments = blast_to_markers(input, temp_dir)

    # Drop genes that received no fragments.
    for gene in refpkg["genes"]:
        try:
            if binned_fragments[gene]["nfrags"] == 0:
                del binned_fragments[gene]
        except KeyError:
            pass

    # Optionally restrict to a comma-separated list of genes.
    if options().gene is not None:
        keep = set(options().gene.split(','))
        for gene in refpkg["genes"]:
            if gene not in keep:
                try:
                    del binned_fragments[gene]
                    print("Removed reads that hit %s" % gene)
                except KeyError:
                    pass

    if binned_fragments:
        print("Finished binning")
    else:
        print("Unable to bin any fragments!\n")
        with open(output_directory + "/abundance.phylum.csv", 'w') as f:
            f.write("Unable to create an abundance profile, because"
                    " none of the input sequences mapped to the"
                    " marker gene(s).")
        return

    # Load up taxonomy for marker genes
    (taxon_map, level_map, key_map) = \
        load_taxonomy(refpkg["taxonomy"]["taxonomy"])

    # Store all classifications here
    classifications = {}
    classification_files = []

    # Run TIPP on each fragment
    for gene in binned_fragments:
        # Set placement subset size to equal the size of each marker
        with open(refpkg[gene]["size"], 'r') as f:
            total_taxa = int(f.readline().strip())

        default_subset_size = int(total_taxa * 0.10)

        # Set alignment size and placement size
        alignment_size = options().alignment_size
        placement_size = options().placement_size

        if alignment_size is None:
            if placement_size is None:
                alignment_size = default_subset_size
            else:
                alignment_size = placement_size

        if placement_size is None:
            # placement_size = max(default_subset_size, alignment_size)
            placement_size = 10000  # Needs to be large

        # Neither subset can exceed the number of taxa in the marker.
        alignment_size = min(alignment_size, total_taxa)
        placement_size = min(placement_size, total_taxa)

        if alignment_size != placement_size:
            if placement_size < total_taxa:
                sys.exit("Alignment decomposition tree can be different from"
                         " placement tree only if the placement subset size"
                         " is set to the number of taxa")

        # NOTE(review): this check is reconstructed as a sibling of the one
        # above; nested inside it, its else-branch could never run.
        trees_match = (refpkg[gene]["alignment-decomposition-tree"] ==
                       refpkg[gene]["placement-tree"])
        if not (trees_match or placement_size == total_taxa):
            print("Alignment decomposition tree can be different from"
                  " placement tree only if the placement subset size"
                  " is set to the number of taxa"
                  " (note: marker %s has %d taxa)" % (gene, total_taxa))
            return

        # Set number of CPUS
        cpus = min(options().cpu, binned_fragments[gene]["nfrags"])

        # Set extra arguments; each becomes its own space-separated token
        # (plain concatenation used to glue "-D" and "-F" into "-D-F").
        extra_args = []
        if options().dist is True:
            extra_args.append("-D")
        if options().max_chunk_size is not None:
            extra_args.append("-F %d" % options().max_chunk_size)
        if options().cutoff != 0:
            extra_args.append("-C %f" % options().cutoff)

        cmd_parts = [
            "run_tipp.py",
            "-c", tipp_config_path,
            "--cpu", "%d" % cpus,
            "-m", options().molecule,
            "-f", binned_fragments[gene]["file"],
            "-t", refpkg[gene]["placement-tree"],
            "-adt", refpkg[gene]["alignment-decomposition-tree"],
            "-a", refpkg[gene]["alignment"],
            "-r", refpkg[gene]["raxml-info-for-placement-tree"],
            "-tx", refpkg["taxonomy"]["taxonomy"],
            "-txm", refpkg[gene]["seq-to-taxid-map"],
            "-at", "%0.2f" % options().alignment_threshold,
            # TIPP runs unfiltered; the threshold is applied when parsing.
            "-pt", "0.0",
            "-A", "%d" % alignment_size,
            "-P", "%d" % placement_size,
            "-p", temp_dir + "/temp_file",
            "-o", "tipp_" + gene,
            "-d", output_directory + "/markers/",
        ] + extra_args
        cmd = " ".join(cmd_parts)

        print(cmd)
        os.system(cmd)

        tipp_output = output_directory + "/markers/tipp_" + gene \
            + "_classification.txt"

        if not os.path.exists(tipp_output):
            # TIPP produced nothing for this gene; skip it.
            continue

        classification_files.append(tipp_output)

        # Apply placement threshold to classification data (parsed once;
        # the second identical call was redundant).
        gene_classification = generate_classification(
            tipp_output, options().placement_threshold)
        gene_classification_output = output_directory \
            + "/markers/tipp_" + gene + "_classification_" \
            + str("%0.2f" % options().placement_threshold) + ".txt"
        write_classification(gene_classification,
                             gene_classification_output)

        # Pool classification
        classifications.update(gene_classification)

    remove_unclassified_level(classifications)
    write_classification(classifications,
                         output_directory + "/markers/all.classification")
    write_abundance(classifications, output_directory)

    if options().dist is True:
        distribution(classification_files, output_directory)
def generate_backbone(self):
    """Build a backbone alignment and tree from the raw input sequences.

    Steps, in order:

    * read and degap the input sequences;
    * if ``median_full_length`` or ``full_length_range`` is set, treat
      sequences whose length is outside the full-length window as fragments
      and set them aside.  The window is ``full_length_range`` (two
      whitespace-separated integers) when given, otherwise
      ``median * (1 -/+ backbone_threshold)``; a median of -1 means
      "compute it from the data";
    * randomly sample ``backbone_size`` of the remaining sequences, write
      them out, and run a PastaAlignJob on them;
    * copy the resulting alignment/tree to the output files and store open
      handles to them in ``options().alignment_file`` / ``tree_file``;
    * write everything that is not in the backbone to a query file stored in
      ``options().fragment_file``.

    If nothing is left to align after the backbone is removed, the backbone
    alignment is output as the final result and the process exits with
    status 0.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)
    sequences.degap()
    fragments = MutableAlignment()

    if (options().median_full_length is not None
            or options().full_length_range is not None):
        if options().median_full_length == -1:
            seq_lengths = sorted(len(seq) for seq in sequences.values())
            count = len(seq_lengths)
            mid = count // 2
            # Odd count: the middle element.  Even count: mean of the two
            # middle elements.  (The branches used to be swapped, which also
            # indexed past the end for a single sequence.)
            if count % 2:
                options().median_full_length = seq_lengths[mid]
            else:
                options().median_full_length = (
                    seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0

        if options().full_length_range is not None:
            bounds = sorted(
                int(x) for x in options().full_length_range.split())
            min_length = bounds[0]
            max_length = bounds[1]
        else:
            min_length = int(options().median_full_length *
                             (1 - options().backbone_threshold))
            max_length = int(options().median_full_length *
                             (1 + options().backbone_threshold))
        _LOG.info(
            "Full length sequences are set to be from %d to %d character long"
            % (min_length, max_length))

        frag_names = [
            name for name in sequences
            if len(sequences[name]) > max_length
            or len(sequences[name]) < min_length
        ]
        if frag_names:
            _LOG.info("Detected %d fragmentary sequences" % len(frag_names))
            fragments = sequences.get_hard_sub_alignment(frag_names)
            for name in list(fragments.keys()):
                sequences.pop(name)

    if options().backbone_size is None:
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))
    if options().backbone_size > len(list(sequences.keys())):
        options().backbone_size = len(list(sequences.keys()))

    # Sorting before and after sampling keeps the draw reproducible for a
    # given random seed regardless of dictionary ordering.
    sample = sorted(
        random.sample(sorted(list(sequences.keys())),
                      options().backbone_size))
    backbone_sequences = sequences.get_hard_sub_alignment(sample)
    _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating pasta backbone alignment and tree. ")
    pastaalignJob = PastaAlignJob()
    moleculeType = options().molecule
    if options().molecule == 'amino':
        moleculeType = 'protein'
    pastaalignJob.setup(backbone, options().backbone_size, moleculeType,
                        options().cpu, **vars(options().pasta))
    pastaalignJob.run()
    (a_file, t_file) = pastaalignJob.read_results()

    shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
    shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

    options().placement_size = self.options.backbone_size
    # These handles are stored on the options object for later stages, so
    # they are intentionally left open here.
    options().alignment_file = open(
        self.get_output_filename("pasta.fasta"))
    options().tree_file = open(self.get_output_filename("pasta.fasttree"))
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s" %
        (options().alignment_file, options().tree_file))

    sequences.set_alignment(fragments)
    if len(sequences) == 0:
        # Nothing left to place: the backbone alignment is the final result.
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as backbone_alignment:
            sequences.read_file_object(backbone_alignment)
        self.results = ExtendedAlignment(fragment_names=[])
        self.results.set_alignment(sequences)
        _LOG.info(
            "No query sequences to align.  Final alignment saved as %s" %
            self.get_output_filename("alignment.fasta"))
        self.output_results()
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)
def check_options(self):
    """Validate the input combination and fill in size defaults.

    Accepts either (tree + alignment + sequence file) or a sequence file
    alone; in the latter case the backbone is generated first.  Any other
    combination, or a backtranslation request for non-amino input, logs an
    error and exits with status -1.

    Raises:
        AssertionError: if ``backbone_size`` does not match the number of
            taxa in the backbone alignment.
    """
    self.check_outputprefix()
    options().info_file = "A_dummy_value"

    # Check to see if tree/alignment/fragment file provided, if not,
    # generate it from sequence file
    has_tree = options().tree_file is not None
    has_alignment = options().alignment_file is not None
    has_sequences = options().sequence_file is not None
    if has_tree and has_alignment and has_sequences:
        options().fragment_file = options().sequence_file
    elif not has_tree and not has_alignment and has_sequences:
        self.generate_backbone()
    else:
        _LOG.error(
            ("Either specify the backbone alignment and tree and query "
             "sequences or only the query sequences.  Any other "
             "combination is invalid"))
        # Same effect as exit(-1) without relying on the `site` helper.
        raise SystemExit(-1)

    sequences = MutableAlignment()
    # Re-open by name and close the handle as soon as it has been read.
    with open(self.options.alignment_file.name) as alignment_handle:
        sequences.read_file_object(alignment_handle)
    backbone_size = sequences.get_num_taxa()
    if options().backbone_size is None:
        options().backbone_size = backbone_size
    # Explicit raise so the check is not stripped when running with -O.
    if options().backbone_size != backbone_size:
        raise AssertionError(
            ("Backbone parameter needs to match actual size of backbone; "
             "backbone parameter:%s backbone_size:%s")
            % (options().backbone_size, backbone_size))
    if options().placement_size is None:
        options().placement_size = options().backbone_size

    if options().backtranslation_sequence_file and \
            options().molecule != "amino":
        _LOG.error(("Backtranslation can be performed only when "
                    "input sequences are amino acid. "))
        raise SystemExit(-1)

    return ExhaustiveAlgorithm.check_options(self)
def testMainConfigFile(self):
    """Check that the main config provides existing tool binaries.

    Each of pplacer, hmmalign and hmmsearch must be configured and its path
    must exist.  Every assertion checks, and reports, the path of the tool
    it is about (the hmmsearch check used to test hmmalign's path and all
    messages printed pplacer's path).
    """
    # Just to make different test cases independent of each other.
    config._options_singelton = None
    sys.argv = [sys.argv[0]]
    for tool in ("pplacer", "hmmalign", "hmmsearch"):
        tool_options = getattr(options(), tool)
        assert (tool_options is not None and
                os.path.exists(tool_options.path)), (
            "main config file options not read properly, "
            "or nonexistent binaries: %s = %s"
            # getattr keeps the message printable when the option is None.
            % (tool, getattr(tool_options, "path", None)))
    # Parenthesised form prints the same text on Python 2 and 3.
    print(options())
def generate_backbone(self):
    """Build a backbone alignment and tree from the raw input sequences.

    If ``median_full_length`` is set, sequences whose length falls outside
    ``median * (1 -/+ backbone_threshold)`` are set aside as fragments (a
    value of -1 means "compute the median from the data").  A random sample
    of ``backbone_size`` remaining sequences is aligned with a
    PastaAlignJob; handles to the resulting ``pasta.fasta`` and
    ``pasta.fasttree`` in the output directory are stored in
    ``options().alignment_file`` / ``tree_file``.  Everything not in the
    backbone is written to a query file stored in
    ``options().fragment_file``.

    If nothing is left to align, the backbone alignment is copied to the
    final alignment file and the process exits with status 0.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)
    fragments = MutableAlignment()

    if options().median_full_length is not None:
        if options().median_full_length == -1:
            seq_lengths = sorted(len(seq) for seq in sequences.values())
            count = len(seq_lengths)
            # Floor division keeps the index an int on Python 3 as well.
            mid = count // 2
            # Odd count: the middle element.  Even count: mean of the two
            # middle elements.  (The branches used to be swapped.)
            if count % 2:
                options().median_full_length = seq_lengths[mid]
            else:
                options().median_full_length = (
                    seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0

        min_length = int(options().median_full_length *
                         (1 - options().backbone_threshold))
        max_length = int(options().median_full_length *
                         (1 + options().backbone_threshold))
        frag_names = [name for name in sequences
                      if len(sequences[name]) > max_length
                      or len(sequences[name]) < min_length]
        if frag_names:
            fragments = sequences.get_hard_sub_alignment(frag_names)
            for name in list(fragments.keys()):
                sequences.pop(name)

    if options().backbone_size is None:
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))
    if options().backbone_size > len(sequences.keys()):
        options().backbone_size = len(sequences.keys())

    # random.sample needs a real sequence; a keys view is rejected on
    # newer Python 3 versions.
    backbone_sequences = sequences.get_hard_sub_alignment(
        random.sample(list(sequences.keys()), options().backbone_size))
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating pasta backbone alignment and tree. ")
    pastaalignJob = PastaAlignJob()
    moleculeType = options().molecule
    if options().molecule == 'amino':
        moleculeType = 'protein'
    pastaalignJob.setup(backbone, options().backbone_size,
                        self.options.outdir, moleculeType, options().cpu)
    pastaalignJob.run()
    pastaalignJob.read_results()

    options().placement_size = self.options.backbone_size
    # These handles are stored on the options object for later stages, so
    # they are intentionally left open here.
    options().alignment_file = open(self.options.outdir + "/pasta.fasta")
    options().tree_file = open(self.options.outdir + "/pasta.fasttree")
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))

    sequences.set_alignment(fragments)
    if len(sequences) == 0:
        # Nothing left to place: the backbone alignment is the final result.
        _LOG.info(
            "No query sequences to align.  Final alignment saved as %s"
            % self.get_output_filename("alignment.fasta"))
        shutil.copyfile(self.options.outdir + "/pasta.fasta",
                        self.get_output_filename("alignment.fasta"))
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)
def check_options(self):
    """Validate the input combination and fill in size defaults.

    Accepts either (tree + alignment + sequence file) or a sequence file
    alone; in the latter case the backbone is generated first.  Any other
    combination logs an error and exits with status -1.

    When no alignment subset size is given it is chosen here: 10 for amino
    acid data; otherwise 10, doubled while the doubled value stays below
    the number of taxa if the average p-distance of the backbone exceeds
    0.60.

    Raises:
        AssertionError: if ``backbone_size`` does not match the number of
            taxa in the backbone alignment.
    """
    options().info_file = "A_dummy_value"

    # Check to see if tree/alignment/fragment file provided, if not,
    # generate it from sequence file
    has_tree = options().tree_file is not None
    has_alignment = options().alignment_file is not None
    has_sequences = options().sequence_file is not None
    if has_tree and has_alignment and has_sequences:
        options().fragment_file = options().sequence_file
    elif not has_tree and not has_alignment and has_sequences:
        self.generate_backbone()
    else:
        _LOG.error(
            "Either specify the backbone alignment and tree and query sequences or only the query sequences.  Any other combination is invalid"
        )
        # Same effect as exit(-1) without relying on the `site` helper.
        raise SystemExit(-1)

    sequences = MutableAlignment()
    # Re-open by name and close the handle as soon as it has been read.
    with open(self.options.alignment_file.name) as alignment_handle:
        sequences.read_file_object(alignment_handle)
    backbone_size = sequences.get_num_taxa()
    if options().backbone_size is None:
        options().backbone_size = backbone_size
    # Explicit raise so the check is not stripped when running with -O.
    if options().backbone_size != backbone_size:
        raise AssertionError(
            "Backbone parameter needs to match actual size of backbone; backbone parameter:%s backbone_size:%s"
            % (options().backbone_size, backbone_size))
    if options().placement_size is None:
        options().placement_size = options().backbone_size

    if options().alignment_size is None:
        _LOG.info(
            "Alignment subset size not given.  Calculating subset size. ")
        if options().molecule == 'amino':
            _LOG.warning(
                "Automated alignment subset selection not implemented for protein alignment.  Setting to 10."
            )
            options().alignment_size = 10
        else:
            # The alignment is only needed on this branch, so read it here.
            alignment = MutableAlignment()
            with open(self.options.alignment_file.name) as alignment_handle:
                alignment.read_file_object(alignment_handle)
            (averagep, maxp) = alignment.get_p_distance()
            align_size = 10
            if averagep > .60:
                while align_size * 2 < alignment.get_num_taxa():
                    align_size = align_size * 2
            # "%0.2f" replaces the malformed "%f0.2", which printed the
            # full float followed by a literal "0.2".
            _LOG.info(
                "Average p-distance of backbone is %0.2f.  Alignment subset size set to %d. "
                % (averagep, align_size))
            options().alignment_size = align_size
    return ExhaustiveAlgorithm.check_options(self)
def blast_fragments(input, output):
    '''Run the configured BLAST binary on `input` and write hits to `output`.

    The database is the "alignment.fasta.db" under the reference path for
    the gene set named by options().genes.  Results are written in tabular
    format (-outfmt 6) with at most one target per query; nothing is
    returned.
    '''
    blast_binary = options().__getattribute__('blast').path
    database = os.path.join(
        options().__getattribute__('reference').path,
        "blast/%s/alignment.fasta.db" % options().genes)
    command_template = ('%s -db %s -outfmt 6 -query %s -out %s '
                        '-num_threads %d -max_target_seqs 1 ')
    command = command_template % (
        blast_binary, database, input, output, options().cpu)
    os.system(command)
def generate_backbone(self):
    """Split the input into backbone and query sets and align the backbone.

    A random sample of ``backbone_size`` sequences (default: the smaller of
    100 and 20% of the input) becomes the backbone; the rest become the
    query set.  Both are written to temporary FASTA files, a SateAlignJob is
    run on the backbone, and handles to ``sate.fasta`` / ``sate.fasttree``
    in the output directory are stored in ``options().alignment_file`` /
    ``tree_file``.  The query file path is stored in
    ``options().fragment_file``.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)

    if options().backbone_size is None:
        options().backbone_size = min(
            100, int(.20 * sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))

    # random.sample needs a real sequence; a keys view is rejected on
    # newer Python 3 versions.
    backbone_sequences = sequences.get_hard_sub_alignment(
        random.sample(list(sequences.keys()), options().backbone_size))
    # Whatever remains in `sequences` after this loop is the query set.
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing query and backbone set. ")
    query = get_temp_file("query", "backbone", ".fas")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(sequences, query)
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating sate backbone alignment and tree. ")
    satealignJob = SateAlignJob()
    moleculeType = options().molecule
    if options().molecule == 'amino':
        moleculeType = 'protein'
    satealignJob.setup(backbone, options().backbone_size,
                       self.options.outdir, moleculeType, options().cpu)
    satealignJob.run()
    satealignJob.read_results()

    options().placement_size = self.options.backbone_size
    # These handles are stored on the options object for later stages, so
    # they are intentionally left open here.
    options().alignment_file = open(self.options.outdir + "/sate.fasta")
    options().tree_file = open(self.options.outdir + "/sate.fasttree")
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s" %
        (options().alignment_file, options().tree_file))
    options().fragment_file = query
def build_profile(input, output_directory):
    """Classify the fragments in ``input`` per marker gene and pool them.

    Steps, in order:

    1. Bin the fragments to marker genes (``bin_to_markers``) inside a
       fresh temporary directory created under ``options().tempdir``.
    2. Load the taxonomy of the ``rpsB`` reference package into the
       module-level ``taxon_map``, ``level_map`` and ``key_map``.
    3. For every marker, run ``run_tipp.py`` through ``os.system`` and,
       if it produced ``tipp_<gene>_classification.txt``, write the
       per-gene classification and merge it into the pooled result.
    4. Write the pooled classification and the abundance profile under
       ``output_directory``.

    The temporary directory is not removed by this function.
    """
    global taxon_map, level_map, key_map, levels

    temp_dir = tempfile.mkdtemp(dir=options().__getattribute__('tempdir'))
    binned_fragments = bin_to_markers(input, temp_dir)

    reference_root = options().__getattribute__('reference').path

    # load up taxonomy for 30 marker genes
    taxon_map, level_map, key_map = load_taxonomy(
        reference_root + 'refpkg/rpsB.refpkg/all_taxon.taxonomy')

    # all classifications stored here
    classifications = {}

    # Now run TIPP on each fragment
    for gene, frags in binned_fragments.items():
        # Get size of each marker
        size_path = reference_root + 'refpkg/%s.refpkg/sate.size' % gene
        with open(size_path, 'r') as size_file:
            total_taxa = int(size_file.readline().strip())

        # A decomposition larger than the marker itself falls back to
        # half of the marker size.
        decomp_size = options().alignment_size
        if decomp_size > total_taxa:
            decomp_size = int(total_taxa / 2)

        # Never request more CPUs than there are fragments.
        cpus = min(options().cpu, len(frags.keys()))

        fragment_file = temp_dir + "/%s.frags.fas.fixed" % gene
        taxonomy_tree = reference_root + 'refpkg/%s.refpkg/sate.taxonomy' % gene
        alignment_tree = reference_root + 'refpkg/%s.refpkg/sate.tree' % gene
        alignment_fasta = reference_root + 'refpkg/%s.refpkg/sate.fasta' % gene
        raxml_info = reference_root + 'refpkg/%s.refpkg/sate.taxonomy.RAxML_info' % gene
        taxonomy_table = reference_root + 'refpkg/%s.refpkg/all_taxon.taxonomy' % gene
        species_mapping = reference_root + 'refpkg/%s.refpkg/species.mapping' % gene

        tipp_arguments = (
            options().config_file.name,
            cpus,
            options().molecule,
            fragment_file,
            taxonomy_tree,
            alignment_tree,
            alignment_fasta,
            raxml_info,
            taxonomy_table,
            species_mapping,
            options().alignment_threshold,
            options().placement_threshold,
            decomp_size,
            total_taxa,
            temp_dir + "/temp_file",
            "tipp_%s" % gene,
            output_directory + "/markers/",
        )
        os.system(
            'run_tipp.py -c %s --cpu %s -m %s -f %s -t %s -adt %s -a %s -r %s -tx %s -txm %s -at %0.2f -pt %0.2f -A %d -P %d -p %s -o %s -d %s'
            % tipp_arguments)

        # Markers for which TIPP produced no classification are skipped.
        classification_path = (
            output_directory + "/markers/tipp_%s_classification.txt" % gene)
        if not os.path.exists(classification_path):
            continue

        gene_classification = generate_classification(classification_path, 0)

        # Now write individual classification and also pool classifications
        write_classification(
            gene_classification,
            output_directory + "/markers/tipp_%s.classification.0" % gene)
        classifications.update(gene_classification)

    remove_unclassified_level(classifications)
    write_classification(classifications,
                         output_directory + "/markers/all.classification.0")
    write_abundance(classifications, output_directory)
def hmmer_search(input, hmmer, output):
    """Run the configured ``hmmsearch`` executable.

    ``hmmer`` and then ``input`` are passed as the positional arguments;
    the report goes to ``output`` (``-o``) with alignments suppressed
    (``--noali``), an E-value cutoff of 10000 and ``options().cpu``
    worker threads.

    The command is run through ``os.system``; its exit status is ignored
    and nothing is returned.
    """
    hmmsearch_executable = options().__getattribute__('hmmsearch').path
    thread_count = options().cpu

    command_template = '%s --noali -E 10000 --cpu %d -o %s %s %s'
    command = command_template % (
        hmmsearch_executable, thread_count, output, hmmer, input)
    os.system(command)
def build_subproblems(self):
    """Build the subproblem hierarchy and return its root.

    The backbone tree is decomposed in two stages -- first into placement
    subsets, then each placement subset into alignment subsets -- and
    every alignment subset is finally given one child per fragment chunk
    file.  The levels are labelled ``P_<p>``, ``A_<p>_<a>`` and
    ``FC_<p>_<a>_<chunk>`` respectively.

    Returns:
        ``self.root_problem`` after the hierarchy has been attached.

    Raises:
        AssertionError: if the tree/alignment have unexpected types or a
            decomposition step yields no subsets.
    """
    (alignment, tree) = self.read_alignment_and_tree()
    # Distances are only computed when the distance option is not 1.
    if options().distance != 1:
        self.compute_distances(alignment)
    assert isinstance(tree, PhylogeneticTree)
    assert isinstance(alignment, MutableAlignment)
    tree.get_tree().resolve_polytomies()
    # Label edges with numbers so that we could assemble things back
    # at the end
    tree.lable_edges()
    ''' Make sure size values are set, and are meaningful. '''
    self.check_and_set_sizes(alignment.get_num_taxa())
    self._create_root_problem(tree, alignment)
    ''' Decompose the tree based on placement subsets'''
    # The decomposition runs on a new PhylogeneticTree built from
    # Tree(tree.den_tree), not on `tree` itself.
    # NOTE(review): `/` is true division on Python 3, so minSize is a float
    # here -- confirm decompose_tree accepts a non-integer minSize.
    placement_tree_map = PhylogeneticTree(Tree(
        tree.den_tree)).decompose_tree(
            self.options.placement_size,
            strategy=self.strategy,
            minSize=self.options.placement_size /
            int(self.options.exhaustive.placementminsubsetsizefacotr),
            tree_map={},
            pdistance=1,
            decomp_strategy=self.decomp_strategy,
            distances=self.distances,
            maxDiam=None)
    assert len(placement_tree_map) > 0, (
        "Tree could not be decomposed"
        " given the following settings; strategy:%s minsubsetsize:%s"
        " placement_size:%s" %
        (self.strategy, self.minsubsetsize, self.options.placement_size))
    _LOG.info("Breaking into %d placement subsets."
              % len(placement_tree_map))
    ''' For placement subsets create a placement subproblem, and decompose further'''
    for (p_key, p_tree) in placement_tree_map.items():
        assert isinstance(p_tree, PhylogeneticTree)
        # presumably the second SeppProblem argument is the parent problem,
        # which links the new problem into the hierarchy -- TODO confirm
        placement_problem = SeppProblem(p_tree.leaf_node_names(),
                                        self.root_problem)
        placement_problem.subtree = p_tree
        placement_problem.label = "P_%s" % str(p_key)
        _LOG.debug(
            "Placement subset %s has %d nodes" %
            (placement_problem.label, len(p_tree.leaf_node_names())))
        ''' Further decompose to alignment subsets '''
        # Unlike the placement step, this call takes decomp_strategy and
        # maxDiam from self.options and pdistance from options().distance.
        alignment_tree_map = PhylogeneticTree(Tree(
            p_tree.den_tree)).decompose_tree(
                self.options.alignment_size,
                strategy=self.strategy,
                minSize=self.minsubsetsize,
                tree_map={},
                decomp_strategy=self.options.decomp_strategy,
                pdistance=options().distance,
                distances=self.distances,
                maxDiam=self.options.maxDiam)
        assert len(alignment_tree_map) > 0, (
            "Tree could not be decomposed"
            " given the following settings; strategy:%s"
            " minsubsetsize:%s alignmet_size:%s" %
            (self.strategy, self.minsubsetsize,
             self.options.alignment_size))
        _LOG.debug("Placement subset %s has %d alignment subsets: %s" %
                   (placement_problem.label, len(alignment_tree_map),
                    str(sorted(alignment_tree_map.keys()))))
        _LOG.debug("Placement subset %s has %d taxa:" %
                   (placement_problem.label, sum([
                       len(a_tree.leaf_node_names())
                       for a_tree in alignment_tree_map.values()
                   ])))
        for (a_key, a_tree) in alignment_tree_map.items():
            assert isinstance(a_tree, PhylogeneticTree)
            self.modify_tree(a_tree)
            alignment_problem = SeppProblem(a_tree.leaf_node_names(),
                                            placement_problem)
            alignment_problem.subtree = a_tree
            alignment_problem.label = "A_%s_%s" % (str(p_key), str(a_key))
    # The alignment-subset count logged here is the number of leaves of the
    # root problem at this point.
    _LOG.info("Breaking into %d alignment subsets."
              % (len(list(self.root_problem.iter_leaves()))))
    ''' Divide fragments into chunks, to help achieve better parallelism'''
    fragment_chunk_files = self.create_fragment_files()
    self.root_problem.fragment_chunks = len(fragment_chunk_files)
    # NOTE(review): new SeppProblem objects are created under each leaf
    # while iter_leaves() is still being consumed -- confirm the iterator
    # tolerates that before restructuring this loop.
    for alignment_problem in self.root_problem.iter_leaves():
        for afc in range(0, self.root_problem.fragment_chunks):
            frag_chunk_problem = SeppProblem(alignment_problem.taxa,
                                             alignment_problem)
            frag_chunk_problem.subtree = alignment_problem.subtree
            # "A_<p>_<a>" becomes "FC_<p>_<a>_<chunk index>".
            frag_chunk_problem.label = alignment_problem.label.replace(
                "A_", "FC_") + "_" + str(afc)
            frag_chunk_problem.fragments = fragment_chunk_files[afc]
    _LOG.info("Breaking each alignment subset into %d fragment chunks."
              % self.root_problem.fragment_chunks)
    _LOG.debug("Subproblem structure: %s" % str(self.root_problem))
    return self.root_problem
def blast_to_markers(input, temp_dir):
    """
    Bin the sequences in ``input`` by marker gene using their BLAST hits.

    Function based on:
    https://github.com/shahnidhi/tipp2_scripts/blob/master/get_marker_assignment.py

    The input is BLASTed against the marker database (unless
    ``options().blast_file`` already names a result file), and every
    sequence with a hit is optionally trimmed, reversed when the subject
    coordinates of its hit run backwards, and written to
    ``<temp_dir>/<gene>.frags.fas.fixed``.  One CSV row per binned
    sequence is written to ``<temp_dir>/blast-binned.out``.

    Args:
        input: path of the query FASTA file; its first line must start
            with ``>`` and must not contain spaces.
        temp_dir: directory that receives the BLAST output and the binned
            files.

    Returns:
        dict mapping every gene in ``refpkg["genes"]`` to a dict with the
        keys ``"file"`` (path), ``"fptr"`` (the file object, closed on
        return) and ``"nfrags"`` (number of sequences written).

    Raises:
        SystemExit: if the first line of ``input`` is not a FASTA header
            or contains spaces (this includes an empty file).
    """
    global refpkg

    # Handle input
    with open(input, 'r') as fp:
        line = fp.readline()
    # startswith() also covers an empty file, which would otherwise fail
    # with IndexError on line[0].
    if not line.startswith('>'):
        sys.exit("%s is not a FASTA file; please reformat for BLAST."
                 % input)
    if len(line.split(" ")) > 1:
        sys.exit("%s contains spaces; please reformat for BLAST." % input)

    # First blast sequences against all markers
    blast_results = temp_dir + "/blast.out"
    if options().blast_file is None:
        print("Blasting fragments against marker dataset\n")
        blast_fragments(input, blast_results)
    else:
        blast_results = options().blast_file

    # Next bin the blast hits to the best gene
    hitinfo = bin_blast_results(blast_results)

    binned_fragments = {}
    summary = None
    try:
        for gene in refpkg["genes"]:
            gene_file = temp_dir + '/' + gene + ".frags.fas.fixed"
            binned_fragments[gene] = {
                "file": gene_file,
                "fptr": open(gene_file, 'w'),
                "nfrags": 0,
            }

        summary = open(temp_dir + "/blast-binned.out", 'w')
        summary.write("qseqid,sseqid,marker,trim_qstart,trim_qend,qlen\n")

        for ff in fasta_iter(input):
            header = ff[0]
            seq = ff[1]

            # Sequences without a BLAST hit are not binned at all.
            try:
                hit = hitinfo[header]
                gene = hit["gene"]
            except KeyError:
                continue

            sseqid = hit["sseqid"]
            qstart = hit["qstart"]
            qend = hit["qend"]
            qlen = hit["qlen"]
            sstart = hit["sstart"]
            send = hit["send"]
            slen = hit["slen"]

            # Kept range in original query coordinates; trim_qstart is a
            # 0-based slice start, trim_qend a slice end.
            trim_qstart = 0
            trim_qend = qlen
            if not options().no_trim:
                # Unaligned overhang of the query on either side ...
                extra_qstart = qstart - 1
                extra_qend = qlen - qend
                # ... and the room left on the subject on the matching
                # side (mirrored when the subject coordinates run
                # backwards).
                if sstart < send:
                    extra_sstart = sstart - 1
                    extra_send = slen - send
                else:
                    extra_sstart = slen - sstart
                    extra_send = send - 1

                # Cut the tail BEFORE the head: trim_qend is expressed in
                # original query coordinates, so it must be applied while
                # the start of the sequence is still intact.  Doing it the
                # other way round keeps trim_qstart too many residues at
                # the end and disagrees with the range logged below.
                if extra_qend > 2 * extra_send:
                    trim_qend = qend
                    seq = seq[:trim_qend]
                if extra_qstart > 2 * extra_sstart:
                    trim_qstart = qstart - 1
                    seq = seq[trim_qstart:]

            if sstart > send:
                seq = reverse_sequence(seq)

            out = binned_fragments[gene]
            out["fptr"].write('>' + header + '\n')
            out["fptr"].write(seq + '\n')
            out["nfrags"] += 1

            summary.write(header + ',' + sseqid + ',' + gene + ','
                          + str(trim_qstart + 1) + ',' + str(trim_qend)
                          + ',' + str(qlen) + '\n')
    finally:
        # Close every handle that was opened, even if binning failed
        # part-way through.
        for entry in binned_fragments.values():
            entry["fptr"].close()
        if summary is not None:
            summary.close()

    return binned_fragments