def hmmer_to_markers(input, temp_dir):
    """Bin fragments to reference-package genes by best HMMER bit score.

    Each fragment is written in two orientations (the original sequence and
    the output of ``reverse_sequence``, stored under ``<name>_rev``) to
    ``<temp_dir>/frags.fas``. That file is searched against the HMM of every
    gene in ``refpkg["genes"]``; a fragment is assigned to the gene and
    orientation that gave the highest bit score. One FASTA file per gene is
    written to ``<temp_dir>/<gene>.frags.fas.fixed``.

    Args:
        input: Path handed to ``MutableAlignment.read_filepath``.
        temp_dir: Directory receiving the intermediate and output files.

    Returns:
        dict mapping each gene with at least one assigned fragment to
        ``{"file": <path of the .fixed file>, "nfrags": <fragment count>}``.
        Fragments without any hit are left out.
    """
    rev_suffix = '_rev'

    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = {name + rev_suffix: reverse_sequence(seq)
               for (name, seq) in fragments.items()}
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best hit seen so far per fragment: [bitscore, gene, direction].
    frag_scores = {name: [-10000, 'NA', 'NA'] for name in fragments.keys()}
    for gene in refpkg["genes"]:
        # Run the HMMER search for this gene.
        hmmer_output = temp_dir + '/' + gene + ".out"
        hmmer_search(frag_file, refpkg[gene]["hmm"], hmmer_output)
        results = read_hmmsearch_results(hmmer_output)

        # Keep the best gene/direction for each fragment.
        for name, value in results.items():
            bitscore = value[1]
            # Only a *trailing* suffix that maps back to a known fragment
            # marks a reversed copy; a "_rev" elsewhere in the name is part
            # of the fragment's own identifier and must not be stripped.
            stripped = name[:-len(rev_suffix)]
            if name.endswith(rev_suffix) and stripped in frag_scores:
                true_name, direction = stripped, 'reverse'
            else:
                true_name, direction = name, 'forward'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Group the fragments by assigned gene, oriented as in their best hit.
    genes = {}
    for name, (_, gene, direction) in frag_scores.items():
        if gene == 'NA':
            # No hit for this fragment: it is not reported.
            continue
        seq = fragments[name]
        if direction != 'forward':
            seq = reverse_sequence(seq)
        genes.setdefault(gene, {})[name] = seq

    binned_fragments = {}
    for gene, seqs in genes.items():
        gene_file = temp_dir + '/' + gene + ".frags.fas.fixed"
        _write_fasta(seqs, gene_file)
        binned_fragments[gene] = {"file": gene_file, "nfrags": len(seqs)}
    return binned_fragments
def blast_to_markers(input, temp_dir):
    """Bin fragments to marker genes and orient each one with HMMER.

    Fragments are grouped by gene: either all of them under
    ``options().gene`` when that option is set, or according to
    ``bin_blast_results`` applied to a BLAST result file (the file named by
    ``options().blast_file`` or, when unset, one produced by
    ``blast_fragments``). For every gene, both orientations of its fragments
    are searched against the gene's HMM and the orientation with the higher
    score is kept.

    Files written per gene: ``<temp_dir>/<gene>.frags.fas`` (both
    orientations), ``<temp_dir>/<gene>.out`` (search output) and
    ``<temp_dir>/<gene>.frags.fas.fixed`` (chosen orientations).

    Args:
        input: Path of the fragment file.
        temp_dir: Directory for intermediate and output files.

    Returns:
        dict mapping gene -> {fragment name: sequence in chosen orientation}.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    if options().gene is not None:
        # A single gene was requested: every fragment belongs to it.
        gene_binning = {options().gene: list(fragments.keys())}
    else:
        blast_results = temp_dir + "/blast.out"
        if options().blast_file is not None:
            blast_results = options().blast_file
        else:
            print("Blasting fragments against marker dataset\n")
            blast_fragments(input, blast_results)
        # Assign each fragment to its best gene.
        gene_binning = bin_blast_results(blast_results)

    binned_fragments = {
        gene: {seq_name: fragments[seq_name]
               for seq_name in gene_binning[gene]}
        for gene in gene_binning
    }

    print("Finding best orientation of reads\n")
    align_name = 'pasta' if options().genes == 'cogs' else 'sate'

    for gene, frags in binned_fragments.items():
        # Search file holds every fragment plus its reversed copy.
        both_ways = MutableAlignment()
        both_ways.set_alignment(frags)
        both_ways.set_alignment(
            {name + '_rev': reverse_sequence(seq)
             for (name, seq) in frags.items()})
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(both_ways, gene_file)

        hmmer_search(
            gene_file,
            os.path.join(
                options().__getattribute__('reference').path,
                'refpkg/%s.refpkg/%s.hmm' % (gene, align_name)),
            temp_dir + "/%s.out" % gene)
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        # Replace a fragment by its reversed copy when that scored higher.
        for key in frags:
            rev_key = key + "_rev"
            forward_score = results[key][1] if key in results else -10000
            backward_score = (results[rev_key][1]
                              if rev_key in results else -10000)
            if backward_score > forward_score:
                frags[key] = both_ways[rev_key]

        _write_fasta(frags, gene_file + ".fixed")
        binned_fragments[gene] = frags
    return binned_fragments
def blast_to_markers(input, temp_dir):
    """Group fragments by marker gene, then fix their orientation.

    Grouping comes from ``options().gene`` (all fragments go to that gene)
    or, when it is unset, from ``bin_blast_results`` run on either
    ``options().blast_file`` or a fresh ``blast_fragments`` output at
    ``<temp_dir>/blast.out``. Orientation is decided per fragment by
    comparing the HMMER scores of the fragment and of its
    ``reverse_sequence`` copy against the gene's HMM.

    Args:
        input: Path of the fragment file.
        temp_dir: Directory in which ``<gene>.frags.fas``, ``<gene>.out``
            and ``<gene>.frags.fas.fixed`` are written.

    Returns:
        dict: gene -> dict of fragment name -> sequence, each sequence in
        the orientation that scored best.
    """
    no_hit = -10000

    def _score(hits, hit_name):
        # Score stored for ``hit_name``, or the sentinel when it had no hit.
        if hit_name in hits:
            return hits[hit_name][1]
        return no_hit

    fragments = MutableAlignment()
    fragments.read_filepath(input)

    if options().gene is None:
        # BLAST the fragments against all markers unless results were given.
        blast_results = temp_dir + "/blast.out"
        if options().blast_file is None:
            print("Blasting fragments against marker dataset\n")
            blast_fragments(input, blast_results)
        else:
            blast_results = options().blast_file
        gene_binning = bin_blast_results(blast_results)
    else:
        gene_binning = {options().gene: list(fragments.keys())}

    binned_fragments = {}
    for gene in gene_binning:
        members = {}
        for seq_name in gene_binning[gene]:
            members[seq_name] = fragments[seq_name]
        binned_fragments[gene] = members

    print("Finding best orientation of reads\n")
    if options().genes == 'cogs':
        align_name = 'pasta'
    else:
        align_name = 'sate'

    for gene, frags in binned_fragments.items():
        reversed_frags = {}
        for name, seq in frags.items():
            reversed_frags[name + '_rev'] = reverse_sequence(seq)

        gene_frags = MutableAlignment()
        gene_frags.set_alignment(frags)
        gene_frags.set_alignment(reversed_frags)
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_frags, gene_file)

        # Search both orientations against this gene's HMM.
        hmmer_search(
            gene_file,
            os.path.join(options().__getattribute__('reference').path,
                         'refpkg/%s.refpkg/%s.hmm' % (gene, align_name)),
            temp_dir + "/%s.out" % gene)
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        # Keep whichever orientation scored strictly higher.
        for key in frags:
            if _score(results, key + "_rev") > _score(results, key):
                frags[key] = gene_frags[key + "_rev"]

        _write_fasta(frags, gene_file + ".fixed")
        binned_fragments[gene] = frags
    return binned_fragments
def hmmer_to_markers(input, temp_dir):
    """Bin fragments to marker genes by best HMMER bit score.

    Each fragment is written in two orientations (the original sequence and
    the output of ``reverse_sequence``, stored under ``<name>_rev``) to
    ``<temp_dir>/frags.fas``. That file is searched against the profile of
    every gene in ``marker_genes`` (or ``cog_genes`` when
    ``options().genes == 'cogs'``); a fragment is assigned to the gene and
    orientation that gave the highest bit score. One FASTA file per gene is
    written to ``<temp_dir>/<gene>.frags.fas.fixed``.

    Args:
        input: Path handed to ``MutableAlignment.read_filepath``.
        temp_dir: Directory receiving the intermediate and output files.

    Returns:
        dict mapping gene -> {fragment name: sequence in chosen orientation}.
        Fragments without any hit are left out.
    """
    rev_suffix = '_rev'

    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = {name + rev_suffix: reverse_sequence(seq)
               for (name, seq) in fragments.items()}
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best hit seen so far per fragment: [bitscore, gene, direction].
    frag_scores = {name: [-10000, 'NA', 'NA'] for name in fragments.keys()}
    gene_set = marker_genes
    align_name = 'sate'
    if options().genes == 'cogs':
        gene_set = cog_genes
        align_name = 'pasta'

    for gene in gene_set:
        # Now run HMMER search
        hmmer_output = temp_dir + "/%s.out" % gene
        # The previous format string used "%.profile", which is an invalid
        # conversion and raised ValueError before any search could run.
        # NOTE(review): "<align_name>.profile" is inferred from the two
        # format arguments - confirm the file name used in the refpkg.
        profile = os.path.join(
            options().__getattribute__('reference').path,
            'refpkg/%s.refpkg/%s.profile' % (gene, align_name))
        hmmer_search(frag_file, profile, hmmer_output)
        results = read_hmmsearch_results(hmmer_output)

        # Now select best direction for each frag
        for name, value in results.items():
            bitscore = value[1]
            # Only a *trailing* suffix that maps back to a known fragment
            # marks a reversed copy; a "_rev" elsewhere in the name is part
            # of the fragment's own identifier and must not be stripped.
            stripped = name[:-len(rev_suffix)]
            if name.endswith(rev_suffix) and stripped in frag_scores:
                true_name, direction = stripped, 'reverse'
            else:
                true_name, direction = name, 'forward'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = {}
    for name, (_, gene, direction) in frag_scores.items():
        if gene == 'NA':
            # No hit for this fragment: it is not reported.
            continue
        seq = fragments[name]
        if direction != 'forward':
            seq = reverse_sequence(seq)
        genes.setdefault(gene, {})[name] = seq

    for gene, seqs in genes.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(seqs, gene_file + ".fixed")
    return genes
def hmmer_to_markers(input, temp_dir):
    """Assign every fragment to its best-scoring marker gene.

    Fragments and their ``reverse_sequence`` copies (named ``<name>_rev``)
    are written to ``<temp_dir>/frags.fas`` and searched with HMMER against
    each gene of ``marker_genes``, or of ``cog_genes`` when
    ``options().genes == 'cogs'``. The gene/orientation pair with the
    highest bit score wins. The winning sequences are written, one file per
    gene, to ``<temp_dir>/<gene>.frags.fas.fixed``.

    Args:
        input: Path handed to ``MutableAlignment.read_filepath``.
        temp_dir: Directory receiving the intermediate and output files.

    Returns:
        dict mapping gene -> {fragment name: sequence in chosen orientation}.
        Fragments without any hit are left out.
    """
    rev_suffix = '_rev'

    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = {name + rev_suffix: reverse_sequence(seq)
               for (name, seq) in fragments.items()}
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    #Best hit seen so far per fragment: [bitscore, gene, direction]
    frag_scores = {name: [-10000, 'NA', 'NA'] for name in fragments.keys()}
    gene_set = marker_genes
    align_name = 'sate'
    if options().genes == 'cogs':
        gene_set = cog_genes
        align_name = 'pasta'

    for gene in gene_set:
        #Now run HMMER search
        hmmer_output = temp_dir + "/%s.out" % gene
        #"%.profile" was an invalid conversion and raised ValueError on the
        #first gene; both arguments are now consumed by "%s".
        #NOTE(review): the "<align_name>.profile" file name is inferred from
        #the two format arguments - confirm against the refpkg layout.
        profile = os.path.join(
            options().__getattribute__('reference').path,
            'refpkg/%s.refpkg/%s.profile' % (gene, align_name))
        hmmer_search(frag_file, profile, hmmer_output)
        results = read_hmmsearch_results(hmmer_output)

        #Now select best direction for each frag
        for name, value in results.items():
            bitscore = value[1]
            #A reversed copy is recognised only by a trailing suffix whose
            #removal yields a known fragment, so names that merely contain
            #"_rev" are no longer mangled.
            stripped = name[:-len(rev_suffix)]
            if name.endswith(rev_suffix) and stripped in frag_scores:
                true_name, direction = stripped, 'reverse'
            else:
                true_name, direction = name, 'forward'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    #Now bin the fragments
    genes = {}
    for name, (_, gene, direction) in frag_scores.items():
        if gene == 'NA':
            #No hit for this fragment: it is not reported
            continue
        seq = fragments[name]
        if direction != 'forward':
            seq = reverse_sequence(seq)
        genes.setdefault(gene, {})[name] = seq

    for gene, seqs in genes.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(seqs, gene_file + ".fixed")
    return genes
def hmmer_to_markers(input, temp_dir):
    """Bin fragments to marker genes using HMMER bit scores.

    Writes every fragment and its ``reverse_sequence`` copy (keyed
    ``<name>_rev``) to ``<temp_dir>/frags.fas``, searches that file against
    the profile of each gene in ``marker_genes`` (``cog_genes`` when
    ``options().genes == "cogs"``) and keeps, per fragment, the gene and
    orientation with the highest bit score. The selected sequences are
    written per gene to ``<temp_dir>/<gene>.frags.fas.fixed``.

    Args:
        input: Path handed to ``MutableAlignment.read_filepath``.
        temp_dir: Directory receiving the intermediate and output files.

    Returns:
        dict mapping gene -> {fragment name: sequence in chosen orientation}.
        Fragments without any hit are left out.
    """
    rev_suffix = "_rev"

    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = {
        name + rev_suffix: reverse_sequence(seq)
        for (name, seq) in fragments.items()
    }
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best hit seen so far per fragment: [bitscore, gene, direction].
    frag_scores = {name: [-10000, "NA", "NA"] for name in fragments.keys()}
    gene_set = marker_genes
    align_name = "sate"
    if options().genes == "cogs":
        gene_set = cog_genes
        align_name = "pasta"

    for gene in gene_set:
        # Now run HMMER search
        hmmer_output = temp_dir + "/%s.out" % gene
        # "%.profile" is not a valid conversion specifier and made this
        # expression raise ValueError; "%s.profile" consumes align_name.
        # NOTE(review): the resulting "<align_name>.profile" file name is
        # inferred from the format arguments - confirm it exists in refpkg.
        profile = os.path.join(
            options().__getattribute__("reference").path,
            "refpkg/%s.refpkg/%s.profile" % (gene, align_name),
        )
        hmmer_search(frag_file, profile, hmmer_output)
        results = read_hmmsearch_results(hmmer_output)

        # Now select best direction for each frag
        for name, value in results.items():
            bitscore = value[1]
            # Strip the suffix only when it is trailing and the remainder is
            # a known fragment; "_rev" inside a name belongs to the name.
            stripped = name[: -len(rev_suffix)]
            if name.endswith(rev_suffix) and stripped in frag_scores:
                true_name, direction = stripped, "reverse"
            else:
                true_name, direction = name, "forward"
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = {}
    for name, (_, gene, direction) in frag_scores.items():
        if gene == "NA":
            # No hit for this fragment: it is not reported.
            continue
        seq = fragments[name]
        if direction != "forward":
            seq = reverse_sequence(seq)
        genes.setdefault(gene, {})[name] = seq

    for gene, seqs in genes.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(seqs, gene_file + ".fixed")
    return genes
def generate_backbone(self):
    """Pick a backbone set, align it with PASTA, and stage the queries.

    Steps, as performed below:

    1. Read the input sequences from ``self.options.sequence_file``.
    2. If ``options().median_full_length`` is set, move every sequence whose
       length falls outside ``median * (1 +/- backbone_threshold)`` into a
       fragment set. A value of ``-1`` means "compute the median from the
       input".
    3. Randomly sample ``options().backbone_size`` of the remaining
       sequences (default: ``min(1000, number of sequences)``), write them
       to a temp file and run ``PastaAlignJob`` on them.
    4. Point ``options().alignment_file`` / ``options().tree_file`` at
       ``pasta.fasta`` / ``pasta.fasttree`` in ``self.options.outdir``.
    5. Write all non-backbone sequences (fragments included) to a temp file
       stored in ``options().fragment_file``. If there are none, copy the
       backbone alignment to the final output name and exit the process
       with status 0.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)

    fragments = MutableAlignment()
    if options().median_full_length is not None:
        if options().median_full_length == -1:
            seq_lengths = sorted(len(seq) for seq in sequences.values())
            count = len(seq_lengths)
            # Floor division keeps the index an int on Python 3 as well.
            mid = count // 2
            if count % 2:
                # Odd count: the median is the single middle element.
                options().median_full_length = seq_lengths[mid]
            else:
                # Even count: average the two middle elements.
                options().median_full_length = (
                    seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0

        min_length = int(options().median_full_length
                         * (1 - options().backbone_threshold))
        max_length = int(options().median_full_length
                         * (1 + options().backbone_threshold))
        frag_names = [
            name for name in sequences
            if len(sequences[name]) > max_length
            or len(sequences[name]) < min_length
        ]
        if frag_names:
            fragments = sequences.get_hard_sub_alignment(frag_names)
            for name in fragments.keys():
                sequences.pop(name)

    if options().backbone_size is None:
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))
    if options().backbone_size > len(sequences.keys()):
        options().backbone_size = len(sequences.keys())

    # random.sample needs a real sequence; a dict keys view is rejected by
    # recent Python versions.
    backbone_sequences = sequences.get_hard_sub_alignment(
        random.sample(list(sequences.keys()), options().backbone_size))
    for name in backbone_sequences.keys():
        sequences.pop(name)

    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating pasta backbone alignment and tree. ")
    pastaalignJob = PastaAlignJob()
    moleculeType = options().molecule
    if options().molecule == 'amino':
        moleculeType = 'protein'
    pastaalignJob.setup(backbone, options().backbone_size,
                        self.options.outdir, moleculeType, options().cpu)
    pastaalignJob.run()
    pastaalignJob.read_results()

    options().placement_size = self.options.backbone_size
    # These handles are stored on the options object for later use, so they
    # are intentionally left open here.
    options().alignment_file = open(self.options.outdir + "/pasta.fasta")
    options().tree_file = open(self.options.outdir + "/pasta.fasttree")
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))

    # Everything that is not in the backbone becomes a query sequence.
    sequences.set_alignment(fragments)
    if len(sequences) == 0:
        _LOG.info(
            "No query sequences to align.  Final alignment saved as %s"
            % self.get_output_filename("alignment.fasta"))
        shutil.copyfile(self.options.outdir + "/pasta.fasta",
                        self.get_output_filename("alignment.fasta"))
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)
def generate_backbone(self):
    """Pick a backbone set, align it with PASTA, and stage the queries.

    Steps, as performed below:

    1. Read the input sequences from ``self.options.sequence_file`` and
       remove gaps.
    2. If ``options().median_full_length`` or ``options().full_length_range``
       is set, move every sequence whose length falls outside the accepted
       range into a fragment set. The range is either the two integers in
       ``full_length_range`` or ``median * (1 +/- backbone_threshold)``; a
       median of ``-1`` means "compute it from the input".
    3. Randomly sample ``options().backbone_size`` of the remaining
       sequences (default: ``min(1000, number of sequences)``), write them
       to a temp file and run ``PastaAlignJob`` on them.
    4. Copy the PASTA alignment and tree to the output file names
       ``pasta.fasta`` / ``pasta.fasttree`` and point
       ``options().alignment_file`` / ``options().tree_file`` at them.
    5. Write all non-backbone sequences (fragments included) to a temp file
       stored in ``options().fragment_file``. If there are none, output the
       backbone alignment as the final result and exit the process with
       status 0.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)
    sequences.degap()

    fragments = MutableAlignment()
    if (options().median_full_length is not None
            or options().full_length_range is not None):
        if options().median_full_length == -1:
            seq_lengths = sorted(len(seq) for seq in sequences.values())
            count = len(seq_lengths)
            mid = count // 2
            if count % 2:
                # Odd count: the median is the single middle element. The
                # previous code read seq_lengths[mid + 1] here, which is out
                # of range for a single input sequence.
                options().median_full_length = seq_lengths[mid]
            else:
                # Even count: average the two middle elements.
                options().median_full_length = (
                    seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0

        if options().full_length_range is not None:
            bounds = sorted(
                int(x) for x in options().full_length_range.split())
            min_length = bounds[0]
            max_length = bounds[1]
        else:
            min_length = int(options().median_full_length
                             * (1 - options().backbone_threshold))
            max_length = int(options().median_full_length
                             * (1 + options().backbone_threshold))
        _LOG.info(
            "Full length sequences are set to be from %d to %d character long"
            % (min_length, max_length))

        frag_names = [
            name for name in sequences
            if len(sequences[name]) > max_length
            or len(sequences[name]) < min_length
        ]
        if frag_names:
            _LOG.info("Detected %d fragmentary sequences" % len(frag_names))
            fragments = sequences.get_hard_sub_alignment(frag_names)
            for name in list(fragments.keys()):
                sequences.pop(name)

    if options().backbone_size is None:
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))
    if options().backbone_size > len(sequences.keys()):
        options().backbone_size = len(sequences.keys())

    # Sampling from a sorted list makes the draw depend only on the random
    # state, not on the key order of the alignment.
    sample = sorted(
        random.sample(sorted(sequences.keys()), options().backbone_size))
    backbone_sequences = sequences.get_hard_sub_alignment(sample)
    _LOG.debug("Backbone: %s" % (sorted(backbone_sequences.keys())))
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating pasta backbone alignment and tree. ")
    pastaalignJob = PastaAlignJob()
    moleculeType = options().molecule
    if options().molecule == 'amino':
        moleculeType = 'protein'
    pastaalignJob.setup(backbone, options().backbone_size, moleculeType,
                        options().cpu, **vars(options().pasta))
    pastaalignJob.run()
    (a_file, t_file) = pastaalignJob.read_results()

    shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
    shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

    options().placement_size = self.options.backbone_size
    # These handles are stored on the options object for later use, so they
    # are intentionally left open here.
    options().alignment_file = open(
        self.get_output_filename("pasta.fasta"))
    options().tree_file = open(self.get_output_filename("pasta.fasttree"))
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))

    # Everything that is not in the backbone becomes a query sequence.
    sequences.set_alignment(fragments)
    if len(sequences) == 0:
        sequences = MutableAlignment()
        # Close the temporary handle once the alignment has been read.
        # NOTE(review): assumes read_file_object consumes the handle before
        # returning - confirm against MutableAlignment.
        with open(self.options.alignment_file.name) as alignment_handle:
            sequences.read_file_object(alignment_handle)
        self.results = ExtendedAlignment(fragment_names=[])
        self.results.set_alignment(sequences)
        _LOG.info(
            "No query sequences to align.  Final alignment saved as %s"
            % self.get_output_filename("alignment.fasta"))
        self.output_results()
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)