def compute_alignments_for_PCs(self, PCs_dict):
    """Attach an alignment summary to every gene entry of multi-gene clusters.

    `PCs_dict` maps a cluster name to a list of gene entries, each of which
    is read through its 'genome_name' and 'gene_caller_id' keys. The dict is
    modified in place and the very same object is returned.

    If `self.skip_alignments` is set, a warning is issued and the input is
    returned untouched. Clusters holding a single entry are never aligned
    and therefore never receive an 'alignment_summary' key.
    """
    if self.skip_alignments:
        self.run.warning('Skipping gene alignments.')
        return PCs_dict

    # a muted Run instance that is handed to every aligner created below
    quiet_run = terminal.Run()
    quiet_run.verbose = False

    self.progress.new('Aligning genes in protein sequences')
    self.progress.update('...')

    cluster_names = list(PCs_dict.keys())
    total = len(cluster_names)

    for index, cluster_name in enumerate(cluster_names):
        # refresh the progress line only on every tenth cluster
        if index % 10 == 0:
            self.progress.update('%d of %d' % (index, total))

        members = PCs_dict[cluster_name]

        # a singleton has nothing to be aligned against
        if len(members) == 1:
            continue

        # (label, sequence) pairs, labelled as <genome_name>_<gene_caller_id>
        labelled_sequences = []
        for member in members:
            sequence = self.genomes_storage.get_gene_sequence(
                member['genome_name'], member['gene_caller_id'])
            label = '%s_%d' % (member['genome_name'], member['gene_caller_id'])
            labelled_sequences.append((label, sequence))

        if self.debug:
            self.run.info_single('Aligning sequences in PC %s' % cluster_name,
                                 nl_before=2, nl_after=1)
            print(json.dumps(labelled_sequences, indent=2))

        alignments = self.aligner(run=quiet_run).run_stdin(labelled_sequences)

        # the aligner output is looked up with the same labels built above
        for member in members:
            label = '%s_%d' % (member['genome_name'], member['gene_caller_id'])
            member['alignment_summary'] = utils.summarize_alignment(alignments[label])

    self.progress.end()

    return PCs_dict
def alignment_worker(input_queue, output_queue, gene_clusters_dict, genomes_storage, align_with):
    """Serve alignment requests from `input_queue` in an endless loop.

    Every item taken from `input_queue` is a gene cluster name. For clusters
    with a single entry `None` is put on `output_queue`. Otherwise the result
    is a dict with the cluster name under 'name' and, under 'entry', a deep
    copy of the cluster's gene entries, each extended with an
    'alignment_summary'.

    Note for future changes: this worker should not write anything to
    `gene_clusters_dict` or `genomes_storage`, changes will not be reflected
    to the main process or other processes.
    """
    aligner = aligners.select(align_with, quiet=True)

    silent_run = terminal.Run()
    silent_run.verbose = False

    # the loop never exits on its own: the main process needs to kill this
    # worker after it receives all tasks
    while True:
        cluster_name = input_queue.get(True)
        cluster_entries = gene_clusters_dict[cluster_name]

        if len(cluster_entries) == 1:
            # a singleton does not need alignment, but the main process
            # still gets one item back for it
            output_queue.put(None)
            continue

        # (label, sequence) pairs, labelled as <genome_name>_<gene_caller_id>
        named_sequences = []
        for entry in cluster_entries:
            sequence = genomes_storage.get_gene_sequence(entry['genome_name'],
                                                         entry['gene_caller_id'])
            named_sequences.append(
                ('%s_%d' % (entry['genome_name'], entry['gene_caller_id']), sequence))

        alignments = aligner(run=silent_run).run_stdin(named_sequences)

        # summaries go onto a private copy, never onto the shared input
        result = {'name': cluster_name,
                  'entry': copy.deepcopy(cluster_entries)}

        for entry_copy in result['entry']:
            label = '%s_%d' % (entry_copy['genome_name'], entry_copy['gene_caller_id'])
            entry_copy['alignment_summary'] = utils.summarize_alignment(alignments[label])

        output_queue.put(result)
def compute_alignments_for_PCs(self, protein_clusters_dict):
    """Align the genes of each protein cluster and store a summary per gene.

    `protein_clusters_dict` maps a cluster name to a list of gene entries
    (each read through 'genome_name' and 'gene_caller_id'). Entries of
    clusters with more than one member get an 'alignment_summary' key; the
    dict is changed in place and returned.

    With `self.skip_alignments` set, only a warning is issued and the dict
    is returned as it was given.
    """
    if self.skip_alignments:
        self.run.warning('Skipping gene alignments.')
        return protein_clusters_dict

    def label_of(entry):
        # identifier shared by the aligner input and the lookup of its output
        return '%s_%d' % (entry['genome_name'], entry['gene_caller_id'])

    # one muted Run and one Muscle instance serve all clusters
    muted_run = terminal.Run()
    muted_run.verbose = False
    muscle = Muscle(run=muted_run)

    self.progress.new('Aligning genes in protein sequences')
    self.progress.update('...')

    names = list(protein_clusters_dict.keys())
    how_many = len(names)

    for position, name in enumerate(names):
        # report progress on every tenth cluster only
        if position % 10 == 0:
            self.progress.update('%d of %d' % (position, how_many))

        entries = protein_clusters_dict[name]

        if len(entries) == 1:
            # this sequence is a singleton and does not need alignment
            continue

        to_align = []
        for entry in entries:
            sequence = self.genomes_storage.get_gene_sequence(entry['genome_name'],
                                                              entry['gene_caller_id'])
            to_align.append((label_of(entry), sequence))

        aligned = muscle.run_muscle_stdin(to_align)

        for entry in entries:
            entry['alignment_summary'] = utils.summarize_alignment(aligned[label_of(entry)])

    self.progress.end()

    return protein_clusters_dict
def alignment_worker(input_queue, output_queue, gene_clusters_dict, genomes_storage, align_with, run):
    """Serve alignment requests from `input_queue` in an endless loop.

    Every item taken from `input_queue` is a gene cluster name. For clusters
    with a single entry `None` is put on `output_queue`. Otherwise the result
    is a dict with the cluster name under 'name' and, under 'entry', a deep
    copy of the cluster's gene entries, each extended with an
    'alignment_summary'.

    If the aligner raises, the raw, unaligned sequences are summarized as if
    they were aligned and a warning is sent through `run`, so a single
    failing cluster does not throw away the whole analysis. When
    `anvio.DEBUG` is set, the sequences of the failing cluster are also
    written to a temporary FASTA file, which is not deleted afterwards.

    Parameters:
        input_queue: queue that delivers gene cluster names.
        output_queue: queue that receives one item per processed name.
        gene_clusters_dict: cluster name -> list of gene entries, each read
            through its 'genome_name' and 'gene_caller_id' keys.
        genomes_storage: object providing `get_gene_sequence`.
        align_with: passed to `aligners.select` to pick the aligner.
        run: Run instance the worker reports its own warnings through.

    Note for future changes: this worker should not write anything to
    `gene_clusters_dict` or `genomes_storage`, changes will not be reflected
    to the main process or other processes.
    """
    aligner = aligners.select(align_with, quiet=True)

    # this instance of Run is here because we don't want to create this over
    # and over again in the loop down below. it is the muted one handed to
    # the aligner; `run` is the one the worker uses for its own messages.
    r = terminal.Run()
    r.verbose = False

    # the loop never exits on its own: the main process needs to kill this
    # worker after it receives all tasks
    while True:
        gene_cluster_name = input_queue.get(True)
        gene_cluster = gene_clusters_dict[gene_cluster_name]

        if len(gene_cluster) == 1:
            # this sequence is a singleton and does not need alignment
            output_queue.put(None)
            continue

        # (label, sequence) pairs, labelled as <genome_name>_<gene_caller_id>
        gene_sequences_in_gene_cluster = []
        for gene_entry in gene_cluster:
            sequence = genomes_storage.get_gene_sequence(gene_entry['genome_name'],
                                                         gene_entry['gene_caller_id'])
            gene_sequences_in_gene_cluster.append(
                ('%s_%d' % (gene_entry['genome_name'], gene_entry['gene_caller_id']), sequence))

        # sometimes alignments fail, and because pangenomic analyses can take
        # forever, everything goes into the trash bin. to prevent that, the
        # failure is turned into a loud warning. only `Exception` is caught:
        # a bare `except` would also swallow KeyboardInterrupt / SystemExit
        # and report an interrupted worker as a failed alignment.
        try:
            alignments = aligner(run=r).run_stdin(gene_sequences_in_gene_cluster)
        except Exception:
            # report the raw, unaligned sequences for this gene cluster as if
            # they were aligned so things continue working operationally. it
            # is on the user to go through their results carefully.
            alignments = dict(gene_sequences_in_gene_cluster)

            if anvio.DEBUG:
                temp_file_path = filesnpaths.get_temp_file_path(prefix='ANVIO_GC_%s' % (gene_cluster_name))
                with open(temp_file_path, 'w') as fasta:
                    for defline, sequence in gene_sequences_in_gene_cluster:
                        fasta.write('>%s\n%s\n' % (defline, sequence))

                debug_info = "The %d sequences in gene cluster %s are stored in the temporary file '%s'" % \
                                    (len(gene_sequences_in_gene_cluster), gene_cluster_name, temp_file_path)
            else:
                debug_info = ("If you re-run your last command with a `--debug` flag, anvi'o will generate more "
                              "information for you about the contenets of this gene cluster (but if you are seeing "
                              "millions of these warnings, it may not be a good idea since with the `--debug` flag "
                              "anvi'o will generate a FASTA file in a temporary directory with the contents of the "
                              "gene cluster, and will not attempt to delete them later).")

            run.warning(("VERY BAD NEWS. The alignment of seqeunces with '%s' in the gene cluster '%s' failed "
                         "for some reason. Since the real answer to 'why' is too deep in the matrix, there is "
                         "no reliable solution for anvi'o to find it for you, BUT THIS WILL AFFECT YOUR SCIENCE "
                         "GOING FORWARD, SO YOU SHOULD CONSIDER ADDRESSING THIS ISSUE FIRST. %s")
                        % (aligner.__name__, gene_cluster_name, debug_info), nl_before=1)

        # summaries go onto a private copy, never onto the shared input
        output = {'name': gene_cluster_name,
                  'entry': copy.deepcopy(gene_cluster)}

        for gene_entry in output['entry']:
            gene_entry['alignment_summary'] = utils.summarize_alignment(
                alignments['%s_%d' % (gene_entry['genome_name'], gene_entry['gene_caller_id'])])

        output_queue.put(output)