def _workerThread(self, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
        gene_file = queueIn.get(block=True, timeout=None)
        if gene_file is None:
            break

        genome_dir, filename = os.path.split(gene_file)
        output_hit_file = os.path.join(
            genome_dir,
            filename.replace(self.protein_file_suffix, self.pfam_suffix))
        cmd = 'pfam_search.pl -outfile %s -cpu %d -fasta %s -dir %s' % (
            output_hit_file, self.cpus_per_genome, gene_file, self.pfam_hmm_dir)
        os.system(cmd)

        # calculate checksum
        checksum = sha256(output_hit_file)
        fout = open(output_hit_file + self.checksum_suffix, 'w')
        fout.write(checksum)
        fout.close()

        # identify top hit for each gene
        self._topHit(output_hit_file)

        queueOut.put(gene_file)
def __workerThread(self, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
        gene_file = queueIn.get(block=True, timeout=None)
        if gene_file is None:
            break

        assembly_dir, filename = os.path.split(gene_file)
        running_file = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_ext, '_pfam.running'))
        if not os.path.exists(running_file):
            # running file acts as a lock so concurrent workers skip this genome
            fout = open(running_file, 'w')
            fout.write('running')
            fout.close()

            output_hit_file = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_ext, '_pfam.tsv'))
            if not os.path.exists(output_hit_file):
                cmd = 'pfam_search.pl -outfile %s -cpu 1 -fasta %s -dir %s' % (
                    output_hit_file, gene_file, self.pfam_hmm_dir)
                os.system(cmd)

                # calculate checksum
                checksum = sha256(output_hit_file)
                fout = open(output_hit_file + '.sha256', 'w')
                fout.write(checksum)
                fout.close()

        if os.path.exists(running_file):
            os.remove(running_file)

        queueOut.put(gene_file)
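# The sha256() helper used throughout these workers is not defined in this
# section. A minimal sketch is given below, assuming it hashes a file's
# contents and returns the hex digest (in the style of biolib's utilities);
# the signature and block size are assumptions, not the canonical code.
import hashlib

def sha256(input_file, block_size=65536):
    """Compute the hex SHA256 checksum of a file's contents."""
    h = hashlib.sha256()
    with open(input_file, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            h.update(block)
    return h.hexdigest()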
def _workerThread(self, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
        gene_file = queueIn.get(block=True, timeout=None)
        if gene_file is None:
            break

        assembly_dir, filename = os.path.split(gene_file)
        output_hit_file = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_suffix, self.tigrfam_suffix))
        hmmsearch_out = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_suffix, '_tigrfam.out'))
        cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu %d %s %s' % (
            hmmsearch_out, output_hit_file, self.cpus_per_genome,
            self.tigrfam_hmms, gene_file)
        os.system(cmd)

        # calculate checksum
        checksum = sha256(output_hit_file)
        fout = open(output_hit_file + self.checksum_suffix, 'w')
        fout.write(checksum)
        fout.close()

        # identify top hit for each gene
        self._topHit(output_hit_file)

        # allow results to be processed or written to file
        queueOut.put(gene_file)
def _tigr_top_hit(self, tigrfam_file, tigrfam_tophit_file):
    """Identify top TIGRfam hits."""
    tophits = {}
    for line in open(tigrfam_file):
        if line[0] == '#' or line[0] == '[':
            continue

        line_split = line.split()
        gene_id = line_split[0]
        hmm_id = line_split[3]
        evalue = float(line_split[4])
        bitscore = float(line_split[5])
        if gene_id in tophits:
            if bitscore > tophits[gene_id][2]:
                tophits[gene_id] = (hmm_id, evalue, bitscore)
        else:
            tophits[gene_id] = (hmm_id, evalue, bitscore)

    fout = open(tigrfam_tophit_file, 'w')
    fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
    for gene_id, stats in tophits.iteritems():
        hit_str = ','.join(map(str, stats))
        fout.write('%s\t%s\n' % (gene_id, hit_str))
    fout.close()

    # calculate checksum
    checksum = sha256(tigrfam_tophit_file)
    fout = open(tigrfam_tophit_file + '.sha256', 'w')
    fout.write(checksum)
    fout.close()
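# Columns parsed above follow the HMMER3 per-target table (--tblout) format:
# field 0 is the target (gene) name, field 3 the query (TIGRFAM) accession,
# field 4 the full-sequence e-value and field 5 the full-sequence bit score.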
def __tigr_worker(self, queue_in, queue_out):
    """Process each data item in parallel."""
    while True:
        gene_file = queue_in.get(block=True, timeout=None)
        if gene_file is None:
            break

        assembly_dir, filename = os.path.split(gene_file)
        output_hit_file = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_ext, '_tigrfam.tsv'))
        hmmsearch_out = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_ext, '_tigrfam.out'))
        cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu 1 %s %s' % (
            hmmsearch_out, output_hit_file, self.tigrfam_hmms, gene_file)
        os.system(cmd)

        # calculate checksum
        checksum = sha256(output_hit_file)
        fout = open(output_hit_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()

        # determine top hits
        tigrfam_tophit_file = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_ext, '_tigrfam_tophit.tsv'))
        self._tigr_top_hit(output_hit_file, tigrfam_tophit_file)

        # allow results to be processed or written to file
        queue_out.put(gene_file)
def __workerThread(self, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
        gene_file = queueIn.get(block=True, timeout=None)
        if gene_file is None:
            break

        assembly_dir, filename = os.path.split(gene_file)
        running_file = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_ext, 'ko.running'))
        if not os.path.exists(running_file):
            fout = open(running_file, 'w')
            fout.write('running')
            fout.close()

            output_hit_file = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_ext, '_ko.tsv'))
            if not os.path.exists(output_hit_file):
                # report only the top hit per gene at an e-value of at most 1e-3
                cmd = 'diamond blastp -p %d -d %s -q %s -k 1 -e 1e-3 -f %s -o %s' % (
                    1, self.uniprot_ko_db, gene_file,
                    '6 qseqid qlen sseqid stitle slen length pident evalue bitscore',
                    output_hit_file)
                os.system(cmd)

                # calculate checksum
                checksum = sha256(output_hit_file)
                fout = open(output_hit_file + '.sha256', 'w')
                fout.write(checksum)
                fout.close()

        if os.path.exists(running_file):
            os.remove(running_file)

        queueOut.put(gene_file)
def __workerThread(self, domain, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
        genome_file = queueIn.get(block=True, timeout=None)
        if genome_file is None:
            break

        assembly_dir, filename = os.path.split(genome_file)
        prefix = filename.replace(self.genome_file_ext, '')
        output_dir = os.path.join(assembly_dir, 'prokka')
        if os.path.exists(output_dir):
            queueOut.put(genome_file)
            continue
        os.makedirs(output_dir)

        prokka_out = os.path.join(output_dir, 'prokka.out')
        cmd = 'prokka --force --kingdom %s --prefix %s --outdir %s --cpus 1 %s 2> %s' % (
            domain, prefix, output_dir, genome_file, prokka_out)
        os.system(cmd)

        # calculate checksum
        prokka_gene_file = os.path.join(output_dir, prefix + '.faa')
        checksum = sha256(prokka_gene_file)
        fout = open(prokka_gene_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()

        # allow results to be processed or written to file
        queueOut.put(genome_file)
def _workerThread(self, queueIn, queueOut):
    """Process each data item in parallel."""
    try:
        while True:
            gene_file = queueIn.get(block=True, timeout=None)
            if gene_file is None:
                break

            genome_dir, filename = os.path.split(gene_file)
            genome_id = filename.replace(self.protein_file_suffix, '')
            output_hit_file = os.path.join(
                self.output_dir, genome_id,
                filename.replace(self.protein_file_suffix, self.pfam_suffix))

            dir_path = os.path.dirname(os.path.realpath(__file__))
            pfam_search_script = os.path.join(dir_path, 'pfam_search.pl')
            cmd = '%s -outfile %s -cpu %d -fasta %s -dir %s' % (
                pfam_search_script, output_hit_file, self.cpus_per_genome,
                gene_file, self.pfam_hmm_dir)
            exit_code = os.system(cmd)
            if exit_code != 0:
                # os.system() returns a nonzero wait status on any failure
                raise RuntimeError("Pfam_search has crashed")

            # calculate checksum
            checksum = sha256(output_hit_file)
            fout = open(output_hit_file + self.checksum_suffix, 'w')
            fout.write(checksum)
            fout.close()

            # identify top hit for each gene
            self._topHit(output_hit_file)

            queueOut.put(gene_file)
    except Exception as error:
        raise error
def __workerThread(self, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
        gene_file = queueIn.get(block=True, timeout=None)
        if gene_file is None:
            break

        assembly_dir, filename = os.path.split(gene_file)
        running_file = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_ext, '_tigrfam.running'))
        if not os.path.exists(running_file):
            fout = open(running_file, 'w')
            fout.write('running')
            fout.close()

            output_hit_file = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_ext, '_tigrfam.tsv'))
            hmmsearch_out = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_ext, '_tigrfam.out'))
            cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu 1 %s %s' % (
                hmmsearch_out, output_hit_file, self.tigrfam_hmms, gene_file)
            os.system(cmd)

            # calculate checksum
            checksum = sha256(output_hit_file)
            fout = open(output_hit_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()

        if os.path.exists(running_file):
            os.remove(running_file)

        # allow results to be processed or written to file
        queueOut.put(gene_file)
def __pfam_worker(self, queue_in, queue_out):
    """Process each data item in parallel."""
    while True:
        gene_file = queue_in.get(block=True, timeout=None)
        if gene_file is None:
            break

        assembly_dir, filename = os.path.split(gene_file)
        output_hit_file = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_ext, '_pfam.tsv'))
        cmd = 'pfam_search.pl -outfile %s -cpu 1 -fasta %s -dir %s' % (
            output_hit_file, gene_file, self.pfam_hmm_dir)
        os.system(cmd)

        # calculate checksum
        checksum = sha256(output_hit_file)
        fout = open(output_hit_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()

        # determine top hits
        pfam_tophit_file = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_ext, '_pfam_tophit.tsv'))
        self._pfam_top_hit(output_hit_file, pfam_tophit_file)

        # allow results to be processed or written to file
        queue_out.put(gene_file)
def __pfam_worker(self, queue_in, queue_out):
    """Process each data item in parallel."""
    pfam_version = 'pfam_33.1'
    pfam_extension = f'_{pfam_version}.tsv'
    pfam_tophit_extension = f'_{pfam_version}_tophit.tsv'
    symlink_pfam_extension = '_pfam.tsv'
    symlink_pfam_tophit_extension = '_pfam_tophit.tsv'

    while True:
        gene_file = queue_in.get(block=True, timeout=None)
        if gene_file is None:
            break

        assembly_dir, filename = os.path.split(gene_file)
        make_sure_path_exists(os.path.join(assembly_dir, pfam_version))
        output_hit_file = os.path.join(
            assembly_dir, pfam_version,
            filename.replace(self.protein_file_ext, pfam_extension))
        cmd = 'pfam_search.pl -outfile %s -cpu 1 -fasta %s -dir %s' % (
            output_hit_file, gene_file, self.pfam_hmm_dir)
        os.system(cmd)

        # calculate checksum
        checksum = sha256(output_hit_file)
        fout = open(output_hit_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()

        # determine top hits
        pfam_tophit_file = os.path.join(
            assembly_dir, pfam_version,
            filename.replace(self.protein_file_ext, pfam_tophit_extension))
        self._pfam_top_hit(output_hit_file, pfam_tophit_file)

        # create symlinks in the prodigal folder pointing at the versioned files
        new_hit_link = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_ext, symlink_pfam_extension))
        new_tophit_link = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_ext, symlink_pfam_tophit_extension))
        os.symlink(output_hit_file, new_hit_link)
        os.symlink(pfam_tophit_file, new_tophit_link)

        # allow results to be processed or written to file
        queue_out.put(gene_file)
def moveGenomes(self, db_genome_ids):
    """Move genome files into database directory structure.

    This function assumes addGenomes() has been called. It is
    not directly called by addGenomes() as all database queries
    are performed before moving genomes.

    Parameters
    ----------
    db_genome_ids : list
        Unique database identifiers for genomes.
    """
    assert self.tmp_output_dir

    # get database genome identifiers
    self.cur.execute("SELECT genomes.id, user_editable, external_id_prefix || '_' || id_at_source as external_id "
                     "FROM genomes, genome_sources "
                     "WHERE genome_source_id = genome_sources.id "
                     "AND genomes.id in %s", (tuple(db_genome_ids),))

    external_id_dict = {}
    for (genome_id, user_editable, external_id) in self.cur:
        if user_editable:
            external_id_dict[genome_id] = external_id

    if len(external_id_dict.keys()) > 0:
        username = None
        if self.currentUser.isRootUser():
            username = self.currentUser.getElevatedFromUsername()
        else:
            username = self.currentUser.getUsername()

        if username is None:
            raise GenomeDatabaseError(
                "Unable to determine user to add genomes under.")

        gtdb_target_dir = os.path.join(self.genomeCopyDir, username)
        for db_genome_id, external_id in external_id_dict.items():
            tmp_genome_dir = os.path.join(self.tmp_output_dir, external_id)
            genome_target_dir = os.path.join(gtdb_target_dir, external_id)
            if os.path.exists(genome_target_dir):
                raise GenomeDatabaseError(
                    "Genome directory already exists: %s" % genome_target_dir)

            shutil.move(tmp_genome_dir, genome_target_dir)

            self.cur.execute("UPDATE genomes SET fasta_file_location = %s , genes_file_location = %s , genes_file_sha256 = %s WHERE id = %s",
                             (os.path.join(username, external_id, external_id + self.genomeFileSuffix),
                              os.path.join(username, external_id, self.userAnnotationDir, external_id + self.proteinFileSuffix),
                              sha256(os.path.join(genome_target_dir, self.userAnnotationDir, external_id + self.proteinFileSuffix)),
                              db_genome_id))

    shutil.rmtree(self.tmp_output_dir)
def run(self, genome_dir, threads):
    # get path to all unprocessed TIGRfam HMM result files
    print 'Reading TIGRfam HMM files.'
    tigrfam_files = []
    for genome_id in os.listdir(genome_dir):
        cur_genome_dir = os.path.join(genome_dir, genome_id)
        if os.path.isdir(cur_genome_dir):
            for assembly_id in os.listdir(cur_genome_dir):
                assembly_dir = os.path.join(cur_genome_dir, assembly_id)

                groups = assembly_id.split('_')
                processed_assembly_id = '_'.join(groups[:2])

                tigrfam_tophit_file = os.path.join(
                    assembly_dir, 'prodigal',
                    processed_assembly_id + '_tigrfam_tophit.tsv')
                if os.path.exists(tigrfam_tophit_file):
                    # verify checksum
                    checksum_file = tigrfam_tophit_file + '.sha256'
                    if os.path.exists(checksum_file):
                        checksum = sha256(tigrfam_tophit_file)
                        cur_checksum = open(checksum_file).readline().strip()
                        if checksum == cur_checksum:
                            continue

                tigrfam_file = os.path.join(
                    assembly_dir, 'prodigal',
                    processed_assembly_id + self.tigrfam_ext)
                if os.path.exists(tigrfam_file):
                    tigrfam_files.append(tigrfam_file)

    print ' Number of unprocessed genomes: %d' % len(tigrfam_files)

    # populate worker queue with data to process
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for f in tigrfam_files:
        workerQueue.put(f)

    for _ in range(threads):
        workerQueue.put(None)

    try:
        workerProc = [mp.Process(target=self.__workerThread, args=(workerQueue, writerQueue))
                      for _ in range(threads)]
        writeProc = mp.Process(target=self.__writerThread,
                               args=(len(tigrfam_files), writerQueue))

        writeProc.start()
        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()
    except:
        for p in workerProc:
            p.terminate()
        writeProc.terminate()
def run(self, genome_dir, threads):
    # get path to all unprocessed Pfam HMM result files
    print 'Reading Pfam HMM files.'
    pfam_files = []
    for genome_id in os.listdir(genome_dir):
        cur_genome_dir = os.path.join(genome_dir, genome_id)
        if os.path.isdir(cur_genome_dir):
            for assembly_id in os.listdir(cur_genome_dir):
                assembly_dir = os.path.join(cur_genome_dir, assembly_id)

                groups = assembly_id.split('_')
                processed_assembly_id = '_'.join(groups[:2])

                pfam_tophit_file = os.path.join(
                    assembly_dir, 'prodigal',
                    processed_assembly_id + '_pfam_tophit.tsv')
                if os.path.exists(pfam_tophit_file):
                    # verify checksum
                    checksum_file = pfam_tophit_file + '.sha256'
                    if os.path.exists(checksum_file):
                        checksum = sha256(pfam_tophit_file)
                        cur_checksum = open(checksum_file).readline().strip()
                        if checksum == cur_checksum:
                            continue

                pfam_file = os.path.join(
                    assembly_dir, 'prodigal',
                    processed_assembly_id + self.pfam_ext)
                if os.path.exists(pfam_file):
                    pfam_files.append(pfam_file)

    print ' Number of unprocessed genomes: %d' % len(pfam_files)

    # populate worker queue with data to process
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for f in pfam_files:
        workerQueue.put(f)

    for _ in range(threads):
        workerQueue.put(None)

    try:
        workerProc = [mp.Process(target=self.__workerThread, args=(workerQueue, writerQueue))
                      for _ in range(threads)]
        writeProc = mp.Process(target=self.__writerThread,
                               args=(len(pfam_files), writerQueue))

        writeProc.start()
        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()
    except:
        for p in workerProc:
            p.terminate()
        writeProc.terminate()
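# The __writerThread referenced by these run() methods is not shown in this
# section. A minimal sketch, assuming it only reports progress as workers
# finish and stops on the None sentinel; the exact message format is an
# assumption (requires `import sys`).
def __writerThread(self, num_items, writer_queue):
    """Report progress as worker processes finish items."""
    processed = 0
    while True:
        item = writer_queue.get(block=True, timeout=None)
        if item is None:
            break

        processed += 1
        status = ' Finished processing %d of %d (%.2f%%) items.' % (
            processed, num_items, processed * 100.0 / num_items)
        sys.stdout.write('%s\r' % status)
        sys.stdout.flush()
    sys.stdout.write('\n')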
def _topHit(self, pfam_file):
    """Determine top hits to PFAMs.

    A gene may be assigned to multiple PFAM families from the
    same clan. The search_pfam.pl script takes care of most of
    these issues and here the results are simply parsed.

    Parameters
    ----------
    pfam_file : str
        Name of file containing hits to PFAM HMMs.
    """
    assembly_dir, filename = os.path.split(pfam_file)
    genome_id = filename.replace(self.pfam_suffix, '')
    output_tophit_file = os.path.join(
        self.output_dir, genome_id,
        filename.replace(self.pfam_suffix, self.pfam_top_hit_suffix))

    tophits = defaultdict(dict)
    for line in open(pfam_file):
        if line[0] == '#' or not line.strip():
            continue

        line_split = line.split()
        gene_id = line_split[0]
        hmm_id = line_split[5]
        evalue = float(line_split[12])
        bitscore = float(line_split[11])
        if gene_id in tophits:
            if hmm_id in tophits[gene_id]:
                if bitscore > tophits[gene_id][hmm_id][1]:
                    tophits[gene_id][hmm_id] = (evalue, bitscore)
            else:
                tophits[gene_id][hmm_id] = (evalue, bitscore)
        else:
            tophits[gene_id][hmm_id] = (evalue, bitscore)

    fout = open(output_tophit_file, 'w')
    fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
    for gene_id, hits in tophits.iteritems():
        hit_str = []
        for hmm_id, stats in hits.iteritems():
            hit_str.append(hmm_id + ',' + ','.join(map(str, stats)))
        fout.write('%s\t%s\n' % (gene_id, ';'.join(hit_str)))
    fout.close()

    # calculate checksum
    checksum = sha256(output_tophit_file)
    fout = open(output_tophit_file + self.checksum_suffix, 'w')
    fout.write(checksum)
    fout.close()
def _topHit(self, pfam_file):
    """Determine top hits to PFAMs.

    A gene may be assigned to multiple PFAM families from the
    same clan. The search_pfam.pl script takes care of most of
    these issues and here the results are simply parsed.

    Parameters
    ----------
    pfam_file : str
        Name of file containing hits to PFAM HMMs.
    """
    assembly_dir, filename = os.path.split(pfam_file)
    output_tophit_file = os.path.join(
        assembly_dir,
        filename.replace(self.pfam_suffix, self.pfam_top_hit_suffix))

    tophits = defaultdict(dict)
    for line in open(pfam_file):
        if line[0] == '#' or not line.strip():
            continue

        line_split = line.split()
        gene_id = line_split[0]
        hmm_id = line_split[5]
        evalue = float(line_split[12])
        bitscore = float(line_split[11])
        if gene_id in tophits:
            if hmm_id in tophits[gene_id]:
                if bitscore > tophits[gene_id][hmm_id][1]:
                    tophits[gene_id][hmm_id] = (evalue, bitscore)
            else:
                tophits[gene_id][hmm_id] = (evalue, bitscore)
        else:
            tophits[gene_id][hmm_id] = (evalue, bitscore)

    fout = open(output_tophit_file, 'w')
    fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
    for gene_id, hits in tophits.iteritems():
        hit_str = []
        for hmm_id, stats in hits.iteritems():
            hit_str.append(hmm_id + ',' + ','.join(map(str, stats)))
        fout.write('%s\t%s\n' % (gene_id, ';'.join(hit_str)))
    fout.close()

    # calculate checksum
    checksum = sha256(output_tophit_file)
    fout = open(output_tophit_file + self.checksum_suffix, 'w')
    fout.write(checksum)
    fout.close()
def _runProdigal(self, fasta_path):
    """Run Prodigal.

    Parameters
    ----------
    fasta_path : str
        Path to FASTA file to process.
    """
    temp_dir, fasta_file = os.path.split(fasta_path)
    output_dir = os.path.join(temp_dir, self.userAnnotationDir)

    genome_id = fasta_file[0:fasta_file.rfind('_')]

    prodigal = BioLibProdigal(1, False)
    summary_stats = prodigal.run([fasta_path], output_dir)
    summary_stats = summary_stats[summary_stats.keys()[0]]

    # rename output files to adhere to GTDB conventions
    aa_gene_file = os.path.join(
        output_dir, genome_id + ConfigMetadata.PROTEIN_FILE_SUFFIX)
    shutil.move(summary_stats.aa_gene_file, aa_gene_file)

    nt_gene_file = os.path.join(
        output_dir, genome_id + ConfigMetadata.NT_GENE_FILE_SUFFIX)
    shutil.move(summary_stats.nt_gene_file, nt_gene_file)

    gff_file = os.path.join(
        output_dir, genome_id + ConfigMetadata.GFF_FILE_SUFFIX)
    shutil.move(summary_stats.gff_file, gff_file)

    # save translation table information
    translation_table_file = os.path.join(
        output_dir, 'prodigal_translation_table.tsv')
    fout = open(translation_table_file, 'w')
    fout.write('%s\t%d\n' % ('best_translation_table',
                             summary_stats.best_translation_table))
    fout.write('%s\t%.2f\n' % ('coding_density_4',
                               summary_stats.coding_density_4 * 100))
    fout.write('%s\t%.2f\n' % ('coding_density_11',
                               summary_stats.coding_density_11 * 100))
    fout.close()

    checksum = sha256(aa_gene_file)
    fout = open(aa_gene_file + ConfigMetadata.CHECKSUM_SUFFIX, 'w')
    fout.write(checksum)
    fout.close()

    return (aa_gene_file, nt_gene_file, gff_file, translation_table_file)
def __workerThread(self, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
        pfam_file = queueIn.get(block=True, timeout=None)
        if pfam_file is None:
            break

        assembly_dir, filename = os.path.split(pfam_file)
        output_tophit_file = os.path.join(
            assembly_dir, filename.replace(self.pfam_ext, '_pfam_tophit.tsv'))

        tophits = defaultdict(dict)
        for line in open(pfam_file):
            if line[0] == '#' or not line.strip():
                continue

            line_split = line.split()
            gene_id = line_split[0]
            hmm_id = line_split[5]
            evalue = float(line_split[12])
            bitscore = float(line_split[11])
            if gene_id in tophits:
                if hmm_id in tophits[gene_id]:
                    if bitscore > tophits[gene_id][hmm_id][1]:
                        tophits[gene_id][hmm_id] = (evalue, bitscore)
                else:
                    tophits[gene_id][hmm_id] = (evalue, bitscore)
            else:
                tophits[gene_id][hmm_id] = (evalue, bitscore)

        fout = open(output_tophit_file, 'w')
        fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
        for gene_id, hits in tophits.iteritems():
            hit_str = []
            for hmm_id, stats in hits.iteritems():
                hit_str.append(hmm_id + ',' + ','.join(map(str, stats)))
            fout.write('%s\t%s\n' % (gene_id, ';'.join(hit_str)))
        fout.close()

        # calculate checksum
        checksum = sha256(output_tophit_file)
        fout = open(output_tophit_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()

        # allow results to be processed or written to file
        queueOut.put(pfam_file)
def _topHit(self, tigrfam_file):
    """Determine top hits to TIGRFAMs.

    A gene is assigned to a single TIGRFAM family. This will be
    the top hit among all TIGRFAM HMMs and pass the threshold
    for the HMM.

    Parameters
    ----------
    tigrfam_file : str
        Name of file containing hits to TIGRFAM HMMs.
    """
    assembly_dir, filename = os.path.split(tigrfam_file)
    genome_id = filename.replace(self.tigrfam_suffix, '')
    output_tophit_file = os.path.join(
        self.output_dir, genome_id,
        filename.replace(self.tigrfam_suffix, self.tigrfam_top_hit_suffix))

    tophits = {}
    for line in open(tigrfam_file):
        if line[0] == '#':
            continue

        line_split = line.split()
        gene_id = line_split[0]
        hmm_id = line_split[3]
        evalue = float(line_split[4])
        bitscore = float(line_split[5])
        if gene_id in tophits:
            if bitscore > tophits[gene_id][2]:
                tophits[gene_id] = (hmm_id, evalue, bitscore)
        else:
            tophits[gene_id] = (hmm_id, evalue, bitscore)

    fout = open(output_tophit_file, 'w')
    fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
    for gene_id, stats in tophits.iteritems():
        hit_str = ','.join(map(str, stats))
        fout.write('%s\t%s\n' % (gene_id, hit_str))
    fout.close()

    # calculate checksum
    checksum = sha256(output_tophit_file)
    fout = open(output_tophit_file + self.checksum_suffix, 'w')
    fout.write(checksum)
    fout.close()
def _topHit(self, tigrfam_file):
    """Determine top hits to TIGRFAMs.

    A gene is assigned to a single TIGRFAM family. This will be
    the top hit among all TIGRFAM HMMs and pass the threshold
    for the HMM.

    Parameters
    ----------
    tigrfam_file : str
        Name of file containing hits to TIGRFAM HMMs.
    """
    assembly_dir, filename = os.path.split(tigrfam_file)
    output_tophit_file = os.path.join(
        assembly_dir,
        filename.replace(self.tigrfam_suffix, self.tigrfam_top_hit_suffix))

    tophits = {}
    for line in open(tigrfam_file):
        if line[0] == '#':
            continue

        line_split = line.split()
        gene_id = line_split[0]
        hmm_id = line_split[3]
        evalue = float(line_split[4])
        bitscore = float(line_split[5])
        if gene_id in tophits:
            if bitscore > tophits[gene_id][2]:
                tophits[gene_id] = (hmm_id, evalue, bitscore)
        else:
            tophits[gene_id] = (hmm_id, evalue, bitscore)

    fout = open(output_tophit_file, 'w')
    fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
    for gene_id, stats in tophits.iteritems():
        hit_str = ','.join(map(str, stats))
        fout.write('%s\t%s\n' % (gene_id, hit_str))
    fout.close()

    # calculate checksum
    checksum = sha256(output_tophit_file)
    fout = open(output_tophit_file + self.checksum_suffix, 'w')
    fout.write(checksum)
    fout.close()
def __workerThread(self, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
        genome_file = queueIn.get(block=True, timeout=None)
        if genome_file is None:
            break

        assembly_dir, filename = os.path.split(genome_file)
        trna_dir = os.path.join(assembly_dir, 'trna')
        genome_id = filename[0:filename.find('_', 4)]

        if not os.path.exists(trna_dir):
            os.makedirs(trna_dir)

        output_file = os.path.join(trna_dir, genome_id + '_trna.tsv')
        log_file = os.path.join(trna_dir, genome_id + '_trna.log')
        stats_file = os.path.join(trna_dir, genome_id + '_trna_stats.tsv')

        # use the archaeal covariance models where appropriate
        domain_flag = '-B'
        if self.domain_dict.get(genome_id) == 'Archaea':
            domain_flag = '-A'

        cmd_to_run = ['tRNAscan-SE', domain_flag, '-q', '-Q',
                      '-o', output_file, '-m', stats_file, '-l', log_file,
                      genome_file]
        proc = subprocess.Popen(cmd_to_run,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        if proc.returncode != 0:
            raise RuntimeError(
                "%r failed, status code %s stdout %r stderr %r" %
                (cmd_to_run, proc.returncode, stdout, stderr))

        checksum_file = open(output_file + '.sha256', 'w')
        checksum_file.write('{}\n'.format(sha256(output_file)))
        checksum_file.close()

        queueOut.put(genome_file)
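# Unlike the os.system() calls elsewhere in this section, the tRNAscan worker
# above checks the child's return code explicitly. A small helper in the same
# spirit; the name and signature are illustrative, not part of the original:
import subprocess

def run_checked(cmd_to_run):
    """Run a command, raising with captured output if it fails."""
    proc = subprocess.Popen(cmd_to_run,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError('%r failed, status code %s stdout %r stderr %r' % (
            cmd_to_run, proc.returncode, stdout, stderr))
    return stdout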
def _pfam_top_hit(self, pfam_file, pfam_tophit_file):
    """Identify top Pfam hits."""
    tophits = defaultdict(dict)
    for line in open(pfam_file):
        if line[0] == '#' or not line.strip():
            continue

        line_split = line.split()
        gene_id = line_split[0]
        hmm_id = line_split[5]
        evalue = float(line_split[12])
        bitscore = float(line_split[11])
        if gene_id in tophits:
            if hmm_id in tophits[gene_id]:
                if bitscore > tophits[gene_id][hmm_id][1]:
                    tophits[gene_id][hmm_id] = (evalue, bitscore)
            else:
                tophits[gene_id][hmm_id] = (evalue, bitscore)
        else:
            tophits[gene_id][hmm_id] = (evalue, bitscore)

    fout = open(pfam_tophit_file, 'w')
    fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
    for gene_id, hits in tophits.iteritems():
        hit_str = []
        for hmm_id, stats in hits.iteritems():
            hit_str.append(hmm_id + ',' + ','.join(map(str, stats)))
        fout.write('%s\t%s\n' % (gene_id, ';'.join(hit_str)))
    fout.close()

    # calculate checksum
    checksum = sha256(pfam_tophit_file)
    fout = open(pfam_tophit_file + '.sha256', 'w')
    fout.write(checksum)
    fout.close()
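# _pfam_top_hit() writes one line per gene of the form
# 'gene_id<TAB>fam1,evalue,bitscore;fam2,evalue,bitscore'. A minimal sketch of
# reading such a file back into a dict; the reader name is illustrative:
def read_tophit_file(tophit_file):
    """Parse a *_tophit.tsv file into {gene_id: [(family, evalue, bitscore)]}."""
    hits = {}
    with open(tophit_file) as f:
        f.readline()  # skip header
        for line in f:
            gene_id, hit_str = line.rstrip('\n').split('\t')
            gene_hits = []
            for hit in hit_str.split(';'):
                family, evalue, bitscore = hit.split(',')
                gene_hits.append((family, float(evalue), float(bitscore)))
            hits[gene_id] = gene_hits
    return hits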
def run(self, genome_dir, domain, genome_list, threads):
    genomes_to_process = None
    if genome_list:
        genomes_to_process = set()
        for line in open(genome_list):
            line_split = line.strip().split('\t')
            genome_id = line_split[0]
            if genome_id.startswith('GB_') or genome_id.startswith('RS_'):
                genome_id = genome_id[3:]
            genomes_to_process.add(genome_id)

    # get path to all unprocessed genome gene files
    print 'Reading genomes.'
    genome_files = []
    for genome_id in os.listdir(genome_dir):
        cur_genome_dir = os.path.join(genome_dir, genome_id)
        if os.path.isdir(cur_genome_dir):
            for assembly_id in os.listdir(cur_genome_dir):
                assembly_dir = os.path.join(cur_genome_dir, assembly_id)
                genome_id = assembly_id[0:assembly_id.find('_', 4)]
                if genomes_to_process and genome_id not in genomes_to_process:
                    continue

                prokka_dir = os.path.join(assembly_dir, 'prokka')
                if os.path.exists(prokka_dir):
                    continue

                prokka_file = os.path.join(prokka_dir, assembly_id + '.faa')
                if os.path.exists(prokka_file):
                    # verify checksum
                    checksum_file = prokka_file + '.sha256'
                    if os.path.exists(checksum_file):
                        checksum = sha256(prokka_file)
                        cur_checksum = open(checksum_file).readline().strip()
                        if checksum == cur_checksum:
                            continue

                genome_file = os.path.join(assembly_dir, assembly_id + self.genome_file_ext)
                if os.path.exists(genome_file):
                    genome_files.append(genome_file)

    print ' Number of unprocessed genomes: %d\n' % len(genome_files)

    # populate worker queue with data to process
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for f in genome_files:
        workerQueue.put(f)

    for _ in range(threads):
        workerQueue.put(None)

    try:
        workerProc = [mp.Process(target=self.__workerThread, args=(domain, workerQueue, writerQueue))
                      for _ in range(threads)]
        writeProc = mp.Process(target=self.__writerThread,
                               args=(len(genome_files), writerQueue))

        writeProc.start()
        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()
    except:
        for p in workerProc:
            p.terminate()
        writeProc.terminate()
def run(self, input_dir, threads):
    # get path to all unprocessed Pfam HMM result files
    print 'Reading Pfam HMM files.'
    pfam_files = []
    for first_three in os.listdir(input_dir):
        onethird_species_dir = os.path.join(input_dir, first_three)
        print onethird_species_dir
        if os.path.isfile(onethird_species_dir):
            continue
        for second_three in os.listdir(onethird_species_dir):
            twothird_species_dir = os.path.join(onethird_species_dir, second_three)
            if os.path.isfile(twothird_species_dir):
                continue
            for third_three in os.listdir(twothird_species_dir):
                threethird_species_dir = os.path.join(twothird_species_dir, third_three)
                if os.path.isfile(threethird_species_dir):
                    continue
                for complete_name in os.listdir(threethird_species_dir):
                    assembly_dir = os.path.join(threethird_species_dir, complete_name)
                    if os.path.isfile(assembly_dir):
                        continue

                    groups = complete_name.split('_')
                    processed_assembly_id = '_'.join(groups[:2])

                    pfam_tophit_file = os.path.join(
                        assembly_dir, 'prodigal',
                        processed_assembly_id + '_pfam_tophit.tsv')
                    if os.path.exists(pfam_tophit_file):
                        # verify checksum
                        checksum_file = pfam_tophit_file + '.sha256'
                        if os.path.exists(checksum_file):
                            checksum = sha256(pfam_tophit_file)
                            cur_checksum = open(checksum_file).readline().strip()
                            if checksum == cur_checksum:
                                continue

                    pfam_file = os.path.join(
                        assembly_dir, 'prodigal',
                        processed_assembly_id + self.pfam_ext)
                    if os.path.exists(pfam_file):
                        pfam_files.append(pfam_file)

    print ' Number of unprocessed genomes: %d' % len(pfam_files)

    # populate worker queue with data to process
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for f in pfam_files:
        workerQueue.put(f)

    for _ in range(threads):
        workerQueue.put(None)

    try:
        workerProc = [mp.Process(target=self.__workerThread, args=(workerQueue, writerQueue))
                      for _ in range(threads)]
        writeProc = mp.Process(target=self.__writerThread,
                               args=(len(pfam_files), writerQueue))

        writeProc.start()
        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()
    except:
        for p in workerProc:
            p.terminate()
        writeProc.terminate()
def _addGenomeToDB(self, fasta_file_path, name, desc, source, id_at_source, gene_path):
    """Add genome to database.

    Parameters
    ----------
    fasta_file_path : str
        Path to genome FASTA file with nucleotide sequences.
    name : str
        Desired name of genome.
    desc : str
        Description of genome.
    source : str
        Source of genome.
    id_at_source : int
        Identifier of the genome at the source database, if any.
    gene_path : str
        Path to called genes in amino acid space.

    Returns
    -------
    str
        Database identifier of genome.
    """
    try:
        fasta_sha256_checksum = sha256(fasta_file_path)

        gene_sha256_checksum = None
        if gene_path is not None:
            gene_sha256_checksum = sha256(gene_path)
        if source is None:
            source = self.defaultGenomeSourceName

        self.cur.execute(
            "SELECT id, external_id_prefix, user_editable FROM genome_sources WHERE name = %s",
            (source,))
        source_id = None

        for (db_id, _external_id_prefix, user_editable) in self.cur:
            if not user_editable:
                if id_at_source is None:
                    raise GenomeDatabaseError(
                        "Cannot auto generate ids at source for the %s genome source." % source)
                if not self.currentUser.isRootUser():
                    raise GenomeDatabaseError(
                        "Only the root user can add genomes to the %s genome source." % source)
            source_id = db_id
            break

        if source_id is None:
            raise GenomeDatabaseError(
                "Could not find the %s genome source." % source)

        if id_at_source is None:
            # We use UPDATE to return a value. This avoids the concurrency issue of
            # multiple threads using the same value, as UPDATE locks the cell
            # during the transaction.
            self.cur.execute("SELECT update_last_auto(%s);", (source_id,))
            id_at_source = str(self.cur.fetchone()[0])

        added = datetime.datetime.now()

        owner_id = None
        if not self.currentUser.isRootUser():
            owner_id = self.currentUser.getUserId()

        self.cur.execute(
            "SELECT id FROM genomes WHERE genome_source_id = %s AND id_at_source = %s",
            (source_id, id_at_source))
        result = self.cur.fetchall()

        columns = "(name, description, owned_by_root, owner_id, fasta_file_location, " + \
                  "fasta_file_sha256, genes_file_location, genes_file_sha256, genome_source_id, id_at_source, date_added)"

        if len(result):
            raise GenomeDatabaseError(
                "Genome source '%s' already contains id '%s'. Use -f to force an overwrite." % (source, id_at_source))

        self.cur.execute(
            "INSERT INTO genomes " + columns + " "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
            "RETURNING id",
            (name, desc, self.currentUser.isRootUser(), owner_id,
             fasta_file_path, fasta_sha256_checksum,
             gene_path, gene_sha256_checksum,
             source_id, id_at_source, added))
        (db_genome_id,) = self.cur.fetchone()

        return db_genome_id

    except GenomeDatabaseError as e:
        raise e
def run(self, input_dir, tmp_dir, threads):
    # get path to all unprocessed genome files
    print 'Reading genomes.'
    genome_files = []
    for genome_dir in os.listdir(input_dir):
        cur_genome_dir = os.path.join(input_dir, genome_dir)
        if not os.path.isdir(cur_genome_dir):
            continue

        for assembly_id in os.listdir(cur_genome_dir):
            assembly_dir = os.path.join(cur_genome_dir, assembly_id)
            genome_id = assembly_id[0:assembly_id.find('_', 4)]

            # check if prodigal has already been called
            aa_gene_file = os.path.join(assembly_dir, 'prodigal', genome_id + '_protein.faa')
            if os.path.exists(aa_gene_file):
                # verify checksum
                checksum_file = aa_gene_file + '.sha256'
                if os.path.exists(checksum_file):
                    checksum = sha256(aa_gene_file)
                    cur_checksum = open(checksum_file).readline().strip()
                    if checksum == cur_checksum:
                        continue

            genome_file = os.path.join(assembly_dir, assembly_id + '_genomic.fna')
            if os.path.exists(genome_file):
                if os.stat(genome_file).st_size == 0:
                    print '[Warning] Genome file appears to be empty: %s' % genome_file
                else:
                    genome_files.append(genome_file)

    print ' Number of unprocessed genomes: %d' % len(genome_files)

    # run prodigal on each genome
    print 'Running prodigal.'
    prodigal = Prodigal(cpus=threads)
    summary_stats = prodigal.run(genome_files, output_dir=tmp_dir)

    # move results into individual genome directories
    print 'Moving files and calculating checksums.'
    for genome_file in genome_files:
        genome_path, genome_id = ntpath.split(genome_file)
        genome_id = remove_extension(genome_id)

        aa_gene_file = os.path.join(tmp_dir, genome_id + '_genes.faa')
        nt_gene_file = os.path.join(tmp_dir, genome_id + '_genes.fna')
        gff_file = os.path.join(tmp_dir, genome_id + '.gff')

        genome_root = genome_id[0:genome_id.find('_', 4)]
        prodigal_path = os.path.join(genome_path, 'prodigal')
        if not os.path.exists(prodigal_path):
            os.makedirs(prodigal_path)
        new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa')
        new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna')
        new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff')

        os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file))
        os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file))
        os.system('mv %s %s' % (gff_file, new_gff_file))

        # save translation table information
        translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
        fout = open(translation_table_file, 'w')
        fout.write('%s\t%d\n' % ('best_translation_table',
                                 summary_stats[genome_id].best_translation_table))
        fout.write('%s\t%.2f\n' % ('coding_density_4',
                                   summary_stats[genome_id].coding_density_4 * 100))
        fout.write('%s\t%.2f\n' % ('coding_density_11',
                                   summary_stats[genome_id].coding_density_11 * 100))
        fout.close()

        checksum = sha256(new_aa_gene_file)
        fout = open(new_aa_gene_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()
def __tigrfam_worker(self, queue_in, queue_out):
    """Process each data item in parallel."""
    tigrfam_version = 'tigrfam_15.0'
    tigrfam_extension = f'_{tigrfam_version}.tsv'
    tigrfam_tophit_extension = f'_{tigrfam_version}_tophit.tsv'
    symlink_tigrfam_extension = '_tigrfam.tsv'
    symlink_tigrfam_tophit_extension = '_tigrfam_tophit.tsv'

    while True:
        gene_file = queue_in.get(block=True, timeout=None)
        if gene_file is None:
            break

        assembly_dir, filename = os.path.split(gene_file)
        make_sure_path_exists(os.path.join(assembly_dir, tigrfam_version))
        output_hit_file = os.path.join(
            assembly_dir, tigrfam_version,
            filename.replace(self.protein_file_ext, tigrfam_extension))
        hmmsearch_out = os.path.join(
            assembly_dir, tigrfam_version,
            filename.replace(self.protein_file_ext, f'_{tigrfam_version}.out'))
        cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu 1 %s %s' % (
            hmmsearch_out, output_hit_file, self.tigrfam_hmms, gene_file)
        os.system(cmd)

        # calculate checksum
        checksum = sha256(output_hit_file)
        fout = open(output_hit_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()

        # determine top hits
        tigrfam_tophit_file = os.path.join(
            assembly_dir, tigrfam_version,
            filename.replace(self.protein_file_ext, tigrfam_tophit_extension))
        self._tigr_top_hit(output_hit_file, tigrfam_tophit_file)

        # create symlinks in the prodigal folder pointing at the versioned files
        new_hit_link = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_ext, symlink_tigrfam_extension))
        new_tophit_link = os.path.join(
            assembly_dir,
            filename.replace(self.protein_file_ext, symlink_tigrfam_tophit_extension))
        os.symlink(output_hit_file, new_hit_link)
        os.symlink(tigrfam_tophit_file, new_tophit_link)

        # allow results to be processed or written to file
        queue_out.put(gene_file)
def run_hmmsearch(self, gtdb_genome_path_file, report, db):
    name = ''
    worker = None
    if db == 'pfam':
        marker_folder = 'pfam_33.1'
        full_extension = '_pfam_33.1.tsv'
        symlink_extension = '_pfam.tsv'
        name = 'Pfam'
        worker = self.__pfam_worker
    elif db == 'tigrfam':
        marker_folder = 'tigrfam_15.0'
        full_extension = '_tigrfam_15.0.tsv'
        symlink_extension = '_tigrfam.tsv'
        name = 'Tigrfam'
        worker = self.__tigrfam_worker

    genomes_to_consider = set()
    for line in open(report):
        line_split = line.strip().split('\t')
        genome_id = line_split[1]

        attributes = line_split[2].split(';')
        for attribute in attributes:
            if attribute == 'new' or attribute == 'modified':
                genomes_to_consider.add(genome_id)

    self.logger.info(
        f'Identified {len(genomes_to_consider)} genomes as new or modified.')

    # get path to all unprocessed genome gene files
    self.logger.info('Checking genomes.')
    genome_files = []
    countr = 0
    for line in open(gtdb_genome_path_file):
        countr += 1
        statusStr = '{} lines read.'.format(countr)
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        line_split = line.strip().split('\t')
        gid = line_split[0]
        gpath = line_split[1]
        assembly_id = os.path.basename(os.path.normpath(gpath))

        prodigal_dir = os.path.join(gpath, 'prodigal')
        marker_file = os.path.join(prodigal_dir, marker_folder, gid + full_extension)
        if os.path.exists(marker_file):
            # verify checksum
            checksum_file = marker_file + '.sha256'
            if os.path.exists(checksum_file):
                checksum = sha256(marker_file)
                cur_checksum = open(checksum_file).readline().strip()
                if checksum == cur_checksum:
                    if gid in genomes_to_consider:
                        self.logger.warning(
                            f'Genome {gid} is marked as new or modified, but already has {name} annotations.')
                        self.logger.warning('Genome is being skipped!')
                    continue

            self.logger.warning(
                f'Genome {gid} has {name} annotations, but an invalid checksum and was not marked for reannotation.')
            self.logger.warning('Genome will be reannotated.')
        elif gid not in genomes_to_consider:
            self.logger.warning(
                f'Genome {gid} has no {name} annotations, but is also not marked for processing?')
            self.logger.warning('Genome will be reannotated!')

        gene_file = os.path.join(prodigal_dir, gid + self.protein_file_ext)
        if os.path.exists(gene_file):
            if os.stat(gene_file).st_size == 0:
                self.logger.warning(f'Protein file appears to be empty: {gene_file}')
            else:
                genome_files.append(gene_file)

    self.logger.info(f'Number of unprocessed genomes: {len(genome_files)}')

    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for f in genome_files:
        workerQueue.put(f)

    for _ in range(self.cpus):
        workerQueue.put(None)

    try:
        workerProc = [mp.Process(target=worker, args=(workerQueue, writerQueue))
                      for _ in range(self.cpus)]
        writeProc = mp.Process(target=self.__progress,
                               args=(len(genome_files), writerQueue))

        writeProc.start()
        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()
    except:
        for p in workerProc:
            p.terminate()
        writeProc.terminate()
def run(self, gtdb_genome_path_file):
    genomes_to_consider = None

    # get path to all genome files
    self.logger.info('Reading genomes.')
    genome_files = []
    countr = 0
    for line in open(gtdb_genome_path_file):
        countr += 1
        statusStr = '{} lines read.'.format(countr)
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        line_split = line.strip().split('\t')
        gid = line_split[0]
        gpath = line_split[1]
        assembly_id = os.path.basename(os.path.normpath(gpath))

        trna_dir = os.path.join(gpath, 'trna')
        trna_file = os.path.join(trna_dir, gid + '_trna.tsv')
        if os.path.exists(trna_file):
            # verify checksum
            checksum_file = trna_file + '.sha256'
            if os.path.exists(checksum_file):
                checksum = sha256(trna_file)
                cur_checksum = open(checksum_file).readline().strip()
                if checksum == cur_checksum:
                    if genomes_to_consider and gid in genomes_to_consider:
                        self.logger.warning(
                            f'Genome {gid} is marked as new or modified, but already has tRNAs called.')
                        self.logger.warning('Genome is being skipped!')
                    continue

            self.logger.warning(
                f'Genome {gid} has tRNAs called, but an invalid checksum and was not marked for reannotation.')
            self.logger.warning('Genome will be reannotated.')
        elif genomes_to_consider and (gid not in genomes_to_consider):
            self.logger.warning(
                f'Genome {gid} has no tRNAs called, but is also not marked for processing?')
            self.logger.warning('Genome will be reannotated!')

        genome_file = os.path.join(gpath, assembly_id + self.genome_file_ext)
        if os.path.exists(genome_file):
            if os.stat(genome_file).st_size == 0:
                self.logger.warning(f'Genome file appears to be empty: {gid}')
            else:
                genome_files.append(genome_file)

    self.logger.info(f'Number of unprocessed genomes: {len(genome_files)}')

    # populate worker queue with data to process
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for f in genome_files:
        workerQueue.put(f)

    for _ in range(self.cpus):
        workerQueue.put(None)

    try:
        workerProc = [mp.Process(target=self.__workerThread, args=(workerQueue, writerQueue))
                      for _ in range(self.cpus)]
        writeProc = mp.Process(target=self.__writerThread,
                               args=(len(genome_files), writerQueue))

        writeProc.start()
        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()
    except:
        for p in workerProc:
            p.terminate()
        writeProc.terminate()
def run(self, genome_dir, genome_report, threads):
    # get list of genomes to consider
    genomes_to_consider = set()
    for line in open(genome_report):
        line_split = line.strip().split('\t')
        genome_id = line_split[1]

        attributes = line_split[2].split(';')
        for attribute in attributes:
            if attribute == 'new' or attribute == 'modified':
                genomes_to_consider.add(genome_id)

    print 'Identified %d genomes as new or modified.' % len(genomes_to_consider)

    # get path to all unprocessed genome gene files
    print 'Reading genomes.'
    gene_files = []
    for species_dir in os.listdir(genome_dir):
        cur_genome_dir = os.path.join(genome_dir, species_dir)
        if os.path.isdir(cur_genome_dir):
            for assembly_id in os.listdir(cur_genome_dir):
                prodigal_dir = os.path.join(cur_genome_dir, assembly_id, 'prodigal')
                genome_id = assembly_id[0:assembly_id.find('_', 4)]

                ko_file = os.path.join(prodigal_dir, genome_id + '_ko.tsv')
                if os.path.exists(ko_file):
                    # verify checksum
                    checksum_file = ko_file + '.sha256'
                    if os.path.exists(checksum_file):
                        checksum = sha256(ko_file)
                        cur_checksum = open(checksum_file).readline().strip()
                        if checksum == cur_checksum:
                            if genome_id in genomes_to_consider:
                                print '[WARNING] Genome %s is marked as new or modified, but already has KO annotations.' % genome_id
                                print '[WARNING] Genome is being skipped!'
                            continue

                    print '[WARNING] Genome %s has KO annotations, but an invalid checksum and was not marked for reannotation.' % genome_id
                    print '[WARNING] Genome will be reannotated.'
                elif genome_id not in genomes_to_consider:
                    print '[WARNING] Genome %s has no KO annotations, but is also not marked for processing?' % genome_id
                    print '[WARNING] Genome will be reannotated!'

                gene_file = os.path.join(prodigal_dir, genome_id + self.protein_file_ext)
                if os.path.exists(gene_file):
                    if os.stat(gene_file).st_size == 0:
                        print '[Warning] Protein file appears to be empty: %s' % gene_file
                    else:
                        gene_files.append(gene_file)

    print ' Number of unprocessed genomes: %d' % len(gene_files)

    # populate worker queue with data to process
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for f in gene_files:
        workerQueue.put(f)

    for _ in range(threads):
        workerQueue.put(None)

    try:
        workerProc = [mp.Process(target=self.__workerThread, args=(workerQueue, writerQueue))
                      for _ in range(threads)]
        writeProc = mp.Process(target=self.__writerThread,
                               args=(len(gene_files), writerQueue))

        writeProc.start()
        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()
    except:
        for p in workerProc:
            p.terminate()
        writeProc.terminate()
def run(self, genome_dir, genome_report, threads):
    # get list of genomes to consider
    genomes_to_consider = set()
    for line in open(genome_report):
        line_split = line.strip().split('\t')
        genome_id = line_split[1]

        attributes = line_split[2].split(';')
        for attribute in attributes:
            if attribute == 'new' or attribute == 'modified':
                genomes_to_consider.add(genome_id)

    print 'Identified %d genomes as new or modified.' % len(genomes_to_consider)

    # get path to all unprocessed genome gene files
    print 'Reading genomes.'
    gene_files = []
    for species_dir in os.listdir(genome_dir):
        cur_genome_dir = os.path.join(genome_dir, species_dir)
        if os.path.isdir(cur_genome_dir):
            for assembly_id in os.listdir(cur_genome_dir):
                prodigal_dir = os.path.join(cur_genome_dir, assembly_id, 'prodigal')
                genome_id = assembly_id[0:assembly_id.find('_', 4)]

                pfam_file = os.path.join(prodigal_dir, genome_id + '_pfam.tsv')
                if os.path.exists(pfam_file):
                    # verify checksum
                    checksum_file = pfam_file + '.sha256'
                    if os.path.exists(checksum_file):
                        checksum = sha256(pfam_file)
                        cur_checksum = open(checksum_file).readline().strip()
                        if checksum == cur_checksum:
                            if genome_id in genomes_to_consider:
                                print '[WARNING] Genome %s is marked as new or modified, but already has Pfam annotations.' % genome_id
                                print '[WARNING] Genome is being skipped!'
                            continue

                    print '[WARNING] Genome %s has Pfam annotations, but an invalid checksum and was not marked for reannotation.' % genome_id
                    print '[WARNING] Genome will be reannotated.'
                elif genome_id not in genomes_to_consider:
                    print '[WARNING] Genome %s has no Pfam annotations, but is also not marked for processing?' % genome_id
                    print '[WARNING] Genome will be reannotated!'

                gene_file = os.path.join(prodigal_dir, genome_id + self.protein_file_ext)
                if os.path.exists(gene_file):
                    if os.stat(gene_file).st_size == 0:
                        print '[Warning] Protein file appears to be empty: %s' % gene_file
                    else:
                        gene_files.append(gene_file)

    print ' Number of unprocessed genomes: %d' % len(gene_files)

    # populate worker queue with data to process
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for f in gene_files:
        workerQueue.put(f)

    for _ in range(threads):
        workerQueue.put(None)

    try:
        workerProc = [mp.Process(target=self.__workerThread, args=(workerQueue, writerQueue))
                      for _ in range(threads)]
        writeProc = mp.Process(target=self.__writerThread,
                               args=(len(gene_files), writerQueue))

        writeProc.start()
        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()
    except:
        for p in workerProc:
            p.terminate()
        writeProc.terminate()
def _run_prodigal(self, genome_paths):
    """Run Prodigal on genomes."""

    # get genome path and translation table for each file
    self.logger.info('Determining genomic file and translation table for each of the %d genomes.' % len(genome_paths))
    genome_files = []
    translation_table = {}
    for gid, gpath in genome_paths.items():
        assembly_id = os.path.basename(os.path.normpath(gpath))
        canonical_gid = assembly_id[0:assembly_id.find('_', 4)]

        genome_file = os.path.join(gpath, assembly_id + '_genomic.fna')
        if os.path.exists(genome_file):
            if os.stat(genome_file).st_size == 0:
                self.logger.warning('Genomic file appears to be empty: %s' % genome_file)
                continue

            genome_files.append(genome_file)
        else:
            self.logger.warning('Genomic file appears to be missing: %s' % genome_file)

        gff_file = os.path.join(gpath, assembly_id + '_genomic.gff')
        if os.path.exists(gff_file):
            if os.stat(gff_file).st_size == 0:
                self.logger.warning('GFF appears to be empty: %s' % gff_file)
                continue

            tt = self._parse_translation_table(gff_file)
            if tt:
                translation_table[canonical_gid] = tt
            else:
                translation_table[canonical_gid] = None
                self.logger.warning('Unable to determine translation table for: %s' % gff_file)
                sys.exit(-1)
        else:
            self.logger.warning('GFF appears to be missing: %s' % gff_file)
            sys.exit(-1)

    # run Prodigal on each genome
    self.logger.info('Running Prodigal on %d genomes.' % len(genome_paths))
    prodigal = Prodigal(cpus=self.cpus)
    summary_stats = prodigal.run(genome_files,
                                 translation_table=translation_table,
                                 output_dir=self.tmp_dir)

    # move results into individual genome directories
    self.logger.info('Moving files and calculating checksums.')
    for genome_file in genome_files:
        genome_path, genome_id = ntpath.split(genome_file)
        genome_id = remove_extension(genome_id)
        canonical_gid = genome_id[0:genome_id.find('_', 4)]

        aa_gene_file = os.path.join(self.tmp_dir, genome_id + '_genes.faa')
        nt_gene_file = os.path.join(self.tmp_dir, genome_id + '_genes.fna')
        gff_file = os.path.join(self.tmp_dir, genome_id + '.gff')

        genome_root = genome_id[0:genome_id.find('_', 4)]
        prodigal_path = os.path.join(genome_path, 'prodigal')
        if not os.path.exists(prodigal_path):
            os.makedirs(prodigal_path)
        new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa')
        new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna')
        new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff')

        os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file))
        os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file))
        os.system('mv %s %s' % (gff_file, new_gff_file))

        # save translation table information
        translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
        fout = open(translation_table_file, 'w')
        if translation_table[canonical_gid]:
            fout.write('%s\t%d\t%s\n' % ('best_translation_table',
                                         summary_stats[genome_id].best_translation_table,
                                         'used table specified by NCBI'))
        else:
            fout.write('%s\t%d\n' % ('best_translation_table',
                                     summary_stats[genome_id].best_translation_table))
        fout.write('%s\t%.2f\n' % ('coding_density_4',
                                   summary_stats[genome_id].coding_density_4 * 100))
        fout.write('%s\t%.2f\n' % ('coding_density_11',
                                   summary_stats[genome_id].coding_density_11 * 100))
        fout.close()

        checksum = sha256(new_aa_gene_file)
        fout = open(new_aa_gene_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()
def moveGenomes(self, db_genome_ids):
    """Move genome files into database directory structure.

    This function assumes addGenomes() has been called. It is
    not directly called by addGenomes() as all database queries
    are performed before moving genomes.

    Parameters
    ----------
    db_genome_ids : list
        Unique database identifiers for genomes.
    """

    assert self.tmp_output_dir

    # get database genome identifiers
    self.cur.execute("SELECT genomes.id, user_editable, external_id_prefix || '_' || id_at_source as external_id "
                     "FROM genomes, genome_sources "
                     "WHERE genome_source_id = genome_sources.id "
                     "AND genomes.id in %s", (tuple(db_genome_ids),))

    external_id_dict = {}
    for (genome_id, user_editable, external_id) in self.cur:
        if user_editable:
            external_id_dict[genome_id] = external_id

    if external_id_dict:
        if self.currentUser.isRootUser():
            username = self.currentUser.getElevatedFromUsername()
        else:
            username = self.currentUser.getUsername()

        if username is None:
            raise GenomeDatabaseError("Unable to determine user to add genomes under.")

        gtdb_target_dir = os.path.join(self.genomeCopyDir, username)
        for db_genome_id, external_id in external_id_dict.items():
            tmp_genome_dir = os.path.join(self.tmp_output_dir, external_id)
            genome_target_dir = os.path.join(gtdb_target_dir, external_id)
            if os.path.exists(genome_target_dir):
                raise GenomeDatabaseError("Genome directory already exists: %s" % genome_target_dir)

            shutil.move(tmp_genome_dir, genome_target_dir)

            self.cur.execute("UPDATE genomes SET fasta_file_location = %s, genes_file_location = %s, genes_file_sha256 = %s WHERE id = %s",
                             (os.path.join(username, external_id, external_id + self.genomeFileSuffix),
                              os.path.join(username, external_id, self.userAnnotationDir, external_id + self.proteinFileSuffix),
                              sha256(os.path.join(genome_target_dir, self.userAnnotationDir, external_id + self.proteinFileSuffix)),
                              db_genome_id))

    shutil.rmtree(self.tmp_output_dir)
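# moveGenomes() is intended to run only after all database work for a batch
# has succeeded, so a failed query never leaves half-moved files behind.
# A minimal usage sketch; the 'manager' and 'batch_file' names and the
# surrounding transaction handling are assumptions for illustration:
#
#     db_genome_ids = manager.addGenomes(batch_file)   # database queries only
#     manager.conn.commit()                            # all queries succeeded
#     manager.moveGenomes(db_genome_ids)               # now touch the filesystem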
def run(self, genome_dir, domain, threads):
    genomes_to_consider = None

    # get path to all genome files
    print 'Reading genomes.'
    genome_files = []
    for species_dir in os.listdir(genome_dir):
        cur_genome_dir = os.path.join(genome_dir, species_dir)
        if os.path.isdir(cur_genome_dir):
            for assembly_id in os.listdir(cur_genome_dir):
                assembly_dir = os.path.join(cur_genome_dir, assembly_id)

                trna_dir = os.path.join(assembly_dir, 'trna')
                genome_id = assembly_id[0:assembly_id.find('_', 4)]

                trna_file = os.path.join(trna_dir, genome_id + '_trna.tsv')
                if os.path.exists(trna_file):
                    # verify checksum
                    checksum_file = trna_file + '.sha256'
                    if os.path.exists(checksum_file):
                        checksum = sha256(trna_file)
                        cur_checksum = open(checksum_file).readline().strip()
                        if checksum == cur_checksum:
                            if genomes_to_consider and genome_id in genomes_to_consider:
                                print '[WARNING] Genome %s is marked as new or modified, but already has tRNAs called.' % genome_id
                                print '[WARNING] Genome is being skipped!'
                            continue

                    print '[WARNING] Genome %s has tRNAs called, but an invalid checksum and was not marked for reannotation.' % genome_id
                    print '[WARNING] Genome will be reannotated.'
                elif genomes_to_consider and (genome_id not in genomes_to_consider):
                    print '[WARNING] Genome %s has no tRNAs called, but is also not marked for processing?' % genome_id
                    print '[WARNING] Genome will be reannotated!'

                genome_file = os.path.join(assembly_dir, assembly_id + self.genome_file_ext)
                if os.path.exists(genome_file):
                    if os.stat(genome_file).st_size == 0:
                        print '[WARNING] Genome file appears to be empty: %s' % genome_file
                    else:
                        genome_files.append(genome_file)

    print '  Number of unprocessed genomes: %d' % len(genome_files)

    # populate worker queue with data to process
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for f in genome_files:
        workerQueue.put(f)

    for _ in range(threads):
        workerQueue.put(None)

    try:
        workerProc = [mp.Process(target=self.__workerThread, args=(workerQueue, writerQueue, domain))
                      for _ in range(threads)]
        writeProc = mp.Process(target=self.__writerThread, args=(len(genome_files), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()
    except:
        for p in workerProc:
            p.terminate()
        writeProc.terminate()
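# The __workerThread this run() hands to mp.Process is not shown. A minimal
# sketch of what such a worker might look like, assuming tRNAs are called with
# tRNAscan-SE and that the domain flag selects the search model (-A for
# archaeal genomes, -B for bacterial); the exact command line and output
# layout are illustrative, not this codebase's actual implementation:


def __workerThread(self, queueIn, queueOut, domain):
    """Process each data item in parallel."""
    while True:
        genome_file = queueIn.get(block=True, timeout=None)
        if genome_file is None:
            break

        assembly_dir, filename = os.path.split(genome_file)
        genome_id = filename[0:filename.find('_', 4)]

        trna_dir = os.path.join(assembly_dir, 'trna')
        if not os.path.exists(trna_dir):
            os.makedirs(trna_dir)

        output_file = os.path.join(trna_dir, genome_id + '_trna.tsv')
        model_flag = '-A' if domain == 'archaea' else '-B'
        os.system('tRNAscan-SE %s -o %s %s' % (model_flag, output_file, genome_file))

        # record checksum so completed genomes can be skipped on rerun
        fout = open(output_file + '.sha256', 'w')
        fout.write(sha256(output_file))
        fout.close()

        queueOut.put(genome_file)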
def run(self, sra_path):
    query = "SELECT last_auto_id FROM genome_sources WHERE id = 1"
    self.temp_cur.execute(query)
    last_id = int(self.temp_cur.fetchone()[0])
    checkm_dict_original = self.parsecheckm(sra_path)
    print os.getlogin()

    sra_dirs = os.listdir(sra_path)
    for sra_dir in sra_dirs:
        temp_path = tempfile.mkdtemp()
        sra_dir = os.path.join(sra_path, sra_dir)
        print sra_dir
        if os.path.isdir(sra_dir):
            bins_dir = os.path.join(sra_dir, 'bins')
            genomes_bin = os.listdir(bins_dir)
            for genome in genomes_bin:
                if genome.endswith('.fna'):
                    genome_prefix = genome[:-4]
                    last_id += 1
                    user_id = 'U_' + str(last_id)

                    temp_user_dir = os.path.join(temp_path, user_id)
                    os.mkdir(temp_user_dir)
                    print temp_user_dir
                    print genome_prefix

                    shutil.copyfile(os.path.join(bins_dir, genome),
                                    os.path.join(temp_user_dir, user_id + '_genomic.fna'))

                    metadata_dir = os.path.join(sra_dir, 'metadata', genome_prefix)
                    self.copytree(metadata_dir, temp_user_dir)

                    # copy Prodigal, Pfam, and TIGRfam results,
                    # renaming files to the new user id
                    prodigal_dir = os.path.join(temp_user_dir, 'prodigal')
                    os.mkdir(prodigal_dir)
                    for annotation_dir in ('prodigal', 'pfam', 'tigrfam'):
                        src_dir = os.path.join(sra_dir, annotation_dir)
                        for old_name in glob.glob(os.path.join(src_dir, genome_prefix + '_*')):
                            new_name = old_name.replace(src_dir, prodigal_dir)
                            new_name = new_name.replace(genome_prefix, user_id)
                            shutil.copy(old_name, new_name)

                    # assemble the record for the genomes table; the stored
                    # relative paths must reference the renamed U_<id> files
                    # copied to the user directory below
                    list_genome_details = [genome_prefix]
                    list_genome_details.append(genome_prefix + ' (21/07/2016)')  # description
                    list_genome_details.append(False)  # owned_by_root
                    list_genome_details.append(30)     # owner_id

                    fasta_file_path = os.path.join(os.getlogin(), user_id, user_id + '_genomic.fna')
                    list_genome_details.append(fasta_file_path)
                    list_genome_details.append(sha256(os.path.join(temp_user_dir, user_id + '_genomic.fna')))
                    list_genome_details.append(1)             # genome_source_id
                    list_genome_details.append(str(last_id))  # id_at_source
                    list_genome_details.append('21-07-2016')  # date_added
                    list_genome_details.append(False)         # has_changed
                    list_genome_details.append('21-07-2016')  # last_update

                    gene_file_path = os.path.join(os.getlogin(), user_id, 'prodigal', user_id + '_protein.faa')
                    list_genome_details.append(gene_file_path)
                    list_genome_details.append(sha256(os.path.join(temp_user_dir, 'prodigal', user_id + '_protein.faa')))

                    self.temp_cur.execute(
                        "INSERT INTO genomes "
                        "(name, description, owned_by_root, owner_id, fasta_file_location, fasta_file_sha256, genome_source_id, id_at_source, date_added, has_changed, last_update, genes_file_location, genes_file_sha256) "
                        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) RETURNING id",
                        list_genome_details)
                    (new_gid,) = self.temp_cur.fetchone()

                    for table in ('metadata_nucleotide', 'metadata_genes', 'metadata_taxonomy', 'metadata_ssu'):
                        self.temp_cur.execute("INSERT INTO {0} (id) VALUES (%s)".format(table), (new_gid,))

                    # insertion of metadata parsed from the per-genome TSV files
                    with open(os.path.join(temp_user_dir, 'metadata.genome_nt.tsv')) as metntf:
                        for line in metntf:
                            line_tab = line.strip().split()
                            self.temp_cur.execute(
                                "UPDATE metadata_nucleotide SET {0} = %s WHERE id = %s".format(line_tab[0]),
                                (line_tab[1], new_gid))

                    with open(os.path.join(temp_user_dir, 'metadata.genome_gene.tsv')) as metgenef:
                        for line in metgenef:
                            line_tab = line.strip().split()
                            self.temp_cur.execute(
                                "UPDATE metadata_genes SET {0} = %s WHERE id = %s".format(line_tab[0]),
                                (line_tab[1], new_gid))

                    for key, value in checkm_dict_original.get(genome_prefix).iteritems():
                        self.temp_cur.execute(
                            "UPDATE metadata_genes SET {0} = %s WHERE id = %s".format(key),
                            (value, new_gid))

                    shutil.copytree(temp_user_dir,
                                    os.path.join('/srv/db/gtdb/genomes/user', os.getlogin(), user_id))

            self.temp_con.commit()
        shutil.rmtree(temp_path)
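# self.copytree() is used above to merge the metadata directory into an
# existing destination, which shutil.copytree() cannot do (it requires the
# destination not to exist). A minimal sketch of such a merging helper; the
# actual implementation in this codebase may differ:


def copytree(self, src, dst):
    """Recursively copy the contents of src into dst, which may already exist."""
    for item in os.listdir(src):
        s = os.path.join(src, item)
        d = os.path.join(dst, item)
        if os.path.isdir(s):
            if not os.path.exists(d):
                os.makedirs(d)
            self.copytree(s, d)
        else:
            shutil.copy2(s, d)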
def _addGenomeToDB(self, fasta_file_path, name, desc, source, id_at_source, gene_path):
    """Add genome to database.

    Parameters
    ----------
    fasta_file_path : str
        Path to genome FASTA file with nucleotide sequences.
    name : str
        Desired name of genome.
    desc : str
        Description of genome.
    source : str
        Source of genome.
    id_at_source : int
        Identifier of the genome at the specified source, or None
        to have one generated automatically.
    gene_path : str
        Path to called genes in amino acid space.

    Returns
    -------
    str
        Database identifier of genome.
    """

    try:
        fasta_sha256_checksum = sha256(fasta_file_path)

        gene_sha256_checksum = None
        if gene_path is not None:
            gene_sha256_checksum = sha256(gene_path)

        if source is None:
            source = self.defaultGenomeSourceName

        self.cur.execute("SELECT id, external_id_prefix, user_editable "
                         "FROM genome_sources WHERE name = %s", (source,))

        source_id = None
        for (db_id, _external_id_prefix, user_editable) in self.cur:
            if not user_editable:
                if id_at_source is None:
                    raise GenomeDatabaseError("Cannot auto generate ids at source for the %s genome source." % source)
                if not self.currentUser.isRootUser():
                    raise GenomeDatabaseError("Only the root user can add genomes to the %s genome source." % source)
            source_id = db_id
            break

        if source_id is None:
            raise GenomeDatabaseError("Could not find the %s genome source." % source)

        if id_at_source is None:
            # An UPDATE is used so a value can be returned while the cell stays
            # locked for the duration of the transaction; this prevents
            # concurrent threads from being handed the same id.
            self.cur.execute("SELECT update_last_auto(%s);", (source_id,))
            id_at_source = str(self.cur.fetchone()[0])

        added = datetime.datetime.now()

        owner_id = None
        if not self.currentUser.isRootUser():
            owner_id = self.currentUser.getUserId()

        self.cur.execute("SELECT id FROM genomes WHERE genome_source_id = %s AND id_at_source = %s",
                         (source_id, id_at_source))
        result = self.cur.fetchall()

        if len(result):
            raise GenomeDatabaseError("Genome source '%s' already contains id '%s'. Use -f to force an overwrite." % (source, id_at_source))

        columns = ("(name, description, owned_by_root, owner_id, fasta_file_location, "
                   "fasta_file_sha256, genes_file_location, genes_file_sha256, genome_source_id, id_at_source, date_added)")
        self.cur.execute("INSERT INTO genomes " + columns + " "
                         "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
                         "RETURNING id",
                         (name, desc, self.currentUser.isRootUser(), owner_id,
                          fasta_file_path, fasta_sha256_checksum,
                          gene_path, gene_sha256_checksum,
                          source_id, id_at_source, added))
        (db_genome_id,) = self.cur.fetchone()

        return db_genome_id
    except GenomeDatabaseError:
        raise
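# update_last_auto() is a stored procedure whose definition is not part of
# this Python source. Its inline equivalent, sketched for clarity and assuming
# genome_sources carries a last_auto_id counter (as the SRA import above
# suggests), would be:
#
#     self.cur.execute("UPDATE genome_sources SET last_auto_id = last_auto_id + 1 "
#                      "WHERE id = %s RETURNING last_auto_id", (source_id,))
#     id_at_source = str(self.cur.fetchone()[0])
#
# Because the UPDATE holds a row lock until the transaction ends, two
# concurrent inserts cannot be handed the same id.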
def run(self, input_dir, tmp_dir, threads):
    # get path to all unprocessed genome files
    print 'Reading genomes.'
    genome_files = []
    for genome_dir in os.listdir(input_dir):
        cur_genome_dir = os.path.join(input_dir, genome_dir)
        if not os.path.isdir(cur_genome_dir):
            continue

        for assembly_id in os.listdir(cur_genome_dir):
            assembly_dir = os.path.join(cur_genome_dir, assembly_id)
            genome_id = assembly_id[0:assembly_id.find('_', 4)]

            # check if Prodigal has already been called; for safety, genes are
            # currently recalled for all genomes, but skipping genomes with a
            # valid checksum would be far more efficient
            if False:
                aa_gene_file = os.path.join(assembly_dir, 'prodigal', genome_id + '_protein.faa')
                if os.path.exists(aa_gene_file):
                    # verify checksum
                    checksum_file = aa_gene_file + '.sha256'
                    if os.path.exists(checksum_file):
                        checksum = sha256(aa_gene_file)
                        cur_checksum = open(checksum_file).readline().strip()
                        if checksum == cur_checksum:
                            continue

            genome_file = os.path.join(assembly_dir, assembly_id + '_genomic.fna')
            if os.path.exists(genome_file):
                if os.stat(genome_file).st_size == 0:
                    print '[WARNING] Genome file appears to be empty: %s' % genome_file
                else:
                    genome_files.append(genome_file)

    print '  Number of unprocessed genomes: %d' % len(genome_files)

    # run Prodigal on each genome
    print 'Running Prodigal.'
    prodigal = Prodigal(cpus=threads)
    summary_stats = prodigal.run(genome_files, output_dir=tmp_dir)

    # move results into individual genome directories
    print 'Moving files and calculating checksums.'
    for genome_file in genome_files:
        genome_path, genome_id = ntpath.split(genome_file)
        genome_id = remove_extension(genome_id)

        aa_gene_file = os.path.join(tmp_dir, genome_id + '_genes.faa')
        nt_gene_file = os.path.join(tmp_dir, genome_id + '_genes.fna')
        gff_file = os.path.join(tmp_dir, genome_id + '.gff')

        genome_root = genome_id[0:genome_id.find('_', 4)]
        prodigal_path = os.path.join(genome_path, 'prodigal')
        if not os.path.exists(prodigal_path):
            os.makedirs(prodigal_path)

        new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa')
        new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna')
        new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff')

        os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file))
        os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file))
        os.system('mv %s %s' % (gff_file, new_gff_file))

        # save translation table information
        translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
        fout = open(translation_table_file, 'w')
        fout.write('%s\t%d\n' % ('best_translation_table', summary_stats[genome_id].best_translation_table))
        fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats[genome_id].coding_density_4 * 100))
        fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats[genome_id].coding_density_11 * 100))
        fout.close()

        checksum = sha256(new_aa_gene_file)
        fout = open(new_aa_gene_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()
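# If the disabled skip logic in run() above is re-enabled, it reduces to a
# single call to the _output_is_current() sketch given earlier (a hypothetical
# helper, not part of this codebase):
#
#     aa_gene_file = os.path.join(assembly_dir, 'prodigal', genome_id + '_protein.faa')
#     if _output_is_current(aa_gene_file):
#         continue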