def check_gzip(genome_path):
    """Make a final pass so that no downloaded file is left gzipped.

    Every entry of ``genome_path`` is searched with ``find_file`` for a
    '.gz' file; when one is reported it is handed to ``unpack``.

    Args:
        genome_path: Directory whose entries (one per genome) are checked.
    """
    logger.debug('Starting function "check_gzip"')
    for entry in os.listdir(genome_path):
        entry_path = os.path.join(genome_path, entry)
        gz_file = find_file(entry_path, ['.gz'], [])
        if not gz_file:
            # Nothing compressed was reported for this entry.
            continue
        # NOTE(review): the find_file result is passed on as-is, whereas
        # prodigal_worker joins it with the folder first - confirm that
        # unpack receives the kind of path it expects.
        unpack(gz_file)
def prodigal_worker(genome, settings, path):
    """Annotate the FASTA of a single genome with Prodigal gene calls.

    Runs Prodigal when required, attaches the parsed features to every
    FASTA record and writes the records to a GenBank file if that file does
    not exist yet.

    Args:
        genome: Genome name; also the name of its sub-folder under ``path``
            and the stem of the '.sco' and '_prodigal.gbk' files.
        settings: Mapping of run settings; only 'override_prodigal' is read.
        path: Directory that holds one sub-folder per genome.

    Returns:
        Tuple ``(genome, all_seqs, out_genbank, gene_names,
        scaf_names_count)``: the genome name, the list of records with
        features attached, the GenBank output path, the gene names returned
        by ``parse_prodigal`` and a dict counting how often each record id
        occurred in the FASTA file.
    """
    genome_path = os.path.join(path, genome)
    sco_file = os.path.join(genome_path, genome + '.sco')
    fasta_file = find_file(genome_path, ['fna', 'fasta'], ['genomic'])
    fasta_file = os.path.join(genome_path, fasta_file)
    out_genbank = os.path.join(genome_path, genome + '_prodigal.gbk')

    if settings['override_prodigal'] or not os.path.isfile(sco_file):
        # Always run if override_prodigal is set to do so, else only run if
        # no sco file is found.
        run_prodigal_cmd(fasta_file, sco_file)

    sr = SeqIO.parse(fasta_file, 'fasta')
    # The running gene number returned by parse_prodigal is not needed here.
    features, _gene_nr, gene_names = parse_prodigal(
        sco_file, start_nr=1, prefix=genome + '_')

    all_seqs = []
    scaf_names_count = {}
    for record in sr:
        scaffold_name = record.id
        # Count every occurrence so duplicated record ids can be detected.
        scaf_names_count[scaffold_name] = (
            scaf_names_count.get(scaffold_name, 0) + 1)
        # NOTE(review): raises KeyError when parse_prodigal returned no
        # entry for this record id, unless ``features`` supplies defaults -
        # confirm against parse_prodigal.
        record.features = features[scaffold_name]
        all_seqs.append(record)

    # NOTE(review): an existing GenBank file is kept even when Prodigal was
    # re-run because of override_prodigal - confirm this is intended.
    if not os.path.isfile(out_genbank):
        write_genbank_single(all_seqs, out_genbank)
    return (genome, all_seqs, out_genbank, gene_names, scaf_names_count)
def process_from_fasta(genomes, path, override_prodigal=False):
    """Run Prodigal on each genome and turn the FASTA files into records.

    For every genome: run Prodigal (if needed), parse the FASTA file with
    Biopython, build a feature list from Prodigal's output and attach it to
    the sequence records. The records of all genomes are then passed on to
    the regular GenBank writing step.

    Args:
        genomes: Iterable of genome names; each is a sub-folder of ``path``.
        path: Directory that holds one sub-folder per genome.
        override_prodigal: When true, Prodigal is run even if a '.sco' file
            already exists for the genome.

    Returns:
        Tuple ``(genome_seqs, all_names, scaffold_names, files_parsed)``:
        a dict mapping genome name to its list of records, the gene names
        accumulated over all genomes, the result of ``get_scaffold_names``
        and the result of ``write_genbank``.
    """
    genome_seqs = {}
    # NOTE(review): starts out as a set but is merged with fuse_dict_add -
    # confirm that helper accepts a set.
    all_names = set()
    # Threaded through parse_prodigal; each call returns the updated value.
    gene_nr = 1

    for genome in genomes:
        genome_dir = os.path.join(path, genome)
        sco_file = os.path.join(genome_dir, genome + '.sco')
        fasta_name = find_file(genome_dir, ['fna', 'fasta'], ['genomic'])
        fasta_file = os.path.join(genome_dir, fasta_name)

        # Always run if override_prodigal is set to do so, else only run if
        # no sco file is found.
        sco_missing = not os.path.isfile(sco_file)
        if override_prodigal or sco_missing:
            run_prodigal_cmd(fasta_file, sco_file)

        records = SeqIO.parse(fasta_file, 'fasta')
        # NOTE(review): prodigal_worker calls parse_prodigal with
        # start_nr/prefix while this call uses prev_nr - confirm that
        # keyword is still accepted.
        features, gene_nr, names = parse_prodigal(sco_file, prev_nr=gene_nr)
        all_names = fuse_dict_add(all_names, names)

        annotated = []
        for rec in records:
            rec.features = features[rec.id]
            annotated.append(rec)
        genome_seqs[genome] = annotated

    # Get all scaffold names and gene names so that they can be checked for
    # duplicate names later.
    scaffold_names = get_scaffold_names(genome_seqs)
    # Write the records to GenBank files; per the original comment this
    # also adds translations to the features.
    files_parsed = write_genbank(genome_seqs, path)
    logger.debug('Prodigal parsing: renamed %i genes' % (gene_nr - 1))
    return (genome_seqs, all_names, scaffold_names, files_parsed)
def find_downloaded_files(genome_path, prodigal, ext=False):
    """Return the entries of ``genome_path`` that already hold a genome file.

    Args:
        genome_path: Directory whose entries are each searched with
            ``find_file``.
        prodigal: Prodigal mode. 'never' looks for GenBank files only,
            'always' for nucleotide FASTA files only, any other value for
            both. Ignored when ``ext`` is given.
        ext: Optional explicit extension (e.g. '.gbff.gz'). When it ends in
            '.gz', the extension without '.gz' is accepted as well.

    Returns:
        List of entry names (not full paths) for which ``find_file``
        reported a match.
    """
    logger.debug('Starting function "find_downloaded_files"')

    # The extension list does not depend on the folder, so build it once.
    if ext:
        exts = [ext]
        # Only add the unpacked variant when there really is a '.gz' suffix;
        # the previous rpartition-based code produced an empty extension
        # for values such as '.gbk'.
        if ext.endswith('.gz'):
            unpacked_ext = ext[:-len('.gz')]
            if unpacked_ext:
                exts.append(unpacked_ext)
    elif prodigal == 'never':
        exts = ['.gbk', '.gbff', '.gbk.gz', '.gbff.gz']
    elif prodigal == 'always':
        exts = ['.fna', '.fna.gz']
    else:
        exts = ['.gbk', '.gbk.gz', '.gbff', '.gbff.gz', '.fna', '.fna.gz']

    already_dl = []
    for folder in os.listdir(genome_path):
        if find_file(os.path.join(genome_path, folder), exts, []):
            already_dl.append(folder)
    return already_dl