Example #1
0
def check_gzip(genome_path):
    """Unpack gzipped files that are still present in the genome folders.

    Final pass to make sure downloaded files are unpacked: every entry of
    ``genome_path`` is searched with ``find_file`` for a ``.gz`` file, and
    each hit is handed to ``unpack``.

    Args:
        genome_path: Directory whose entries are searched one by one.
    """
    logger.debug('Starting function "check_gzip"')
    for entry in os.listdir(genome_path):
        entry_path = os.path.join(genome_path, entry)
        gz_file = find_file(entry_path, ['.gz'], [])
        if not gz_file:
            # Nothing gzipped found for this entry.
            continue
        unpack(gz_file)
Example #2
0
def prodigal_worker(genome, settings, path):
    """Attach Prodigal gene features to one genome's FASTA records.

    Runs Prodigal when needed, parses its ``.sco`` output, assigns the
    resulting features to the matching FASTA records and writes them to
    ``<genome>_prodigal.gbk`` if that file does not exist yet.

    Args:
        genome: Name of the genome; also the name of its folder in ``path``
            and the prefix used for the generated file names.
        settings: Mapping that provides the ``'override_prodigal'`` flag.
        path: Directory containing the genome folder.

    Returns:
        A tuple ``(genome, all_seqs, out_genbank, gene_names,
        scaf_names_count)`` where ``all_seqs`` is the list of records with
        their features set, ``out_genbank`` is the GenBank output path,
        ``gene_names`` is whatever ``parse_prodigal`` reported, and
        ``scaf_names_count`` maps each record id to how often it occurred.

    Raises:
        FileNotFoundError: If ``find_file`` finds no FASTA file in the
            genome folder.
    """
    genome_path = os.path.join(path, genome)
    sco_file = os.path.join(genome_path, genome + '.sco')
    out_genbank = os.path.join(genome_path, genome + '_prodigal.gbk')

    fasta_name = find_file(genome_path, ['fna', 'fasta'], ['genomic'])
    if not fasta_name:
        # find_file can come back empty; fail with a clear message instead of
        # an opaque error from os.path.join further down.
        raise FileNotFoundError(
            'No fasta file found for genome %s in %s' % (genome, genome_path))
    fasta_file = os.path.join(genome_path, fasta_name)

    if settings['override_prodigal'] or not os.path.isfile(sco_file):
        # Always run if override_prodigal is set to do so, else only run if
        # no sco file is found
        run_prodigal_cmd(fasta_file, sco_file)

    records = SeqIO.parse(fasta_file, 'fasta')
    # The running gene number returned in second position is not needed here.
    features, _, gene_names = parse_prodigal(sco_file,
                                             start_nr=1,
                                             prefix=genome + '_')
    all_seqs = []
    scaf_names_count = {}
    for record in records:
        scaffold_name = record.id
        scaf_names_count[scaffold_name] = (
            scaf_names_count.get(scaffold_name, 0) + 1)
        # Scaffolds for which no gene was predicted may be absent from the
        # feature mapping; give them an empty feature list.
        record.features = features.get(scaffold_name, [])
        all_seqs.append(record)

    # NOTE(review): an existing GenBank file is kept even when Prodigal was
    # re-run above, so it may be stale -- confirm this is intended.
    if not os.path.isfile(out_genbank):
        write_genbank_single(all_seqs, out_genbank)
    return (genome, all_seqs, out_genbank, gene_names, scaf_names_count)
Example #3
0
def process_from_fasta(genomes, path, override_prodigal=False):
    """Annotate several FASTA genomes with Prodigal and write GenBank files.

    For each genome: run Prodigal (when needed), parse the FASTA with
    Biopython, build a feature list from Prodigal's output and attach it to
    the sequence records. The annotated records are then handed to
    ``write_genbank``.

    Args:
        genomes: Iterable of genome names; each is also the name of its
            folder inside ``path``.
        path: Directory containing the genome folders.
        override_prodigal: If true, Prodigal is run even when a ``.sco`` file
            already exists.

    Returns:
        A tuple ``(genome_seqs, all_names, scaffold_names, files_parsed)``:
        the records per genome, the accumulated gene names, the result of
        ``get_scaffold_names`` and the result of ``write_genbank``.

    Raises:
        FileNotFoundError: If ``find_file`` finds no FASTA file for a genome.
    """
    genome_seqs = {}
    # NOTE(review): starts as a set but is merged through fuse_dict_add --
    # confirm the helper accepts this.
    all_names = set()
    gene_nr = 1

    for genome in genomes:
        genome_path = os.path.join(path, genome)
        sco_file = os.path.join(genome_path, genome + '.sco')

        fasta_name = find_file(genome_path, ['fna', 'fasta'], ['genomic'])
        if not fasta_name:
            # find_file can come back empty; fail with a clear message
            # instead of an opaque error from os.path.join further down.
            raise FileNotFoundError(
                'No fasta file found for genome %s in %s' %
                (genome, genome_path))
        fasta_file = os.path.join(genome_path, fasta_name)

        if override_prodigal or not os.path.isfile(sco_file):
            # Always run if override_prodigal is set to do so, else only run
            # if no sco file is found
            run_prodigal_cmd(fasta_file, sco_file)

        records = SeqIO.parse(fasta_file, 'fasta')
        # gene_nr is carried over so numbering continues across genomes.
        features, gene_nr, names = parse_prodigal(sco_file, prev_nr=gene_nr)
        all_names = fuse_dict_add(all_names, names)

        all_seqs = []
        for record in records:
            # Scaffolds for which no gene was predicted may be absent from
            # the feature mapping; give them an empty feature list.
            record.features = features.get(record.id, [])
            all_seqs.append(record)
        genome_seqs[genome] = all_seqs

    # Get all scaffold names and gene names so that they can be checked for
    # duplicate names later
    scaffold_names = get_scaffold_names(genome_seqs)
    # Write the files to a genbank file; also adds translations to the features
    files_parsed = write_genbank(genome_seqs, path)
    logger.debug('Prodigal parsing: renamed %i genes', gene_nr - 1)
    return (genome_seqs, all_names, scaffold_names, files_parsed)
Example #4
0
def find_downloaded_files(genome_path, prodigal, ext=False):
    """List the entries of ``genome_path`` that already hold a wanted file.

    Args:
        genome_path: Directory whose entries are checked one by one.
        prodigal: Mode string selecting the default extensions when ``ext``
            is not given: ``'never'`` looks for GenBank files, ``'always'``
            for nucleotide FASTA files, anything else for both.
        ext: Optional explicit extension to look for. If it ends in ``.gz``
            the unpacked variant (without ``.gz``) is accepted as well.

    Returns:
        The names of the entries for which ``find_file`` found a match.
    """
    logger.debug('Starting function "find_downloaded_files"')

    # The extension list does not depend on the folder, so build it once.
    if ext:
        exts = [ext]
        gz_suffix = '.gz'
        # Only add the unpacked variant when there really is a '.gz' suffix
        # to strip; otherwise an empty extension would be passed on, which
        # presumably matches any file in find_file.
        if ext.endswith(gz_suffix) and len(ext) > len(gz_suffix):
            exts.append(ext[:-len(gz_suffix)])
    elif prodigal == 'never':
        exts = ['.gbk', '.gbff', '.gbk.gz', '.gbff.gz']
    elif prodigal == 'always':
        exts = ['.fna', '.fna.gz']
    else:
        exts = ['.gbk', '.gbk.gz', '.gbff', '.gbff.gz', '.fna', '.fna.gz']

    return [
        folder for folder in os.listdir(genome_path)
        if find_file(os.path.join(genome_path, folder), exts, [])
    ]