def main(db, folder, versions):
    """Fetches and analyzes the species names in Sprot database.

    Arguments:
    "db" is the UniProt database key (e.g. 'sprot') used for file naming.
    "folder" is full path name to UniProt DBs.
    "versions" is a dictionary of version numbers for file naming.

    No return values. Saves a summary text file that can be loaded into EXCEL.

    NOTE(review): reads module-level global "min_sequence_count" and the
    project module "fasta_lib" — both must be defined at file scope.
    """
    print('====================================================================')
    print(' sprot_get_analyze.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('====================================================================')

    # create a log file to mirror screen output
    # "write" holds [None, log_obj]: printing to file=None goes to stdout,
    # so each message is emitted to both the console and the log file
    log_obj = open(os.path.join(folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: sprot_get_analyze.py', log_obj)

    # make sure the files are present or download if not
    fasta_lib.download_uniprot(db, folder, versions)

    # make dictionaries of species names (or accession IDs) to taxon IDs
    (sci_to_taxon, id_to_taxon) = fasta_lib.make_uniprot_to_taxon(folder)

    # get more complete list of names to taxon number from ncbi data
    name_to_taxon = fasta_lib.make_all_names_to_taxon(folder)

    # make species frequency dictionary from the compressed sprot FASTA file
    fname = 'uniprot_sprot_%s.fasta.gz' % (versions['sprot'],)
    (name_freq, name_to_id, prot_count) = fasta_lib.uniprot_species_frequency(
        os.path.join(folder, fname))

    # sort the species names and write to file
    # "minimum" filters out species with too few sequences
    fasta_lib.save_species_info(db, folder, name_freq, name_to_taxon,
                                sci_to_taxon, id_to_taxon, name_to_id,
                                minimum=min_sequence_count)

    # print out some stats and exit (both console and log file)
    new_db = 'uniprot_%s_%s.fasta.gz' % (db, versions[db],)
    for obj in write:
        print('...%s contained %s entries...' % (new_db, prot_count), file=obj)
        print('...there were', len(name_freq), 'species names...', file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: sprot_get_analyze.py', log_obj)
    log_obj.close()
    return
def fasta_counter(fasta_file):
    """Counts entries and header lines in a FASTA protein database.

    Arguments:
    "fasta_file" is the full path of the FASTA file to scan.

    No return value. Totals are printed to the console and mirrored to
    "fasta_utilities.log" in the same folder as the FASTA file.
    """
    # create a log file to mirror screen output
    # printing to file=None goes to stdout, so each message hits both sinks
    _folder = os.path.split(fasta_file)[0]
    log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: count_fasta.py', log_obj)

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot = 0
    head = 0

    # read proteins until EOF; NOTE: checking for errors slows program by factor of 3-4
    while f.readNextProtein(p, check_for_errs=False):
        # count protein sequences, with a progress message every 500k
        prot += 1
        if (prot % 500000) == 0:
            print('......(%s proteins read...)' % ("{0:,d}".format(prot), ))
        # count number of header elements; NCBI nr concatenates multiple
        # headers per entry separated by control-A (chr(1)) characters
        control_A = p.description.count(chr(1))
        head = head + control_A + 1

    # print results and return
    for obj in write:
        print('...there are %s proteins in %s' %
              ("{0:,d}".format(prot), os.path.split(fasta_file)[1]), file=obj)
        # only report header count if some entries had compound headers
        if head > prot:
            print('...there were %s header lines' % ("{0:,d}".format(head), ), file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: count_fasta.py', log_obj)
    log_obj.close()
    return
def main(db, folder):
    """Fetches and analyzes the species names in the ncbi nr fasta database.

    Arguments:
    "db" is the database key (unused here; kept for interface symmetry).
    "folder" is the full path name where "nr.gz" will be.

    No return values. Saves the main lookup dictionary and a summary text
    file that can be loaded into EXCEL or a word processor.
    """
    global min_sequence_count
    print('================================================================')
    print(' nr_get_analyze.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017')
    print('================================================================')

    # create a log file to mirror screen output
    # printing to file=None goes to stdout, so messages hit console and log
    log_obj = open(os.path.join(folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: nr_get_analyze.py', log_obj)

    # make sure the files are present or download if not
    fasta_lib.download_ncbi(folder)

    # make gi_to_taxon object (or reload from disk)
    acc_to_taxon = fasta_lib.AccToTaxon(folder)
    acc_to_taxon.create_or_load(folder)

    # make a dictionary of taxon IDs to species names
    taxon_to_name = fasta_lib.make_taxon_to_sci_name(folder)

    # make the taxon frequency dictionary for the proteins in nr.gz
    # the nr archive is named after its folder (e.g. ".../nr/nr.gz")
    nr_name = os.path.split(folder)[1] + '.gz'
    for obj in write:
        print('...processing %s (this takes a few hours...)' % (nr_name, ), file=obj)
    taxon_freq = {}
    reftax_freq = {}
    prot = 0
    spec_prot = 0
    ref_prot = 0
    undef_gi = 0
    # use a context manager so the gzip handle is always closed
    # (original code leaked the file object)
    with gzip.open(os.path.join(folder, nr_name), mode='rt') as fasta_file:
        while True:
            line = fasta_file.readline()
            if not line:
                break
            else:
                line = line.rstrip()
            if line.startswith('>'):
                prot += 1
                chunk = 1000000
                if (prot % chunk) == 0:
                    print('......(%s proteins read)' % ("{0:,d}".format(prot), ))
                tax_list = []
                reftax_list = []
                # nr entries can carry several headers separated by control-A;
                # need to remove the leading ">" character first
                for header in line[1:].split(chr(1)):
                    acc_ver = header.split()[0]
                    acc = acc_ver.split('.')[0]
                    tax = acc_to_taxon.get(acc, -1)
                    if tax == -1:
                        undef_gi += 1
                    if tax not in tax_list:
                        spec_prot += 1
                        tax_list.append(tax)
                    # according to NCBI, underscore char only in RefSeq accessions
                    if '_' in acc and tax not in reftax_list:
                        ref_prot += 1
                        reftax_list.append(tax)
                for tax in tax_list:
                    fasta_lib.add_or_increment(tax, taxon_freq)
                for reftax in reftax_list:
                    fasta_lib.add_or_increment(reftax, reftax_freq)

    # make the name frequency dictionary from the taxon frequency dictionary
    name_freq = {}
    for (taxon, freq) in taxon_freq.items():
        unknown_name = 'Unknown_taxonID_%s' % (taxon, )
        name_freq[taxon_to_name.get(taxon, unknown_name)] = freq

    # make an inverted name_to_taxon dictionary
    name_to_taxon = {}
    for (number, name) in taxon_to_name.items():
        name_to_taxon[name] = number

    # sort the species names and write to file
    fasta_lib.save_species_info_nr(folder, name_freq, name_to_taxon,
                                   reftax_freq, min_sequence_count)

    # print out some stats and exit
    for obj in write:
        if prot > 0:
            print('...%s contained %s protein entries...' %
                  (os.path.split(folder)[1], "{0:,d}".format(prot)), file=obj)
            print('...there were %s species-expanded entries...' %
                  ("{0:,d}".format(spec_prot), ), file=obj)
            print('...%s were RefSeq entries...' % ("{0:,d}".format(ref_prot), ), file=obj)
            print('...%s entries had undefined taxon ID numbers...' %
                  ("{0:,d}".format(undef_gi), ), file=obj)
            print('...there were', "{0:,d}".format(len(name_freq)),
                  'species names...', file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: nr_get_analyze.py', log_obj)
    log_obj.close()
    return
def main(taxon_dict):
    """Extracts entries by taxon ID from both Sprot and Trembl databases.

    Arguments:
    "taxon_dict" maps taxon ID numbers to short names used to tag the
    extracted FASTA output files.

    No return value. Writes one FASTA file per short name into the
    selected UniProt download folder and mirrors stats to the log file.
    """
    print('=============================================================================')
    print(' uniprot_extract_from_both.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=============================================================================')

    # get the UniProt folder and then get the sprot and trembl database names
    DB = []
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    uniprot_folder = fasta_lib.get_folder(
        default, title_string='Select a UniProt download folder')
    if uniprot_folder == '':
        sys.exit()  # cancel button response
    # folder naming convention carries the release version after the last "_"
    version = uniprot_folder.split('_')[-1]
    uniprot_db = 'uniprot'
    for files in os.listdir(uniprot_folder):
        if files.startswith('uniprot_') and files.endswith('.gz'):
            DB.append(os.path.join(uniprot_folder, files))
    if len(DB) != 2:
        print('WARNING: either sprot or trembl DB was missing')

    # create a log file to mirror screen output
    log_obj = open(os.path.join(uniprot_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile(
        '\n>>> starting: uniprot_extract_from_both.py', log_obj)

    # make the smaller uniprot dictionaries
    (sci_to_taxon, id_to_taxon) = fasta_lib.make_uniprot_to_taxon(uniprot_folder)

    # make the more complete dictionary
    name_to_taxon = fasta_lib.make_all_names_to_taxon(uniprot_folder)

    # print the list of taxon numbers that will be extracted
    # NOTE: Any taxon numbers present in analysis text file will not be expanded.
    taxon_list = list(taxon_dict.items())
    taxon_list.sort()
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' %
                  (i + 1, t[0], t[1]), file=obj)

    # expand any group taxon numbers
    if EXPAND_GROUPS:
        fasta_lib.expand_species(uniprot_folder, 'uniprot', taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT)

    # inititalize dictionaries and counters
    taxon_files, taxon_count, name_count = {}, {}, {}
    for taxon, name in taxon_dict.items():
        fname = uniprot_db + '_' + version + '_' + name + '.fasta'
        fname = os.path.join(uniprot_folder, fname)
        taxon_files[name] = fname
        taxon_count[taxon] = 0
        name_count[name] = 0

    # open the output filenames (replace path strings with file objects)
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # want to count extracted sequences from each database
    name_counter = {}
    number_counter = {}

    # loop over both databases and extract species
    duplicates = {}
    for i in range(len(DB)):
        # per-database counters are reset each pass
        prot_read = 0
        not_found = 0
        for value in taxon_dict.values():
            name_counter[value] = 0
        for key in taxon_dict.keys():
            number_counter[key] = 0

        # create a FastaReader object, initialize counters, and start reading
        uniprot_file = DB[i]
        x = fasta_lib.FastaReader(uniprot_file)
        prot = fasta_lib.Protein()
        for obj in write:
            print('...reading %s and extracting entries...' %
                  (os.path.split(uniprot_file)[1], ), file=obj)

        # NOTE: checking for errors will slow program execution, use if needed
        while x.readNextProtein(prot, check_for_errs=False):
            prot_read += 1
            if (prot_read % 500000) == 0:
                print('......(%s proteins read...)' %
                      ("{0:,d}".format(prot_read), ))
            (spec_id, spec_name) = fasta_lib.uniprot_parse_line(
                prot.accession + ' ' + prot.description)
            taxon = sci_to_taxon.get(spec_name, 0)  # first choice mapping
            taxon2 = name_to_taxon.get(spec_name, 0)  # alternative mapping
            if taxon == 0:  # first choice not present
                if taxon2 == 0:
                    not_found += 1
                else:
                    taxon = taxon2  # use second choice
            else:
                # keep track of multiple taxon numbers
                if (taxon != taxon2) and (taxon2 > 0):
                    duplicates[spec_name] = (taxon, taxon2)
            if taxon_dict.get(taxon, False):
                if CLEAN_ACCESSIONS:
                    prot.parseUniProt()
                # taxon number matches, so write the protein to respective output file(s)
                name = taxon_dict[taxon]
                name_counter[name] += 1
                name_count[name] += 1
                taxon_count[taxon] += 1
                number_counter[taxon] += 1
                f = taxon_files[name]
                prot.printProtein(f)

        # print extraction stats for each database
        # BUGFIX: original printed DB[0] here, so the second pass mislabeled
        # its stats with the first database's name; use the current DB[i]
        db_name = os.path.split(DB[i])[1]
        for obj in write:
            print('...%s protein entries in %s' %
                  ("{0:,d}".format(prot_read), db_name), file=obj)
            print('...%s proteins had unknown taxon numbers' %
                  ("{0:,d}".format(not_found), ), file=obj)
            numbers = list(number_counter.keys())
            numbers.sort()
            if VERBOSE:
                for j, number in enumerate(numbers):
                    if number_counter[number] > 0:
                        print('......(%s) taxon %s had %s proteins' %
                              (j + 1, number,
                               "{0:,d}".format(number_counter[number])), file=obj)
            names = list(name_counter.keys())
            names.sort()
            for j, name in enumerate(names):
                print('......(%s) %s %s proteins extracted' %
                      (j + 1, "{0:,d}".format(name_counter[name]), name), file=obj)

    # close the extracted database files
    for f in taxon_files.values():
        f.close()

    # print list of mis-matched taxon number warnings
    if MISMATCHES:
        for i, (name, pair) in enumerate(duplicates.items()):
            for obj in write:
                print('......(%s) WARNING: %s and %s map to "%s"' %
                      (i + 1, pair[0], pair[1], name), file=obj)

    # print out the final summary stuff
    # NOTE: "names" carries over from the last database loop iteration
    for obj in write:
        if VERBOSE:
            print('...combined taxon counts...', file=obj)
            numbers = list(taxon_count.keys())
            numbers.sort()
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print('......(%s) taxon %s had %s proteins' %
                          (i + 1, number,
                           "{0:,d}".format(taxon_count[number])), file=obj)
        print('...combined output file counts...', file=obj)
        for i, name in enumerate(names):
            print('......(%s) %s total proteins written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]),
                   uniprot_db + '_' + version + '_' + name + '.fasta'), file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: uniprot_extract_from_both.py',
                                 log_obj)
    log_obj.close()
    return
# browse to the database
database = r"C:\Xcalibur\database"
if not os.path.exists(database):
    database = os.getcwd()
file_ext_list = [('FASTA files', '*.fasta'),
                 ('FASTA files', '*.fa'),
                 ('FASTA files', '*.gz')]
fasta_files = fasta_lib.get_files(database, file_ext_list,
                                  'Select a FASTA database')
if not fasta_files:
    sys.exit()  # cancel button response

# create a log file to mirror screen output
# printing to file=None goes to stdout, so messages hit console and log
_folder = os.path.split(fasta_files[0])[0]
log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
write = [None, log_obj]
fasta_lib.time_stamp_logfile('\n>>> starting: check_fasta.py', log_obj)

# process the FASTA files
for fasta_file in fasta_files:
    try:
        fasta_checker(fasta_file, write)
    except FileNotFoundError:
        # keep best-effort behavior (skip missing file) but say so
        # instead of silently passing
        for obj in write:
            print('...WARNING: could not open %s, skipping...' %
                  (fasta_file, ), file=obj)
    for obj in write:
        print(file=obj)

# finish up the log file
fasta_lib.time_stamp_logfile('>>> ending: check_fasta.py', log_obj)
log_obj.close()
# end
def main(fasta_file):
    """Checks entries in a FASTA protein database for identical duplicates.

    Arguments:
    "fasta_file" is the full path of the FASTA file to de-duplicate.

    No return value. Writes a non-redundant FASTA file next to the input
    and appends an analysis report to "duplicates.txt" in the same folder.
    """
    print('====================================================================')
    print(' remove_duplicates.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('====================================================================')
    print('\n Analysis results will be written to "duplicates.txt"')

    # set up the output file names, etc.
    folder = os.path.split(fasta_file)[0]
    out_file = os.path.join(folder, 'duplicates.txt')
    out_obj = open(out_file, 'a')
    # BUGFIX: initialize nr_database so an input with an unexpected
    # extension no longer raises NameError below
    nr_database = ''
    for end in ['.fasta', '.fasta.gz', '.fa.gz']:
        if fasta_file.endswith(end):
            nr_database = fasta_file.replace(end, '_nonredun.fasta')
    if (not nr_database) or (nr_database == fasta_file):
        nr_database = fasta_file + '_nonredun.fasta'
    nr_obj = open(nr_database, 'w')
    write = [None, out_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: check_for_duplicates.py',
                                 out_obj)

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot, head, dup = 0, 0, 0
    candidates = {}  # dictionary of acc:protein lists
    conflicts = {}  # keeps track of seq len and MW

    # first pass: read proteins until EOF, flag (length, MW) collisions
    while f.readNextProtein(p, check_for_errs=False):
        prot += 1
        control_A = p.description.count(chr(1))
        head = head + control_A + 1
        dup_data = (p.seqlenProtein(), p.molwtProtein())
        duplicate = conflicts.get(dup_data, False)
        if duplicate:
            dup += 1
            if candidates.get(duplicate, False):
                candidates[duplicate].append(copy.deepcopy(p))
            else:
                candidates[duplicate] = [copy.deepcopy(p)]
        else:
            conflicts[dup_data] = p.accession

    # get list of proteins to test for identity
    to_test = {}
    for key in candidates.keys():
        to_test[key] = True

    # second pass: copy proteins to "nr" file, checking for duplicates
    dup = 0
    skip = {}
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    print('Processing:', fasta_file, file=out_obj)  # header line to log file
    while f.readNextProtein(p, check_for_errs=False):
        if to_test.get(p.accession, False):
            dup += find_identities(p, candidates, skip, fasta_file, out_obj)
        if skip.get(p.accession, False):
            continue
        p.printProtein(nr_obj)
    # BUGFIX: close the non-redundant output so buffered writes are flushed
    nr_obj.close()
    for obj in [None, out_obj]:
        print('\nThere were', prot, 'total sequences in:',
              os.path.basename(fasta_file), file=obj)
        print('There were', dup, 'identical sequences removed\n\n', file=obj)
        # obj can be None (stdout); only file objects have close()
        try:
            obj.close()
        except AttributeError:
            pass
    return
def fasta_add_extras(extra_file, fasta_file, output_file):
    """Adds contaminants and reverses entries in a FASTA protein database.

    Arguments:
    "extra_file" is a FASTA file of extra sequences to prepend (renumbered
    as "EXTRA_dddd"), "fasta_file" is the main FASTA database, and
    "output_file" is the base name for the output files.

    Reversed DB written to same location.
    Options for separate or concatenated output files (module-level flags
    MAKE_SEPARATE_FORWARD/REVERSED/BOTH control which files are kept).
    """
    decoy_string = 'REV_'  # the string to denote decoy sequences
    print('=========================================================================')
    print(' add_extras_and_reverse.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=========================================================================')

    # open the "forward" and "reversed" output files
    _file = os.path.splitext(output_file)[0] + '.fasta'
    for_name = _file.replace('.fasta', '_for.fasta')
    for_file_obj = open(for_name, 'w')
    rev_name = _file.replace('.fasta', '_rev.fasta')
    rev_file_obj = open(rev_name, 'w')

    # create a log file to mirror screen output
    _folder = os.path.split(fasta_file)[0]
    log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: add_extras_and_reverse.py',
                                 log_obj)

    # create instances of reader object and protein object
    # add the extra sequences, accessions changed to "EXTRA_dddd"
    # NOTE: can only add up to 9999 sequences without code modification...
    prot = fasta_lib.Protein()
    pcount = 0
    f = fasta_lib.FastaReader(extra_file)

    # turn on error checking for extra sequences
    while f.readNextProtein(prot, check_for_errs=True):
        pcount += 1
        # try to clean up original accessions
        if CLEAN_ACCESSIONS:
            if prot.accession.startswith('gi|'):
                prot.parseNCBI(REF_SEQ_ONLY)
            elif prot.accession.startswith('sp|') or prot.accession.startswith('tr|'):
                prot.parseUniProt(KEEP_UNIPROT_ID)
            elif prot.accession.startswith('IPI:'):
                prot.parseIPI(KEEP_IPI_GENE_ID)
        # add old accession to description and make new accession
        # write original sequence to "forward" file and reversed to "reverse"
        prot.new_desc = '[%s] %s' % (prot.new_acc, prot.new_desc)
        prot.new_acc = 'EXTRA_%04d' % (pcount,)
        prot.accession = 'EXTRA_%04d' % (pcount,)
        prot.printProtein(for_file_obj)
        rev = prot.reverseProtein(decoy_string)
        rev.printProtein(rev_file_obj)
    for obj in write:
        print('...there were %s extra sequences in %s' %
              (pcount, os.path.split(extra_file)[1]), file=obj)

    # now add the contaminants; this is best-effort — any failure just warns
    # BUGFIX: narrowed bare "except:" (which also traps SystemExit and
    # KeyboardInterrupt) to "except Exception:"
    try:
        if os.path.exists(CONTAMS):
            contams_file = CONTAMS
        else:
            path = os.path.split(fasta_file)[0]
            contams_file = os.path.join(path, CONTAMS)
        f = fasta_lib.FastaReader(contams_file)
        contams = 0
        while f.readNextProtein(prot, check_for_errs=True):
            pcount += 1
            contams += 1
            if CLEAN_ACCESSIONS:
                prot.parseCONT()
            # write sequences to respective files
            prot.printProtein(for_file_obj)
            rev = prot.reverseProtein(decoy_string)
            rev.printProtein(rev_file_obj)
        for obj in write:
            print('...there were %s contaminant entries in %s' %
                  (contams, contams_file), file=obj)
    except Exception:
        for obj in write:
            print('...WARNING:', CONTAMS, 'not found!', file=obj)

    # read proteins, clean up accessions, descriptions until EOF
    # write proteins to "forward" and "reversed" files
    f = fasta_lib.FastaReader(fasta_file)

    # checking for errors can slow program execution by factor of 3-4
    # Reading and writing sequences will always remove spaces and blank lines
    while f.readNextProtein(prot, check_for_errs=False):
        pcount += 1
        if CLEAN_ACCESSIONS:
            if prot.accession.startswith('gi|'):
                prot.parseNCBI(REF_SEQ_ONLY)
            elif prot.accession.startswith('sp|') or prot.accession.startswith('tr|'):
                prot.parseUniProt(KEEP_UNIPROT_ID)
            elif prot.accession.startswith('IPI:'):
                prot.parseIPI(KEEP_IPI_GENE_ID)
        prot.printProtein(for_file_obj)  # write to "forward" file
        rev = prot.reverseProtein(decoy_string)
        rev.printProtein(rev_file_obj)  # write to "reversed" file

    # make concatenated output file if desired and print summary stats
    if MAKE_SEPARATE_BOTH:
        both_name = _file.replace('.fasta', '_both.fasta')
        both_file_obj = open(both_name, 'w')
        # reopen the just-written files for reading and concatenate them
        for_file_obj.close()
        for_file_obj = open(for_name, 'r')
        rev_file_obj.close()
        rev_file_obj = open(rev_name, 'r')
        while True:
            line = for_file_obj.readline()
            if not line:
                break
            both_file_obj.write(line)
        while True:
            line = rev_file_obj.readline()
            if not line:
                break
            both_file_obj.write(line)
        both_file_obj.close()
        for obj in write:
            print('...%s total proteins written to %s' %
                  (2 * pcount, os.path.split(both_name)[1]), file=obj)
    if MAKE_SEPARATE_FORWARD:
        for obj in write:
            print('...%s proteins written to %s' %
                  (pcount, os.path.split(for_name)[1]), file=obj)
    if MAKE_SEPARATE_REVERSED:
        for obj in write:
            print('...%s proteins reversed and written to %s' %
                  (pcount, os.path.split(rev_name)[1]), file=obj)

    # close files and delete unwanted files
    for_file_obj.close()
    rev_file_obj.close()
    fasta_lib.time_stamp_logfile('>>> ending: add_extras_and_reverse.py',
                                 log_obj)
    log_obj.close()
    if not MAKE_SEPARATE_FORWARD:
        os.remove(for_name)
    if not MAKE_SEPARATE_REVERSED:
        os.remove(rev_name)
    return
def main(string_dict):
    """Main program to extract entries containing strings from databases.

    Simple string search of pattern in combined accession/description
    lines. Logical OR if more than one pattern is mapped to the same
    outfile. Each matching protein is written once per output file with
    possible compound header (nr) of all headers containing matching
    patterns. If "cleaning" of accessions/descriptions is turned on for
    NCBI nr databases, only the first header element will be retained and
    any accession number cross-references will be lost.

    Written by Phil Wilmarth, OHSU, 2009.
    """
    print('=====================================================================')
    print(' extract_by_string.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=====================================================================')

    # set some file paths and names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    # BUGFIX: dialog title said "FASTSA" — corrected to "FASTA"
    db_file = fasta_lib.get_file(default,
                                 [('Zipped files', '*.gz'),
                                  ('Fasta files', '*.fasta')],
                                 title_string='Select a FASTA database')
    if db_file == '':
        sys.exit()  # cancel button response
    db_folder, db_name = os.path.split(db_file)
    base_name = db_name.replace('.gz', '')
    if not base_name.endswith('.fasta'):
        base_name = base_name + '.fasta'

    # create a log file to mirror screen output
    log_obj = open(os.path.join(db_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: extract_by_string.py',
                                 log_obj)

    # print the list of patterns that will be extracted
    string_list = list(string_dict.items())
    string_list.sort()
    for obj in write:
        print('...extracting entries containing these strings:', file=obj)
        for i, t in enumerate(string_list):
            print('......(%s) string "%s" to file ending in "%s"' %
                  (i + 1, t[0], t[1]), file=obj)

    # open the output databases, initialize counters
    string_files = {}
    string_count = {}
    name_count = {}
    for string, name in string_dict.items():
        fname = base_name.replace('.fasta', '_' + name + '.fasta')
        fname = os.path.join(db_folder, fname)
        string_files[name] = fname
        string_count[string] = 0
        name_count[name] = 0
    # replace path strings with open file objects
    for name in string_files.keys():
        string_files[name] = open(string_files[name], 'w')

    # create a FastaReader object, initialize counters, and start reading
    x = fasta_lib.FastaReader(db_file)
    prot = fasta_lib.Protein()
    prot_read = 0
    for obj in write:
        print('...reading %s and extracting entries...' % (db_name, ), file=obj)
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 500000) == 0:
            print('......(%s proteins read...)' %
                  ("{0:,d}".format(prot_read), ))
        written = {}  # make sure protein is written only ONCE per OUTFILE
        header = prot.accession + ' ' + prot.description  # recreate the '>' line
        if not CASE_SENSITIVE:  # convert to uppercase
            header = header.upper()
        for pattern in string_dict.keys():
            new_pattern = pattern
            if not CASE_SENSITIVE:  # case insensitive matching
                new_pattern = new_pattern.upper()
            # nr headers are control-A separated; check each header for matches
            for head in header.split(chr(1)):
                if new_pattern in head:
                    name = string_dict[pattern]
                    name_header = written.get(name, '')
                    if name_header:
                        # accumulate a compound (nr-style) header per out file
                        name_header = name_header + chr(1) + head
                        written[name] = name_header
                    else:
                        written[name] = head
                    string_count[pattern] += 1
        # write any matching proteins to appropriate out file
        for name in written.keys():
            name_count[name] += 1  # output file write counters
            f = string_files[name]  # output file pointers
            header = written[name]  # composite header of name's matches
            # set the accession and description fields before writing
            prot.accession = header.split()[0]
            prot.new_acc = prot.accession
            prot.description = header[(len(prot.accession) + 1):]
            prot.new_desc = prot.description
            if CLEAN_ACCESSIONS:
                if prot.accession.startswith('gi|'):
                    prot.parseNCBI(REF_SEQ_ONLY)
                elif prot.accession.startswith('sp|') or prot.accession.startswith('tr|'):
                    prot.parseUniProt(KEEP_UNIPROT_ID)
            prot.printProtein(f)  # write any matching proteins

    # close files
    for f in string_files.values():
        f.close()

    # print out the summary stuff
    for obj in write:
        print('...%s protein entries in %s' %
              ("{0:,d}".format(prot_read), db_name), file=obj)
        strings = list(string_count.keys())
        strings.sort()
        for i, string in enumerate(strings):
            print('......(%s) pattern "%s" was found in %s proteins' %
                  (i + 1, string, "{0:,d}".format(string_count[string])), file=obj)
        print('...output file summaries...', file=obj)
        names = list(string_files.keys())
        names.sort()
        for i, name in enumerate(names):
            temp = base_name.replace('.fasta', '_' + name + '.fasta')
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]), temp), file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: extract_by_string.py', log_obj)
    log_obj.close()
    return
def main(fasta_file, forward=False, reverse=False, both=True, log_obj=None, contam_path=""):
    """Adds contaminants and reverses entries for a FASTA protein database.

    Call with single fasta file name.
    If "forward", make sequences plus contaminants,
    if "reverse", make reversed sequences with reversed contaminants,
    if "both", make concatenated target/decoy with contaminants.
    "log_obj" is an optional already-open log file; if omitted one is
    opened next to "fasta_file".
    "contam_path" is optional fullpath name of a contaminants database
    to use instead of default.

    NOTE(review): this function closes "log_obj" at the end even when the
    caller supplied it — confirm callers expect their handle to be closed.
    """
    decoy_string = 'REV_'  # the string to denote decoy sequences

    ######################################
    # Change default contaminants file name here:
    CONTAMS = 'Thermo_contams.fasta'  # or pass in a "contams_path"
    ######################################

    # open the "forward" and "reversed" output files
    # strip a trailing ".gz" before removing the FASTA extension
    if fasta_file.lower().endswith('.gz'):
        _file = os.path.splitext(fasta_file[:-3])[0]
    else:
        _file = os.path.splitext(fasta_file)[0]
    for_name = _file + '_for.fasta'
    for_file_obj = open(for_name, 'w')
    rev_name = _file + '_rev.fasta'
    rev_file_obj = open(rev_name, 'w')

    # create the name for the concatenated file (if later needed)
    both_name = _file + '_both.fasta'

    # create a log file to mirror screen output
    # printing to file=None goes to stdout, so messages hit console and log
    _folder = os.path.split(fasta_file)[0]
    if not log_obj:
        log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: reverse_fasta.py', log_obj)

    # create instances protein object and initialize counter
    prot = fasta_lib.Protein()
    p_read = 0
    p_contam = 0

    # try to find the contaminants database file
    # If no contam file path provided, search for it in current directory,
    # then in the FASTA file's folder; "contam_path" may be a file or a
    # folder containing the default CONTAMS file
    # (_file is reused here as the resolved contaminants path)
    _file = None
    if not contam_path:
        if os.path.exists(CONTAMS):
            _file = CONTAMS
        else:
            path = os.path.split(fasta_file)[0]
            if os.path.exists(os.path.join(path, CONTAMS)):
                _file = os.path.join(path, CONTAMS)
    elif os.path.exists(contam_path) and os.path.isfile(contam_path):
        _file = contam_path
    elif os.path.isdir(contam_path) and os.path.exists(os.path.join(contam_path, CONTAMS)):
        _file = os.path.join(contam_path, CONTAMS)

    # create reader and add contaminants (if contams file was found)
    if _file:
        f = fasta_lib.FastaReader(_file)
        # error checking ON for contaminants (small file, worth validating)
        while f.readNextProtein(prot, check_for_errs=True):
            p_contam += 1
            prot.printProtein(for_file_obj)
            rev = prot.reverseProtein(decoy_string)
            rev.printProtein(rev_file_obj)
        for obj in write:
            print('...there were %s contaminant entries in %s' %
                  ("{0:,d}".format(p_contam), os.path.split(_file)[1]), file=obj)
    else:
        for obj in write:
            print('...WARNING: contaminants were not added', file=obj)

    # read proteins until EOF and write proteins to "forward" and "reversed" files
    f = fasta_lib.FastaReader(fasta_file)

    # error checking slows program execution, turn on if needed.
    # Reading and writing sequences always removes spaces and blank lines.
    while f.readNextProtein(prot, check_for_errs=False):
        p_read += 1
        prot.printProtein(for_file_obj)  # write to "forward" file
        rev = prot.reverseProtein(decoy_string)
        rev.printProtein(rev_file_obj)  # write to "reversed" file
    for_file_obj.close()
    rev_file_obj.close()
    for obj in write:
        print('...%s proteins read from %s' %
              ("{0:,d}".format(p_read), os.path.split(fasta_file)[1]), file=obj)

    # make concatenated output file if desired and print summary stats
    if both:
        # reopen the just-written files for reading and concatenate them
        both_file_obj = open(both_name, 'w')
        for_file_obj = open(for_name, 'r')
        rev_file_obj = open(rev_name, 'r')
        while True:
            line = for_file_obj.readline()
            if not line:
                break
            both_file_obj.write(str(line))
        while True:
            line = rev_file_obj.readline()
            if not line:
                break
            both_file_obj.write(str(line))
        both_file_obj.close()
        for obj in write:
            print('...%s total proteins written to %s' %
                  ("{0:,d}".format(2*(p_contam+p_read)),
                   os.path.split(both_name)[1]), file=obj)
    if forward:
        for obj in write:
            print('...%s proteins written to %s' %
                  ("{0:,d}".format(p_contam+p_read),
                   os.path.split(for_name)[1]), file=obj)
    if reverse:
        for obj in write:
            print('...%s proteins reversed and written to %s' %
                  ("{0:,d}".format(p_contam+p_read),
                   os.path.split(rev_name)[1]), file=obj)

    # close files and delete unwanted files
    # (the separate forward/reversed files are only kept if requested)
    for_file_obj.close()
    rev_file_obj.close()
    fasta_lib.time_stamp_logfile('>>> ending: reverse_fasta.py', log_obj)
    log_obj.close()
    if not forward:
        os.remove(for_name)
    if not reverse:
        os.remove(rev_name)
    return
def fasta_counter(fasta_file):
    """Counts entries in a FASTA protein database.

    Arguments:
    "fasta_file" is the full path of the FASTA file to scan.

    Checks for duplicate accessions and valid aa characters.
    Computes protein sequence lengths, molecular weights - writes to
    TXT file (with DB basename). No return value.
    """
    # create a log file to mirror screen output
    # printing to file=None goes to stdout, so messages hit console and log
    _folder = os.path.split(fasta_file)[0]
    log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: count_fasta.py', log_obj)

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot = 0
    head = 0
    conflict = {}  # accession -> MW, used to detect duplicate accessions

    # construct summary file name based on FASTA name
    # BUGFIX: the regex dots were unescaped, so e.g. "proteins_fasta"
    # would be rewritten to "proteins.txt"; escape them so only a literal
    # ".fasta(.gz)" suffix is replaced
    if fasta_file.endswith('.fasta.gz'):
        summary_file = re.sub(r'\.fasta\.gz$', r'.txt', fasta_file)
    else:
        summary_file = re.sub(r'\.fasta$', r'.txt', fasta_file)
    if summary_file == fasta_file:
        summary_file = fasta_file + '.txt'

    # open summary file and write header
    summary_obj = open(summary_file, mode='wt')
    summary_obj.write('Accession\tLength\tMW\n')

    # read proteins until EOF; NOTE: checking for errors slows program by factor of 3-4
    while f.readNextProtein(p, check_for_errs=True):
        # count protein sequences, with a progress message every 500k
        prot += 1
        if (prot % 500000) == 0:
            print('......(%s proteins read...)' % ("{0:,d}".format(prot), ))
        # check for duplicate accession; an equal MW suggests the whole
        # sequence may be duplicated, not just the accession
        dup = conflict.get(p.accession, False)
        if dup:
            for obj in write:
                print('\n...WARNING: %s is already in FASTA database!\n' %
                      (p.accession, ), file=obj)
                if p.molwtProtein(show_errs=False) == conflict[p.accession]:
                    print('......possible duplicated sequence...', file=obj)
        else:
            conflict[p.accession] = p.molwtProtein(show_errs=False)
        # count number of header elements (nr headers are chr(1)-separated)
        control_A = p.description.count(chr(1))
        head = head + control_A + 1
        # add info to summary_file
        print('\t'.join([
            p.accession,
            str(p.seqlenProtein()),
            str(round(p.molwtProtein(), 1))
        ]), file=summary_obj)

    # print results and return
    for obj in write:
        print('...there are %s proteins in %s' %
              ("{0:,d}".format(prot), os.path.split(fasta_file)[1]), file=obj)
        if head > prot:
            print('...there were %s header lines' %
                  ("{0:,d}".format(head), ), file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: count_fasta.py', log_obj)
    log_obj.close()
    summary_obj.close()
    return
def main(taxon_dict):
    """Main program to extract entries by taxon ID from NCBI nr databases.

    Arguments:
        taxon_dict - dictionary of {taxon number: output file tag string}.

    Each accession (of each control-A separated header) is looked up to find
    the associated taxon number for comparison to desired taxon numbers. A
    separate protein entry will be written for each desired taxon number even
    if all taxon numbers are written to the same output file, so at the
    protein level the extracted databases may no longer be non-redundant.
    If "cleaning" of accessions/descriptions is turned off, all headers
    matching the desired taxon numbers will be added to the respective
    protein, preserving the usual NCBI nr formatting structure. If cleaning
    of accessions is turned on during extraction, some information may be
    lost, which could make subsequent database processing (such as extracting
    by text string) fail. Cleaning is best done as a last step
    (i.e. in "reverse_fasta.py"). No return value.
    """
    print('====================================================================')
    print(' nr_extract_taxon.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('====================================================================')

    # set some file paths and names (GUI file picker; default browse location
    # falls back to the current working directory if the Xcalibur path is absent)
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    nr_file = fasta_lib.get_file(default,
                                 [('Zipped files', '*.gz'), ('Fasta files', '*.fasta')],
                                 title_string='Select an NCBI nr database')
    if nr_file == '':
        sys.exit()  # cancel button response
    ncbi_folder, nr_name = os.path.split(nr_file)
    nr_db = os.path.splitext(nr_name)[0]

    # create a log file to mirror screen output ("None" destination = stdout)
    log_obj = open(os.path.join(ncbi_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: nr_extract_taxon.py', log_obj)

    # get the saved accession-to-taxon-number mapping object
    acc_to_taxon = fasta_lib.AccToTaxon(ncbi_folder)
    acc_to_taxon.create_or_load(ncbi_folder)

    # print the list of taxon numbers that will be extracted.
    # NOTE(review): "original_dict" is saved but not used anywhere later in
    # this function - possibly a leftover; confirm before removing.
    original_dict = taxon_dict
    taxon_list = list(taxon_dict.items())
    taxon_list.sort()
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' % (i + 1, t[0], t[1]), file=obj)

    # expand any group taxon numbers. NOTE: if a taxon number appears in
    # "nr_fasta_analyze.txt", it will not be expanded. Either delete the
    # line in "nr_fasta_analyze.txt", or make an expanded "taxon_dict" by hand.
    # (EXPAND_GROUPS and the MIN_*/REF_SEQ_ONLY settings are module-level
    # configuration flags defined elsewhere in this file.)
    if EXPAND_GROUPS:
        fasta_lib.expand_species(ncbi_folder, 'nr', taxon_dict, MIN_SEQUENCE_COUNT,
                                 MIN_GROUP_SEQ_COUNT, REF_SEQ_ONLY)

    # open the output databases, initialize counters, etc.
    taxon_files = {}    # tag name -> output path (replaced below by file object)
    taxon_count = {}    # taxon number -> extracted protein count
    name_count = {}     # tag name -> extracted protein count
    for taxon, name in taxon_dict.items():
        fname = nr_db + '_' + name + '.fasta'
        fname = os.path.join(ncbi_folder, fname)
        taxon_files[name] = fname
        name_count[name] = 0
        taxon_count[taxon] = 0

    # open the output filenames
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # loop over all proteins in nr
    x = fasta_lib.FastaReader(nr_file)
    prot = fasta_lib.Protein()
    prot_read = 0   # total protein entries read
    not_found = 0   # headers whose accession had no known taxon number
    skipped = 0     # headers whose taxon number is not in taxon_dict
    for obj in write:
        print('...reading %s and extracting entries...' % (nr_name, ), file=obj)

    # checking for errors slows down program by about a factor of 3 or 4
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 1000000) == 0:
            print('......(%s proteins read...)' % ("{0:,d}".format(prot_read), ))
        written = {}    # taxon number -> protein copy to write for this entry
        line = prot.accession + ' ' + prot.description
        prot.new_desc = ''
        # extract the accessions from each control-A separated header
        for header in line.split(chr(1)):
            accession_with_version = header.split()[0]
            accession = accession_with_version.split('.')[0]
            if REF_SEQ_ONLY and '_' not in accession:
                continue  # skip proteins without RefSeq entries
            taxon = acc_to_taxon.get(accession, False)
            # see if taxon number for this accession is in our desired list
            if taxon:
                if taxon_dict.get(taxon, False):
                    if written.get(taxon, False):
                        # taxon number already seen: append this header.
                        # NOTE(review): "prot" is rebound to the stored copy
                        # here, so any later headers of this same entry then
                        # operate on that copy rather than on the reader's
                        # protein object. This appears deliberate in this
                        # codebase but is fragile - confirm before
                        # restructuring this loop.
                        prot = written[taxon]
                        prot.description = prot.description + chr(1) + header
                        written[taxon] = copy.deepcopy(prot)
                    else:
                        # first time this taxon number is seen for this entry
                        name = taxon_dict[taxon]
                        prot.accession = header.split()[0]
                        prot.description = header[len(prot.accession) + 1:]
                        prot.description = prot.description.rstrip()
                        taxon_count[taxon] += 1
                        name_count[name] += 1
                        written[taxon] = copy.deepcopy(prot)
                else:
                    skipped += 1
            else:
                not_found += 1
                continue  # redundant (last statement of the loop body) but harmless

        # write a protein sequence for each taxon number it was matched to
        for taxon in written.keys():
            name = taxon_dict[taxon]
            f = taxon_files[name]
            prot = written[taxon]
            prot.new_desc = prot.description
            prot.new_acc = prot.accession
            if CLEAN_ACCESSIONS:
                prot.parseNCBI(REF_SEQ_ONLY)
            prot.printProtein(f)

    # print out number of matches and close files
    for obj in write:
        print('...%s proteins in %s' % ("{0:,d}".format(prot_read), nr_name), file=obj)
        print('...%s accessions did not have known taxon numbers' %
              ("{0:,d}".format(not_found), ), file=obj)
        print('...%s accessions were skipped (not in our taxon list)' %
              ("{0:,d}".format(skipped), ), file=obj)
        if REF_SEQ_ONLY:
            print('...Extracted sequences are RefSeq Only!!!', file=obj)
        if VERBOSE:
            numbers = list(taxon_count.keys())
            numbers.sort()
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print('......(%s) taxon number %s had %s proteins' %
                          (i + 1, number, "{0:,d}".format(taxon_count[number])), file=obj)
        print('...output file summaries...', file=obj)
        names = list(taxon_files.keys())
        names.sort()
        for i, name in enumerate(names):
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]), nr_db + '_' + name + '.fasta'), file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: nr_extract_taxon.py', log_obj)
    log_obj.close()
    for f in taxon_files.values():
        f.close()
    return
def main(taxon_dict):
    """Extract protein entries by taxon ID from a single UniProt database.

    Arguments:
        taxon_dict - dictionary of {taxon number: output file tag string}.

    Prompts for a downloaded Sprot or Trembl FASTA file, maps each entry's
    species name to a taxon number, and writes entries whose taxon number is
    in "taxon_dict" to per-taxon FASTA files. Progress and summary lines are
    echoed to the screen and appended to "fasta_utilities.log". No return value.
    """
    print('============================================================================')
    print(' uniprot_extract_from_one.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('============================================================================')

    # pick the database file; default browse location falls back to the cwd
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    uniprot_file = fasta_lib.get_file(
        default, [('Zipped files', '*.gz'), ('Fasta files', '*.fasta')],
        title_string='Select an Sprot or Trembl database')
    if uniprot_file == '':
        sys.exit()  # user pressed cancel
    uniprot_folder, uniprot_name = os.path.split(uniprot_file)
    version = uniprot_name.split('_')[-1].replace('.fasta.gz', '')
    uniprot_db = uniprot_name.split('_')[1]

    # log file mirrors everything printed to the screen ("None" = stdout)
    log_obj = open(os.path.join(uniprot_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: uniprot_extract_from_one.py', log_obj)

    # species-name lookups: UniProt's own tables plus the fuller NCBI table
    (sci_to_taxon, id_to_taxon) = fasta_lib.make_uniprot_to_taxon(uniprot_folder)
    name_to_taxon = fasta_lib.make_all_names_to_taxon(uniprot_folder)

    # echo the taxon numbers that will be extracted
    for out in write:
        print('...extracting these taxon numbers:', file=out)
        for idx, pair in enumerate(sorted(taxon_dict.items())):
            print('......(%s) taxon %s to file tagged with "%s"' % (idx + 1, pair[0], pair[1]), file=out)

    # optionally expand any group taxon numbers
    # NOTE: taxon numbers already present in the analysis text file are not expanded
    if EXPAND_GROUPS:
        fasta_lib.expand_species(uniprot_folder, uniprot_db, taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT)

    # set up per-taxon output files and counters
    taxon_files = {}    # tag name -> open output file
    taxon_count = {}    # taxon number -> extracted protein count
    name_count = {}     # tag name -> extracted protein count
    for taxon, name in taxon_dict.items():
        out_path = os.path.join(uniprot_folder,
                                uniprot_db + '_' + version + '_' + name + '.fasta')
        taxon_files[name] = open(out_path, 'w')
        taxon_count[taxon] = 0
        name_count[name] = 0

    # stream the database and extract matching entries
    reader = fasta_lib.FastaReader(uniprot_file)
    protein = fasta_lib.Protein()
    prot_read = 0
    not_found = 0
    duplicates = {}     # species names mapping to two different taxon numbers
    for out in write:
        print('...reading %s and extracting entries...' % (uniprot_name, ), file=out)

    # checking for errors in sequences slows program execution, use as needed
    while reader.readNextProtein(protein, check_for_errs=False):
        prot_read += 1
        if (prot_read % 500000) == 0:
            print('......(%s proteins read...)' % ("{0:,d}".format(prot_read), ))
        (spec_id, spec_name) = fasta_lib.uniprot_parse_line(
            protein.accession + ' ' + protein.description)
        taxon = sci_to_taxon.get(spec_name, 0)    # preferred mapping
        taxon2 = name_to_taxon.get(spec_name, 0)  # alternative mapping
        if taxon == 0 and taxon2 == 0:
            not_found += 1              # no mapping at all
        elif taxon == 0:
            taxon = taxon2              # fall back to the alternative mapping
        elif taxon != taxon2 and taxon2 > 0:
            duplicates[spec_name] = (taxon, taxon2)   # remember the conflict
        if taxon_dict.get(taxon, False):
            if CLEAN_ACCESSIONS:
                protein.parseUniProt()
            # taxon number matches: write the protein to its output file
            name = taxon_dict[taxon]
            name_count[name] += 1
            taxon_count[taxon] += 1
            protein.printProtein(taxon_files[name])

    # close the extracted database files
    for handle in taxon_files.values():
        handle.close()

    # report species names with conflicting taxon numbers
    if MISMATCHES:
        for idx, (spec, pair) in enumerate(duplicates.items()):
            for out in write:
                print('......(%s) WARNING: %s and %s map to "%s"' %
                      (idx + 1, pair[0], pair[1], spec), file=out)

    # print the summary information
    for out in write:
        print('...%s protein entries in %s' %
              ("{0:,d}".format(prot_read), uniprot_name), file=out)
        print('...%s proteins had unknown taxon numbers' % (not_found, ), file=out)
        if VERBOSE:
            for idx, number in enumerate(sorted(taxon_count.keys())):
                if taxon_count[number] > 0:
                    print('......(%s) taxon %s had %s proteins' %
                          (idx + 1, number, "{0:,d}".format(taxon_count[number])), file=out)
        print('...output file summaries...', file=out)
        for idx, name in enumerate(sorted(taxon_files.keys())):
            print('......(%s) %s proteins extracted and written to %s' %
                  (idx + 1, "{0:,d}".format(name_count[name]),
                   uniprot_db + '_' + version + '_' + name + '.fasta'), file=out)
    fasta_lib.time_stamp_logfile('>>> ending: uniprot_extract_from_one.py', log_obj)
    log_obj.close()
    return