    return (candidates, index)
    # end


# setup stuff: check for command line args, etc.
if __name__ == '__main__':

    # check if database name passed on command line
    if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
        fasta_file = sys.argv[1]

    # if not, browse to database file
    else:
        if len(sys.argv) > 1:
            print('...WARNING: %s not found...' % (sys.argv[1], ))
        database = r'C:\Xcalibur\database'
        if not os.path.exists(database):
            database = os.getcwd()
        fasta_file = fasta_lib.get_file(database,
                                        [('FASTA files', '*.fasta'),
                                         ('Zipped FASTA files', '*.gz'),
                                         ('All files', '*.*')],
                                        'Select a FASTA database')
        if fasta_file == '': sys.exit()  # cancel button response

    # call main function
    candidates, index = main(fasta_file)

# end
Example #2
# updated for Python 3 -PW 7/6/2017

import os
import sys
import copy
import fasta_lib

# print program name and version
print('============================================================')
print(' program TriTryp_fixer.py, v1.0.2, Phil Wilmarth, OHSU 2017 ')
print('============================================================')

# browse to the database
database = r"C:\Xcalibur\database"
if not os.path.exists(database):
    database = os.getcwd()
fasta_file = fasta_lib.get_file(database, [('FASTA files', '*.fasta')],
                                'Select a TriTryp FASTA database')
if fasta_file == '': sys.exit()  # cancel button response

# build new database name
new_fasta_file = os.path.basename(fasta_file)
new_fasta_file = new_fasta_file.replace('.fasta', '_fixed.fasta')
new_fasta_file = os.path.join(os.path.dirname(fasta_file), new_fasta_file)

# initializations
proteins = []
p = fasta_lib.Protein()
pcount = 0
stop_count = 0
gap_count = 0
no_met = 0
Example #3
    def select_defaults_and_load(self):
        """Let user browse to a defaults file and load the species."""
        self.selected_default = fasta_lib.get_file(
            self.script_path, [('Text files', '*.txt')],
            'Select a default species list file')
        self.load_defaults()
Example #4
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Written by Phil Wilmarth, OHSU, 2016
"""
import os
import sys
import time
import copy
import fasta_lib

# control flags
KEEP_CONTAMS = False

# get the files, etc.
list_file_name = fasta_lib.get_file(os.getcwd(), [('Text files', '*.txt')],
                                    'Browse to accession list text file')
if not list_file_name: sys.exit()
results_location = os.path.split(list_file_name)[0]
database_name = fasta_lib.get_file(r'C:\Xcalibur\database',
                                   [('FASTA files', '*.fasta')],
                                   'Select the database')
if not database_name: sys.exit()
new_name = os.path.split(database_name)[1]
new_name = os.path.splitext(new_name)[0]
subset_DB_name = fasta_lib.save_file(results_location,
                                     [('FASTA files', '*.fasta')],
                                     default_file=new_name + '_subset.fasta',
                                     title_string='Name of subset database')
if not subset_DB_name: sys.exit()
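# add a .fasta extension if the user did not supply one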
if os.path.splitext(subset_DB_name)[1] == '':
    subset_DB_name += '.fasta'
Example #5
def main(node_taxon):
    """Program to process taxonomy nodes file and find groups of species.
    """
    print(
        '======================================================================='
    )
    print(
        ' taxon_group_analyzer.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 '
    )
    print(
        '======================================================================='
    )

    # get the name of the database analysis text file
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    ext_list = [('Text files', '*.txt'), ('All files', '*.*')]
    analysis_file = fasta_lib.get_file(default, ext_list,
                                       'Select a species analysis file')
    if analysis_file == '': sys.exit()  # cancel button response

    analysis_folder, short_file = os.path.split(analysis_file)
    print('...making taxonomy nodes dictionary...')

    # may need to check if this works in Python 3
    archive_name = os.path.join(analysis_folder, 'taxdump.tar.gz')
    archive = tarfile.open(archive_name)
    nodes = archive.extractfile('nodes.dmp')

    # read file and save taxon to parent taxon mappings
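    # nodes.dmp fields are '\t|\t'-delimited: tax_id, parent tax_id, rank, ...
    # e.g. "9606\t|\t9605\t|\tspecies\t|\t..." maps human (9606) to Homo (9605)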
    taxon_to_parent = {}
    while True:
        line = nodes.readline().decode('utf-8').rstrip()
        if not line:
            break
        item = line.split('\t|\t')
        taxon_to_parent[int(item[0])] = int(item[1])
    nodes.close()

    # open the fasta_analysis.txt file and find group members
    print('...scanning %s file...' % (short_file, ))
    fasta_analyze = open(analysis_file, 'r')
    out_name = analysis_file.replace('.txt', '_' + str(node_taxon) + '.txt')
    out_file = open(out_name, 'w')
    line = fasta_analyze.readline().rstrip()
    print('Analysis of node:', node_taxon, file=out_file)
    line = line.replace('A2:', 'A3:')
    print(line, file=out_file)
    member = 0
    while True:
        line = fasta_analyze.readline()  # read analyze text file line
        if not line:
            break
        else:
            line = line.rstrip()
        tree = []  # list of taxon number lineage
        parent = line.split('\t')[1]
        try:
            parent = int(parent)
        except ValueError:
            continue
        while parent != 1:  # all lineages end with taxon=1
            tree.append(parent)
            try:
                parent = taxon_to_parent[parent]
            except KeyError:
                break
        tree.append(1)  # add last lineage item
        if node_taxon in tree:  # see if desired node is anywhere in the list
            member += 1
            print(line, file=out_file)  # write lines of node members
    #
    fasta_analyze.close()
    out_file.close()
    print('...taxonomy node %s had %s members...' % (node_taxon, member))
    return
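# a minimal usage sketch, assuming the standard script entry point and the
# script-level imports above (40674 is the NCBI taxonomy number for Mammalia)
if __name__ == '__main__':
    main(40674)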
Example #6
    def browse_contams(self):
        """Dialog to browse to non-default contaminants database."""
        self.contams_database = fasta_lib.get_file(
            self.script_path, [('Fasta files', '*.fasta')],
            "Select a contaminants FASTA file")
        self.contams_label.config(text=os.path.split(self.contams_database)[1])
Example #7
        if os.path.exists(sys.argv[3]):
            output_file = sys.argv[3]

    # if not, get database files with dialog boxes
    else:
        if len(sys.argv) > 1:
            for i, db in enumerate([extra_file, fasta_file, output_file]):
                if not db:
                    print('...WARNING: %s not found...' % (sys.argv[i+1],))
        database = r'C:\Xcalibur\database'
        if not os.path.exists(database):
            database = os.getcwd()

        print('Select the FASTA file with extra sequences')
        extra_file = fasta_lib.get_file(database,
                                        [('FASTA files', '*.fasta'), ('All files', '*.*')],
                                        'Select Extra Sequences (FASTA format)')
        if extra_file == '': sys.exit() # cancel button response

        extra_name = os.path.split(extra_file)[1]
        extra_name = extra_name.split('.fasta')[0]
        print('Select the main FASTA file')
        fasta_file = fasta_lib.get_file(database, [('FASTA files', '*.fasta'),
                                                   ('GZipped files', '*.gz'),
                                                   ('All files', '*.*')],
                                        'Select FASTA database file')
        if fasta_file == '': sys.exit() # cancel button response

        default = os.path.split(fasta_file)[0]
        fasta_name = os.path.split(fasta_file)[1]
        default_file = extra_name + '_' + fasta_name
Example #8
def main(string_dict):
    """Main program to extract entries containing strings from databases.
        Simple string search of pattern in combined accession/description lines.
        Logical OR if more than one pattern is mapped to the same outfile.
        Each matching protein is written once per output file with possible
            compound header (nr) of all headers containing matching patterns.
            If "cleaning" of accessions/descriptions is turned on for NCBI nr
            databases, only the first header element will be retained and
            any accession number cross-references will be lost.

    Written by Phil Wilmarth, OHSU, 2009.
    """
    print(
        '====================================================================='
    )
    print(
        ' extract_by_string.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 '
    )
    print(
        '====================================================================='
    )

    # set some file paths and names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    db_file = fasta_lib.get_file(default, [('Zipped files', '*.gz'),
                                           ('Fasta files', '*.fasta')],
                                 title_string='Select a FASTA database')
    if db_file == '': sys.exit()  # cancel button response

    db_folder, db_name = os.path.split(db_file)
    base_name = db_name.replace('.gz', '')
    if not base_name.endswith('.fasta'):
        base_name = base_name + '.fasta'

    # create a log file to mirror screen output
    log_obj = open(os.path.join(db_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
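    # print(file=None) writes to stdout, so looping over [None, log_obj]
    # sends each message to both the console and the log file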
    fasta_lib.time_stamp_logfile('\n>>> starting: extract_by_string.py',
                                 log_obj)

    # print the list of patterns that will be extracted
    string_list = list(string_dict.items())
    string_list.sort()
    for obj in write:
        print('...extracting entries containing these strings:', file=obj)
        for i, t in enumerate(string_list):
            print('......(%s) string "%s" to file ending in "%s"' %
                  (i + 1, t[0], t[1]),
                  file=obj)

    # open the output databases, initialize counters
    string_files = {}
    string_count = {}
    name_count = {}
    for string, name in string_dict.items():
        fname = base_name.replace('.fasta', '_' + name + '.fasta')
        fname = os.path.join(db_folder, fname)
        string_files[name] = fname
        string_count[string] = 0
        name_count[name] = 0
    for name in string_files.keys():
        string_files[name] = open(string_files[name], 'w')

    # create a FastaReader object, initialize counters, and start reading
    x = fasta_lib.FastaReader(db_file)
    prot = fasta_lib.Protein()
    prot_read = 0
    for obj in write:
        print('...reading %s and extracting entries...' % (db_name, ),
              file=obj)
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 500000) == 0:
            print('......(%s proteins read...)' %
                  ("{0:,d}".format(prot_read), ))
        written = {}  # make sure protein is written only ONCE per OUTFILE
        header = prot.accession + ' ' + prot.description  # recreate the '>' line
        if not CASE_SENSITIVE:  # convert to uppercase
            header = header.upper()
        for pattern in string_dict.keys():
            new_pattern = pattern
            if not CASE_SENSITIVE:  # case insensitive matching
                new_pattern = new_pattern.upper()
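            # NCBI nr joins the headers of identical sequences with chr(1) (Ctrl-A)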
            for head in header.split(chr(1)):  # check each header for matches
                if new_pattern in head:
                    name = string_dict[pattern]
                    name_header = written.get(name, '')
                    if name_header:
                        name_header = name_header + chr(1) + head
                        written[name] = name_header
                    else:
                        written[name] = head
                        string_count[pattern] += 1

        # write any matching proteins to appropriate out file
        for name in written.keys():
            name_count[name] += 1  # output file write counters
            f = string_files[name]  # output file pointers
            header = written[name]  # composite header of name's matches

            # set the accession and description fields before writing
            prot.accession = header.split()[0]
            prot.new_acc = prot.accession
            prot.description = header[(len(prot.accession) + 1):]
            prot.new_desc = prot.description
            if CLEAN_ACCESSIONS:
                if prot.accession.startswith('gi|'):
                    prot.parseNCBI(REF_SEQ_ONLY)
                elif prot.accession.startswith(
                        'sp|') or prot.accession.startswith('tr|'):
                    prot.parseUniProt(KEEP_UNIPROT_ID)
            prot.printProtein(f)  # write any matching proteins

    # close files
    for f in string_files.values():
        f.close()

    # print out the summary stuff
    for obj in write:
        print('...%s protein entries in %s' %
              ("{0:,d}".format(prot_read), db_name),
              file=obj)
        strings = list(string_count.keys())
        strings.sort()
        for i, string in enumerate(strings):
            print('......(%s) pattern "%s" was found in %s proteins' %
                  (i + 1, string, "{0:,d}".format(string_count[string])),
                  file=obj)
        print('...output file summaries...', file=obj)
        names = list(string_files.keys())
        names.sort()
        for i, name in enumerate(names):
            temp = base_name.replace('.fasta', '_' + name + '.fasta')
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]), temp),
                  file=obj)

    fasta_lib.time_stamp_logfile('>>> ending: extract_by_string.py', log_obj)
    log_obj.close()
    return
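# a minimal usage sketch with hypothetical patterns and file tags; the full
# script also defines the CASE_SENSITIVE and CLEAN_ACCESSIONS flags used above
if __name__ == '__main__':
    main({'keratin': 'keratins', 'OS=Homo sapiens': 'human'})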
Example #9
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE."""

import os
import sys
import copy
import fasta_lib

# bit scores of 75-100 seem to be above random scores
MIN_BIT = 100.0

# get the two PAW results databases to blast against each other
default = os.getcwd()

print('Select the BLAST mapping file')
anno_file = fasta_lib.get_file(default, [('TXT files', '*.txt')], 'Select mapping file')
if not anno_file:
    sys.exit()     # cancel button was hit
    
default = os.path.dirname(anno_file)
print('Select the FASTA file')
orig_database = fasta_lib.get_file(default, [('FASTA files', '*.fasta')], 'Select database')
if not orig_database:
    sys.exit()    # cancel button was hit

new_database = orig_database.replace('.fasta', '_fixed.fasta')

# echo database names to console output
print('Mapping file:', os.path.basename(anno_file))
print('Original database:', os.path.basename(orig_database))
print('New database:', os.path.basename(new_database))
Example #10
def main(taxon_dict):
    """Main program to extract entries by taxon ID from NCBI nr databases.
        Each gi number (of each header) is looked up to find associated taxon
        number for comparison to desired taxon numbers.  A separate protein
        entry will be written for each desired taxon number even if all taxon
        numbers are written to the same output file.  At the protein level, the
        extracted databases may no longer be non-redundant.  If "cleaning" of
        accessions/descriptions is turned off, all headers matching the desired
        taxon numbers will be added to the respective protein preserving the
        usual NCBI nr formatting structure.  If cleaning of accessions is turned
        on during extraction, some information may be lost.  This could make
        subsequent database processing (such as extracting by text string) fail.
        Cleaning is best done as a last step (i.e. in "reverse_fasta.py").
    """
    print(
        '====================================================================')
    print(
        ' nr_extract_taxon.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print(
        '====================================================================')

    # set some file paths and names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    nr_file = fasta_lib.get_file(default, [('Zipped files', '*.gz'),
                                           ('Fasta files', '*.fasta')],
                                 title_string='Select an NCBI nr database')
    if nr_file == '': sys.exit()  # cancel button response

    ncbi_folder, nr_name = os.path.split(nr_file)
    nr_db = os.path.splitext(nr_name)[0]

    # create a log file to mirror screen output
    log_obj = open(os.path.join(ncbi_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: nr_extract_taxon.py',
                                 log_obj)

    # get the saved accession-to-taxon number dictionary
    acc_to_taxon = fasta_lib.AccToTaxon(ncbi_folder)
    acc_to_taxon.create_or_load(ncbi_folder)

    # print the list of taxon numbers that will be extracted
    original_dict = taxon_dict
    taxon_list = list(taxon_dict.items())
    taxon_list.sort()
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' %
                  (i + 1, t[0], t[1]),
                  file=obj)

    # expand any group taxon numbers.  NOTE: if a taxon number appears in
    # "nr_fasta_analyze.txt", it will not be expanded.  Either delete the
    # line in "nr_fasta_analyze.txt", or make an expanded "taxon_dict" by hand.
    if EXPAND_GROUPS:
        fasta_lib.expand_species(ncbi_folder, 'nr', taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT,
                                 REF_SEQ_ONLY)

    # open the output databases, initialize counters, etc.
    taxon_files = {}
    taxon_count = {}
    name_count = {}
    for taxon, name in taxon_dict.items():
        fname = nr_db + '_' + name + '.fasta'
        fname = os.path.join(ncbi_folder, fname)
        taxon_files[name] = fname
        name_count[name] = 0
        taxon_count[taxon] = 0

    # open the output filenames
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # loop over all proteins in nr
    x = fasta_lib.FastaReader(nr_file)
    prot = fasta_lib.Protein()
    prot_read = 0
    not_found = 0
    skipped = 0
    for obj in write:
        print('...reading %s and extracting entries...' % (nr_name, ),
              file=obj)

    # checking for errors slows down program by about a factor of 3 or 4
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 1000000) == 0:
            print('......(%s proteins read...)' %
                  ("{0:,d}".format(prot_read), ))
        written = {}
        line = prot.accession + ' ' + prot.description
        prot.new_desc = ''

        # extract the accession from each chr(1)-separated header
        for header in line.split(chr(1)):
            accession_with_version = header.split()[0]
            accession = accession_with_version.split('.')[0]
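            # RefSeq accessions (NP_, XP_, WP_, etc.) contain an underscore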
            if REF_SEQ_ONLY and '_' not in accession:
                continue  # skip proteins without RefSeq entries
            taxon = acc_to_taxon.get(accession, False)

            # see if taxon number for this gi is in our desired list
            if taxon:
                if taxon_dict.get(taxon, False):
                    if written.get(taxon, False):
                        # if taxon number already seen, add to header
                        prot = written[taxon]
                        prot.description = prot.description + chr(1) + header
                        written[taxon] = copy.deepcopy(prot)
                    else:
                        # first time taxon number seen
                        name = taxon_dict[taxon]
                        prot.accession = header.split()[0]
                        prot.description = header[len(prot.accession) + 1:]
                        prot.description = prot.description.rstrip()
                        taxon_count[taxon] += 1
                        name_count[name] += 1
                        written[taxon] = copy.deepcopy(prot)
                else:
                    skipped += 1
            else:
                not_found += 1
                continue

        # write a protein sequence for each taxon number it was matched to
        for taxon in written.keys():
            name = taxon_dict[taxon]
            f = taxon_files[name]
            prot = written[taxon]
            prot.new_desc = prot.description
            prot.new_acc = prot.accession
            if CLEAN_ACCESSIONS:
                prot.parseNCBI(REF_SEQ_ONLY)
            prot.printProtein(f)

    # print out number of matches and close files
    for obj in write:
        print('...%s proteins in %s' % ("{0:,d}".format(prot_read), nr_name),
              file=obj)
        print('...%s accessions did not have known taxon numbers' %
              ("{0:,d}".format(not_found), ),
              file=obj)
        print('...%s accessions were skipped (not in our taxon list)' %
              ("{0:,d}".format(skipped), ),
              file=obj)
        if REF_SEQ_ONLY:
            print('...Extracted sequences are RefSeq Only!!!', file=obj)
        if VERBOSE:
            numbers = list(taxon_count.keys())
            numbers.sort()
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print(
                        '......(%s) taxon number %s had %s proteins' %
                        (i + 1, number, "{0:,d}".format(taxon_count[number])),
                        file=obj)
        print('...output file summaries...', file=obj)
        names = list(taxon_files.keys())
        names.sort()
        for i, name in enumerate(names):
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(
                      name_count[name]), nr_db + '_' + name + '.fasta'),
                  file=obj)

    fasta_lib.time_stamp_logfile('>>> ending: nr_extract_taxon.py', log_obj)
    log_obj.close()
    for f in taxon_files.values():
        f.close()
    return
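# a minimal usage sketch with hypothetical taxon-to-tag mappings
# (9606 is human and 10090 is mouse in the NCBI taxonomy)
if __name__ == '__main__':
    main({9606: 'human', 10090: 'mouse'})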
Example #11
def main(taxon_dict):
    """Main program to extract entries by taxon ID from uniprot databases.
    Extraction is from a single downloaded Sprot or Trembl database.
    """
    print(
        '============================================================================'
    )
    print(
        ' uniprot_extract_from_one.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 '
    )
    print(
        '============================================================================'
    )

    # set some file paths and names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    uniprot_file = fasta_lib.get_file(
        default, [('Zipped files', '*.gz'), ('Fasta files', '*.fasta')],
        title_string='Select an Sprot or Trembl database')
    if uniprot_file == '': sys.exit()  # cancel button response

    uniprot_folder, uniprot_name = os.path.split(uniprot_file)
    version = uniprot_name.split('_')[-1]
    version = version.replace('.fasta.gz', '')
    uniprot_db = uniprot_name.split('_')[1]
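    # assumes a file name like "uniprot_sprot_2017.07.fasta.gz"
    # (version "2017.07", database "sprot")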

    # create a log file to mirror screen output
    log_obj = open(os.path.join(uniprot_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: uniprot_extract_from_one.py',
                                 log_obj)

    # make the smaller uniprot dictionaries
    (sci_to_taxon,
     id_to_taxon) = fasta_lib.make_uniprot_to_taxon(uniprot_folder)

    # make the more complete dictionary
    name_to_taxon = fasta_lib.make_all_names_to_taxon(uniprot_folder)

    # print the list of taxon numbers that will be extracted
    taxon_list = list(taxon_dict.items())
    taxon_list.sort()
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' %
                  (i + 1, t[0], t[1]),
                  file=obj)

    # expand any group taxon numbers
    # NOTE: Any taxon numbers present in analysis text file will not be expanded.
    if EXPAND_GROUPS:
        fasta_lib.expand_species(uniprot_folder, uniprot_db, taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT)

    # initialize dictionaries and counters
    taxon_files, taxon_count, name_count = {}, {}, {}
    for taxon, name in taxon_dict.items():
        fname = uniprot_db + '_' + version + '_' + name + '.fasta'
        fname = os.path.join(uniprot_folder, fname)
        taxon_files[name] = fname
        taxon_count[taxon] = 0
        name_count[name] = 0

    # open the output filenames
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # create a FastaReader object, initialize counters, and start reading
    x = fasta_lib.FastaReader(uniprot_file)
    prot = fasta_lib.Protein()
    prot_read = 0
    not_found = 0
    duplicates = {}
    for obj in write:
        print('...reading %s and extracting entries...' % (uniprot_name, ),
              file=obj)

    # checking for errors in sequences slows program execution, use as needed
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 500000) == 0:
            print('......(%s proteins read...)' %
                  ("{0:,d}".format(prot_read), ))
        (spec_id,
         spec_name) = fasta_lib.uniprot_parse_line(prot.accession + ' ' +
                                                   prot.description)
        taxon = sci_to_taxon.get(spec_name, 0)  # first choice mapping
        taxon2 = name_to_taxon.get(spec_name, 0)  # alternative mapping
        if taxon == 0:  # first choice not present
            if taxon2 == 0:
                not_found += 1
            else:
                taxon = taxon2  # use second choice
        else:
            if (taxon != taxon2) and (taxon2 > 0):
                # keep track of names that map to multiple taxon numbers
                duplicates[spec_name] = (taxon, taxon2)
        if taxon_dict.get(taxon, False):
            if CLEAN_ACCESSIONS:
                prot.parseUniProt()

            # taxon number matches, so write the protein to the respective file
            name = taxon_dict[taxon]
            name_count[name] += 1
            taxon_count[taxon] += 1
            f = taxon_files[name]
            prot.printProtein(f)

    # close the extracted database files
    for f in taxon_files.values():
        f.close()

    # print list of mis-matching taxon number warnings
    if MISMATCHES:
        for i, (name, pair) in enumerate(duplicates.items()):
            for obj in write:
                print('......(%s) WARNING: %s and %s map to "%s"' %
                      (i + 1, pair[0], pair[1], name),
                      file=obj)

    # print out the summary stuff
    for obj in write:
        print('...%s protein entries in %s' %
              ("{0:,d}".format(prot_read), uniprot_name),
              file=obj)
        print('...%s proteins had unknown taxon numbers' % (not_found, ),
              file=obj)
        if VERBOSE:
            numbers = list(taxon_count.keys())
            numbers.sort()
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print(
                        '......(%s) taxon %s had %s proteins' %
                        (i + 1, number, "{0:,d}".format(taxon_count[number])),
                        file=obj)
        print('...output file summaries...', file=obj)
        names = list(taxon_files.keys())
        names.sort()
        for i, name in enumerate(names):
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]),
                   uniprot_db + '_' + version + '_' + name + '.fasta'),
                  file=obj)

    fasta_lib.time_stamp_logfile('>>> ending: uniprot_extract_from_one.py',
                                 log_obj)
    log_obj.close()
    return
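# a minimal usage sketch with hypothetical taxon-to-tag mappings
# (9606 is human, 4932 is Saccharomyces cerevisiae)
if __name__ == '__main__':
    main({9606: 'human', 4932: 'yeast'})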