return


if __name__ == '__main__':
    # Resolve where the nr downloads should live, then hand off to main().

    # A valid path given on the command line wins over browsing.
    if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
        selection = sys.argv[1]
    else:
        # Otherwise browse, starting from the Xcalibur default if present.
        start_folder = r'C:\Xcalibur\database'
        if not os.path.exists(start_folder):
            start_folder = os.getcwd()
        selection = fasta_lib.get_folder(start_folder,
                                         'Select folder for nr downloads')
        if selection == '':
            sys.exit()  # cancel button response

    if os.path.split(selection)[1].startswith('nr_'):
        # Already inside an 'nr_' folder, so use it directly.
        main('nr', selection)
    else:
        # Create (if needed) a date-stamped 'nr_YYYYMMDD' subfolder.
        now = time.localtime()
        stamp = '%4d%02d%02d' % (now[0], now[1], now[2])
        target = os.path.join(selection, 'nr_' + stamp)
        if not os.path.exists(target):
            os.mkdir(target)
        main('nr', target)
def main(taxon_dict):
    """Extracts entries by taxon ID from both Sprot and Trembl databases.

    taxon_dict maps taxon ID numbers to short species names; each name
    tags one output FASTA file ("uniprot_<version>_<name>.fasta") written
    into the selected UniProt download folder.

    Relies on module-level flags (EXPAND_GROUPS, MIN_SEQUENCE_COUNT,
    MIN_GROUP_SEQ_COUNT, CLEAN_ACCESSIONS, VERBOSE, MISMATCHES) and on
    the project's fasta_lib helper module.  Mirrors screen output to a
    'fasta_utilities.log' file in the UniProt folder.
    """
    print(
        '============================================================================='
    )
    print(
        ' uniprot_extract_from_both.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 '
    )
    print(
        '============================================================================='
    )

    # get the UniProt folder and then get the sprot and trembl database names
    DB = []
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    uniprot_folder = fasta_lib.get_folder(
        default, title_string='Select a UniProt download folder')
    if uniprot_folder == '':
        sys.exit()  # cancel button response

    # folder naming convention is "<something>_<version>"
    version = uniprot_folder.split('_')[-1]
    uniprot_db = 'uniprot'
    for files in os.listdir(uniprot_folder):
        if files.startswith('uniprot_') and files.endswith('.gz'):
            DB.append(os.path.join(uniprot_folder, files))
    if len(DB) != 2:
        print('WARNING: either sprot or trembl DB was missing')

    # create a log file to mirror screen output
    log_obj = open(os.path.join(uniprot_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]  # print targets: console (None) and log file
    fasta_lib.time_stamp_logfile(
        '\n>>> starting: uniprot_extract_from_both.py', log_obj)

    # make the smaller uniprot dictionaries
    (sci_to_taxon,
     id_to_taxon) = fasta_lib.make_uniprot_to_taxon(uniprot_folder)

    # make the more complete dictionary
    name_to_taxon = fasta_lib.make_all_names_to_taxon(uniprot_folder)

    # print the list of taxon numbers that will be extracted
    # NOTE: Any taxon numbers present in analysis text file will not be expanded.
    taxon_list = sorted(taxon_dict.items())
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' %
                  (i + 1, t[0], t[1]),
                  file=obj)

    # expand any group taxon numbers
    if EXPAND_GROUPS:
        fasta_lib.expand_species(uniprot_folder, 'uniprot', taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT)

    # initialize output file paths and combined (both-database) counters
    taxon_files, taxon_count, name_count = {}, {}, {}
    for taxon, name in taxon_dict.items():
        fname = uniprot_db + '_' + version + '_' + name + '.fasta'
        fname = os.path.join(uniprot_folder, fname)
        taxon_files[name] = fname
        taxon_count[taxon] = 0
        name_count[name] = 0

    # open the output filenames
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # per-database extraction counters (reset for each database pass)
    name_counter = {}
    number_counter = {}

    # loop over both databases and extract species
    duplicates = {}  # species names mapping to two different taxon numbers
    for uniprot_file in DB:
        prot_read = 0
        not_found = 0
        for value in taxon_dict.values():
            name_counter[value] = 0
        for key in taxon_dict.keys():
            number_counter[key] = 0

        # create a FastaReader object, initialize counters, and start reading
        x = fasta_lib.FastaReader(uniprot_file)
        prot = fasta_lib.Protein()
        for obj in write:
            print('...reading %s and extracting entries...' %
                  (os.path.split(uniprot_file)[1], ),
                  file=obj)

        # NOTE: checking for errors will slow program execution, use if needed
        while x.readNextProtein(prot, check_for_errs=False):
            prot_read += 1
            if (prot_read % 500000) == 0:
                print('......(%s proteins read...)' %
                      ("{0:,d}".format(prot_read), ))
            (spec_id,
             spec_name) = fasta_lib.uniprot_parse_line(prot.accession + ' ' +
                                                       prot.description)
            taxon = sci_to_taxon.get(spec_name, 0)  # first choice mapping
            taxon2 = name_to_taxon.get(spec_name, 0)  # alternative mapping
            if taxon == 0:  # first choice not present
                if taxon2 == 0:
                    not_found += 1
                else:
                    taxon = taxon2  # use second choice
            else:
                if (taxon != taxon2) and (
                        taxon2 > 0):  # keep track of multiple taxon numbers
                    duplicates[spec_name] = (taxon, taxon2)
            if taxon_dict.get(taxon, False):
                if CLEAN_ACCESSIONS:
                    prot.parseUniProt()

                # taxon number matches, so write the protein to respective output file(s)
                name = taxon_dict[taxon]
                name_counter[name] += 1
                name_count[name] += 1
                taxon_count[taxon] += 1
                number_counter[taxon] += 1
                f = taxon_files[name]
                prot.printProtein(f)

        # print extraction stats for each database
        for obj in write:
            # BUG FIX: report the database actually read in this pass
            # (was os.path.split(DB[0])[1], which named the first
            # database on every pass)
            print('...%s protein entries in %s' %
                  ("{0:,d}".format(prot_read),
                   os.path.split(uniprot_file)[1]),
                  file=obj)
            print('...%s proteins had unknown taxon numbers' %
                  ("{0:,d}".format(not_found), ),
                  file=obj)
            numbers = sorted(number_counter.keys())
            if VERBOSE:
                for j, number in enumerate(numbers):
                    if number_counter[number] > 0:
                        print('......(%s) taxon %s had %s proteins' %
                              (j + 1, number, "{0:,d}".format(
                                  number_counter[number])),
                              file=obj)
            names = sorted(name_counter.keys())
            for j, name in enumerate(names):
                print('......(%s) %s %s proteins extracted' %
                      (j + 1, "{0:,d}".format(name_counter[name]), name),
                      file=obj)

    # close the extracted database files
    for f in taxon_files.values():
        f.close()

    # print list of mis-matched taxon number warnings
    if MISMATCHES:
        for i, (name, pair) in enumerate(duplicates.items()):
            for obj in write:
                print('......(%s) WARNING: %s and %s map to "%s"' %
                      (i + 1, pair[0], pair[1], name),
                      file=obj)

    # print out the final summary stuff
    for obj in write:
        if VERBOSE:
            print('...combined taxon counts...', file=obj)
            numbers = sorted(taxon_count.keys())
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print(
                        '......(%s) taxon %s had %s proteins' %
                        (i + 1, number, "{0:,d}".format(taxon_count[number])),
                        file=obj)
        print('...combined output file counts...', file=obj)
        # NOTE: 'names' is left over from the per-database loop above; its
        # contents are identical for every pass, so this is safe when DB
        # is non-empty.
        for i, name in enumerate(names):
            print('......(%s) %s total proteins written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]),
                   uniprot_db + '_' + version + '_' + name + '.fasta'),
                  file=obj)

    fasta_lib.time_stamp_logfile('>>> ending: uniprot_extract_from_both.py',
                                 log_obj)
    log_obj.close()
    return
              species.X_count,
              file=obj)
        print("    translations that had Z (ambiguous Q/E):",
              species.Z_count,
              file=obj)

    return  # attributes added to species should be there after return


# print program name and version
print('===================================================================')
print(' program check_fasta_dir_walk.py, v1.0.0, Phil Wilmarth, OHSU 2020 ')
print('===================================================================')

# select a root folder to walk for FASTA files
root_path = fasta_lib.get_folder(os.getcwd(), 'Select a Root folder')
if not root_path:
    sys.exit()  # cancel button response

# create a log file to mirror screen output
_folder = root_path
log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
write = [None, log_obj]  # print targets: console (None) and log file
# BUG FIX: log tag said 'check_fasta.py' (copy-paste leftover); this
# program is check_fasta_dir_walk.py per the banner above
fasta_lib.time_stamp_logfile('\n>>> starting: check_fasta_dir_walk.py',
                             log_obj)

species_list = []
# process the FASTA files
for root, dirs, files in os.walk(root_path):
    for file in files:
        if file.endswith(".all.fa.gz"):
            species = Species()
    def download_databases(self):
        """Fetches the database files for the selected species.

        Refreshes the FTP connection, then for each entry selected in the
        right-hand tree creates a date-stamped local folder and downloads
        that entry's reference proteome files (renaming each with the date
        prefix).  Triggers FASTA file creation for each downloaded entry
        and reports progress via the status bar and message boxes.
        """
        self.login()  # Refresh the FTP connection

        # Throw warning if no databases selected
        if len(self.tree_right.get_children()) == 0:
            messagebox.showwarning("Empty Selection",
                                   "No databases were selected for download!")
            return None  # Exit function

        # Get parent folder location for database download
        self.abs_download_path = fasta_lib.get_folder(
            self.script_path, 'Select parent folder for database downloads')
        if not self.abs_download_path:
            return None  # cancel response from the folder dialog

        # Make a separate folder to contain all files
        uniprot_dir_name = r"UniProt_{}".format(self.date)
        uniprot_dir_path = os.path.join(self.abs_download_path,
                                        uniprot_dir_name)
        try:
            os.mkdir(uniprot_dir_path)
        except FileExistsError:
            pass  # folder already exists from an earlier run
        os.chdir(uniprot_dir_path)

        # Get taxonomy ID numbers for right (download) list
        tax_id_list = [
            self.tree_right.item(entry)['values'][0]
            for entry in self.tree_right.get_children()
        ]
        set_tax_id_list = list(set(tax_id_list))  # remove duplicates (if any)
        if len(tax_id_list) != len(set_tax_id_list):
            messagebox.showwarning(
                "Duplicates found!",
                "Duplicate databases were selected and will be ignored!")

        # Get the entry objects for the right taxonomy numbers
        download_entries = [
            entry for entry in self.all_entries
            if int(entry.tax_ID) in set_tax_id_list
        ]

        # Add normalized folder name attribute
        # (plain loop: previous list comprehension was side-effect only)
        for entry in download_entries:
            entry.make_folder_name(self.date)

        for entry in download_entries:
            # Move to the FTP site branch where files are located
            self.ftp.cwd(entry.ftp_file_path)

            # Set local location for the download
            download_folder = os.path.join(uniprot_dir_path,
                                           entry.download_folder_name)
            try:
                os.mkdir(download_folder)
                os.chdir(download_folder)
            except FileExistsError:
                os.chdir(download_folder)
            except OSError:
                print("OSError")
                print('Download for this entry failed:')
                entry._snoop()
                continue

            # Download reference proteome database(s)
            for file in entry.ftp_download_list:
                # Skip any files that we do not want to download
                if self.banned_file(file):
                    continue

                # Download the file (overwrites any existing files)
                fixed_file = "{}_{}".format(self.date, file)
                self.update_status_bar("Downloading {} file".format(file))
                # BUG FIX: use a context manager so the local file handle
                # is closed before the rename below (the previous
                # open(...).write callback leaked the handle)
                with open(file, 'wb') as local_fp:
                    self.ftp.retrbinary('RETR {}'.format(file),
                                        local_fp.write)
                print("{} is done downloading".format(file))
                os.rename(os.path.join(download_folder, file),
                          os.path.join(download_folder, fixed_file))

            self.make_fasta_files(uniprot_dir_path, entry)

        messagebox.showinfo("All Downloads Completed!", "Downloads Finished!")
        self.update_status_bar("Done downloading")

if __name__ == '__main__':
    # Locate (or create) the uniprot_<version> folder, then run main().
    versions = fasta_lib.get_uniprot_version()

    # A valid folder path on the command line takes precedence.
    if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
        container = sys.argv[1]
    else:
        # Otherwise browse to a container folder for uniprot downloads;
        # a subfolder named "uniprot_<version>" will be created inside it.
        database = r'C:\Xcalibur\database'  # default for BioWorks, XP
        if not os.path.exists(database):
            database = os.getcwd()
        container = fasta_lib.get_folder(
            database, 'Select folder for uniprot downloads')
        if container == '':
            sys.exit()  # cancel button response

    # If the user picked a uniprot_* subfolder, step up to its parent
    # (case-insensitive check).
    if os.path.split(container)[1].lower().startswith('uniprot_'):
        container = os.path.split(container)[0]

    folder = os.path.join(container, 'uniprot_' + versions['uniprot'])
    if not os.path.exists(folder):
        os.mkdir(folder)

    # pass in both databases for combined extraction
    main(['sprot', 'trembl'], folder, versions)

# end