    return


if __name__ == '__main__':
    # get the path to nr.gz and call main function to download, etc.
    # check if nr.gz file path is passed on command line
    if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
        selection = sys.argv[1]
    # if not, browse to a folder for nr downloads
    else:
        database = r'C:\Xcalibur\database'
        if not os.path.exists(database):
            database = os.getcwd()
        selection = fasta_lib.get_folder(database, 'Select folder for nr downloads')
        if selection == '':
            sys.exit()  # cancel button response

    # if folder name starts with 'nr_', then skip creating a new folder
    if os.path.split(selection)[1].startswith('nr_'):
        main('nr', selection)
    # otherwise create a new folder with date stamp
    else:
        curr_time = time.localtime()
        curr_date = '%4d%02d%02d' % (curr_time[0], curr_time[1], curr_time[2])
        folder = os.path.join(selection, 'nr_' + curr_date)
        if not os.path.exists(folder):
            os.mkdir(folder)
        main('nr', folder)
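# Usage sketch (hypothetical date): running this block on 2017-06-15 with
# C:\Xcalibur\database selected would create C:\Xcalibur\database\nr_20170615
# (curr_date formats as YYYYMMDD) and call main('nr', ...) on that folder.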
def main(taxon_dict):
    """Extracts entries by taxon ID from both Sprot and Trembl databases."""
    print('=============================================================================')
    print(' uniprot_extract_from_both.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=============================================================================')

    # get the UniProt folder and then get the sprot and trembl database names
    DB = []
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    uniprot_folder = fasta_lib.get_folder(default, title_string='Select a UniProt download folder')
    if uniprot_folder == '':
        sys.exit()  # cancel button response
    version = uniprot_folder.split('_')[-1]
    uniprot_db = 'uniprot'
    for files in os.listdir(uniprot_folder):
        if files.startswith('uniprot_') and files.endswith('.gz'):
            DB.append(os.path.join(uniprot_folder, files))
    if len(DB) != 2:
        print('WARNING: either sprot or trembl DB was missing')

    # create a log file to mirror screen output
    log_obj = open(os.path.join(uniprot_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: uniprot_extract_from_both.py', log_obj)

    # make the smaller uniprot dictionaries
    (sci_to_taxon, id_to_taxon) = fasta_lib.make_uniprot_to_taxon(uniprot_folder)

    # make the more complete dictionary
    name_to_taxon = fasta_lib.make_all_names_to_taxon(uniprot_folder)

    # print the list of taxon numbers that will be extracted
    # NOTE: Any taxon numbers present in analysis text file will not be expanded.
    taxon_list = list(taxon_dict.items())
    taxon_list.sort()
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' % (i + 1, t[0], t[1]), file=obj)

    # expand any group taxon numbers
    if EXPAND_GROUPS:
        fasta_lib.expand_species(uniprot_folder, 'uniprot', taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT)

    # initialize dictionaries and counters
    taxon_files, taxon_count, name_count = {}, {}, {}
    for taxon, name in taxon_dict.items():
        fname = uniprot_db + '_' + version + '_' + name + '.fasta'
        fname = os.path.join(uniprot_folder, fname)
        taxon_files[name] = fname
        taxon_count[taxon] = 0
        name_count[name] = 0

    # open the output filenames
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # want to count extracted sequences from each database
    name_counter = {}
    number_counter = {}

    # loop over both databases and extract species
    duplicates = {}
    for i in range(len(DB)):
        prot_read = 0
        not_found = 0
        for value in taxon_dict.values():
            name_counter[value] = 0
        for key in taxon_dict.keys():
            number_counter[key] = 0

        # create a FastaReader object, initialize counters, and start reading
        uniprot_file = DB[i]
        db_name = os.path.split(uniprot_file)[1]
        x = fasta_lib.FastaReader(uniprot_file)
        prot = fasta_lib.Protein()
        for obj in write:
            print('...reading %s and extracting entries...' % (db_name,), file=obj)

        # NOTE: checking for errors will slow program execution, use if needed
        while x.readNextProtein(prot, check_for_errs=False):
            prot_read += 1
            if (prot_read % 500000) == 0:
                print('......(%s proteins read...)' % ("{0:,d}".format(prot_read),))
            (spec_id, spec_name) = fasta_lib.uniprot_parse_line(prot.accession + ' ' + prot.description)
            taxon = sci_to_taxon.get(spec_name, 0)      # first choice mapping
            taxon2 = name_to_taxon.get(spec_name, 0)    # alternative mapping
            if taxon == 0:          # first choice not present
                if taxon2 == 0:
                    not_found += 1
                else:
                    taxon = taxon2  # use second choice
            else:
                if (taxon != taxon2) and (taxon2 > 0):  # keep track of multiple taxon numbers
                    duplicates[spec_name] = (taxon, taxon2)
            if taxon_dict.get(taxon, False):
                if CLEAN_ACCESSIONS:
                    prot.parseUniProt()

                # taxon number matches, so write the protein to respective output file(s)
                name = taxon_dict[taxon]
                name_counter[name] += 1
                name_count[name] += 1
                taxon_count[taxon] += 1
                number_counter[taxon] += 1
                f = taxon_files[name]
                prot.printProtein(f)

        # print extraction stats for each database
        for obj in write:
            print('...%s protein entries in %s' % ("{0:,d}".format(prot_read), db_name), file=obj)
            print('...%s proteins had unknown taxon numbers' % ("{0:,d}".format(not_found),), file=obj)
            numbers = list(number_counter.keys())
            numbers.sort()
            if VERBOSE:
                for j, number in enumerate(numbers):
                    if number_counter[number] > 0:
                        print('......(%s) taxon %s had %s proteins' %
                              (j + 1, number, "{0:,d}".format(number_counter[number])), file=obj)
            names = list(name_counter.keys())
            names.sort()
            for j, name in enumerate(names):
                print('......(%s) %s %s proteins extracted' %
                      (j + 1, "{0:,d}".format(name_counter[name]), name), file=obj)

    # close the extracted database files
    for f in taxon_files.values():
        f.close()

    # print list of mis-matched taxon number warnings
    if MISMATCHES:
        for i, (name, pair) in enumerate(duplicates.items()):
            for obj in write:
                print('......(%s) WARNING: %s and %s map to "%s"' %
                      (i + 1, pair[0], pair[1], name), file=obj)

    # print out the final summary stuff
    for obj in write:
        if VERBOSE:
            print('...combined taxon counts...', file=obj)
            numbers = list(taxon_count.keys())
            numbers.sort()
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print('......(%s) taxon %s had %s proteins' %
                          (i + 1, number, "{0:,d}".format(taxon_count[number])), file=obj)
        print('...combined output file counts...', file=obj)
        for i, name in enumerate(names):
            print('......(%s) %s total proteins written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]),
                   uniprot_db + '_' + version + '_' + name + '.fasta'), file=obj)

    fasta_lib.time_stamp_logfile('>>> ending: uniprot_extract_from_both.py', log_obj)
    log_obj.close()
    return
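# For reference, a sketch of the text that fasta_lib.uniprot_parse_line receives
# above (prot.accession + ' ' + prot.description). This is the standard UniProt
# FASTA header layout; reading the "OS=" species name as spec_name for the
# sci_to_taxon/name_to_taxon lookups is an assumption based on how spec_name is
# used here, not something this script states explicitly:
#     sp|P01308|INS_HUMAN Insulin OS=Homo sapiens GN=INS PE=1 SV=1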
              species.X_count, file=obj)
        print(" translations that had Z (ambiguous Q/E):",
              species.Z_count, file=obj)
    return  # attributes added to species should be there after return


# print program name and version
print('===================================================================')
print(' program check_fasta_dir_walk.py, v1.0.0, Phil Wilmarth, OHSU 2020 ')
print('===================================================================')

# select a root folder
root_path = fasta_lib.get_folder(os.getcwd(), 'Select a Root folder')
if not root_path:
    sys.exit()  # cancel button response

# create a log file to mirror screen output
_folder = root_path
log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
write = [None, log_obj]
fasta_lib.time_stamp_logfile('\n>>> starting: check_fasta_dir_walk.py', log_obj)
species_list = []

# process the FASTA files
for root, dirs, files in os.walk(root_path):
    for file in files:
        if file.endswith(".all.fa.gz"):
            species = Species()
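            # (Hypothetical matching file name: ".all.fa.gz" is the suffix
            # Ensembl uses for its FASTA downloads, e.g.
            # "Homo_sapiens.GRCh38.pep.all.fa.gz"; this is an inference from
            # the suffix pattern, not something this script states.)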
def download_databases(self):
    """Fetches the database files for the selected species."""
    self.login()  # Refresh the FTP connection

    # Throw warning if no databases selected
    if len(self.tree_right.get_children()) == 0:
        messagebox.showwarning("Empty Selection",
                               "No databases were selected for download!")
        return None  # Exit function

    # Get parent folder location for database download
    self.abs_download_path = fasta_lib.get_folder(
        self.script_path, 'Select parent folder for database downloads')
    if not self.abs_download_path:
        return None

    # Make a separate folder to contain all files
    uniprot_dir_name = r"UniProt_{}".format(self.date)
    uniprot_dir_path = os.path.join(self.abs_download_path, uniprot_dir_name)
    try:
        os.mkdir(uniprot_dir_path)
    except FileExistsError:
        pass
    os.chdir(uniprot_dir_path)

    # Get taxonomy ID numbers for right (download) list
    tax_id_list = [self.tree_right.item(entry)['values'][0]
                   for entry in self.tree_right.get_children()]
    set_tax_id_list = list(set(tax_id_list))  # remove duplicates (if any)
    if len(tax_id_list) != len(set_tax_id_list):
        messagebox.showwarning(
            "Duplicates found!",
            "Duplicate databases were selected and will be ignored!")

    # Get the entry objects for the right taxonomy numbers
    download_entries = [entry for entry in self.all_entries
                        if int(entry.tax_ID) in set_tax_id_list]

    # Add normalized folder name attribute
    for entry in download_entries:
        entry.make_folder_name(self.date)

    for entry in download_entries:
        # Move to the FTP site branch where files are located
        self.ftp.cwd(entry.ftp_file_path)

        # Set local location for the download
        download_folder = os.path.join(uniprot_dir_path, entry.download_folder_name)
        try:
            os.mkdir(download_folder)
            os.chdir(download_folder)
        except FileExistsError:
            os.chdir(download_folder)
        except OSError:
            print("OSError")
            print('Download for this entry failed:')
            entry._snoop()
            continue

        # Download reference proteome database(s)
        for file in entry.ftp_download_list:
            # Skip any files that we do not want to download
            if self.banned_file(file):
                continue

            # Download the file (overwrites any existing files)
            fixed_file = "{}_{}".format(self.date, file)
            self.update_status_bar("Downloading {} file".format(file))
            with open(file, 'wb') as fp:  # context manager closes the handle
                self.ftp.retrbinary('RETR {}'.format(file), fp.write)
            print("{} is done downloading".format(file))
            os.rename(os.path.join(download_folder, file),
                      os.path.join(download_folder, fixed_file))

        self.make_fasta_files(uniprot_dir_path, entry)

    messagebox.showinfo("All Downloads Completed!", "Downloads Finished!")
    self.update_status_bar("Done downloading")
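# Sketch of the local layout this method produces (names are placeholders;
# the exact self.date format comes from the GUI, not from this excerpt):
#     <parent folder>/UniProt_<date>/
#         <entry.download_folder_name>/
#             <date>_<file>    # each download is renamed with a date prefix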
if __name__ == '__main__':
    # get path to uniprot databases and call main function to download, etc.
    # check if folder path is passed on command line
    versions = fasta_lib.get_uniprot_version()
    if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
        container = sys.argv[1]
    else:
        # browse to a container folder for uniprot downloads
        # a subfolder will be created with name "uniprot_version"
        database = r'C:\Xcalibur\database'  # default for BioWorks, XP
        if not os.path.exists(database):
            database = os.getcwd()
        container = fasta_lib.get_folder(database, 'Select folder for uniprot downloads')
        if container == '':
            sys.exit()  # cancel button response
        folder = os.path.split(container)[1].lower()  # ignore case
        if folder.startswith('uniprot_'):  # if subfolder, up one level
            container = os.path.split(container)[0]
    folder = os.path.join(container, 'uniprot_' + versions['uniprot'])
    if not os.path.exists(folder):  # make folder if necessary
        os.mkdir(folder)

    # pass in both databases for combined extraction
    main(['sprot', 'trembl'], folder, versions)

# end
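# Usage sketch (the script file name is a placeholder, not confirmed here):
#     python <this_script>.py                       # browse to a folder via the dialog
#     python <this_script>.py C:\path\to\container  # container folder on the command line
# Either way, files are downloaded into <container>\uniprot_<version>.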