def main(db, folder, versions):
    """Fetches and analyzes the species names in Sprot database.

    Arguments:
    "db" is the UniProt database key (e.g. 'sprot') used for file naming.
    "folder" is full path name to UniProt DBs.
    "versions" is a dictionary of version numbers for file naming.

    No return values. Saves a summary text file that can be loaded into EXCEL.

    NOTE(review): reads module-level global "min_sequence_count" and the
    project module "fasta_lib" — both must be defined at file scope.
    """
    print('====================================================================')
    print(' sprot_get_analyze.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('====================================================================')

    # create a log file to mirror screen output
    # "write" holds [None, log_obj]: printing to file=None goes to stdout,
    # so each message is emitted to both the console and the log file
    log_obj = open(os.path.join(folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: sprot_get_analyze.py', log_obj)

    # make sure the files are present or download if not
    fasta_lib.download_uniprot(db, folder, versions)

    # make dictionaries of species names (or accession IDs) to taxon IDs
    (sci_to_taxon, id_to_taxon) = fasta_lib.make_uniprot_to_taxon(folder)

    # get more complete list of names to taxon number from ncbi data
    name_to_taxon = fasta_lib.make_all_names_to_taxon(folder)

    # make species frequency dictionary from the compressed sprot FASTA file
    fname = 'uniprot_sprot_%s.fasta.gz' % (versions['sprot'],)
    (name_freq, name_to_id, prot_count) = fasta_lib.uniprot_species_frequency(
        os.path.join(folder, fname))

    # sort the species names and write to file
    # "minimum" filters out species with too few sequences
    fasta_lib.save_species_info(db, folder, name_freq, name_to_taxon,
                                sci_to_taxon, id_to_taxon, name_to_id,
                                minimum=min_sequence_count)

    # print out some stats and exit (both console and log file)
    new_db = 'uniprot_%s_%s.fasta.gz' % (db, versions[db],)
    for obj in write:
        print('...%s contained %s entries...' % (new_db, prot_count), file=obj)
        print('...there were', len(name_freq), 'species names...', file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: sprot_get_analyze.py', log_obj)
    log_obj.close()
    return
def fasta_counter(fasta_file):
    """Counts entries and header lines in a FASTA protein database.

    Arguments:
    "fasta_file" is the full path of the FASTA file to scan.

    No return value. Totals are printed to the console and mirrored to
    "fasta_utilities.log" in the same folder as the FASTA file.
    """
    # create a log file to mirror screen output
    # printing to file=None goes to stdout, so each message hits both sinks
    _folder = os.path.split(fasta_file)[0]
    log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: count_fasta.py', log_obj)

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot = 0
    head = 0

    # read proteins until EOF; NOTE: checking for errors slows program by factor of 3-4
    while f.readNextProtein(p, check_for_errs=False):
        # count protein sequences, with a progress message every 500k
        prot += 1
        if (prot % 500000) == 0:
            print('......(%s proteins read...)' % ("{0:,d}".format(prot), ))
        # count number of header elements; NCBI nr concatenates multiple
        # headers per entry separated by control-A (chr(1)) characters
        control_A = p.description.count(chr(1))
        head = head + control_A + 1

    # print results and return
    for obj in write:
        print('...there are %s proteins in %s' %
              ("{0:,d}".format(prot), os.path.split(fasta_file)[1]), file=obj)
        # only report header count if some entries had compound headers
        if head > prot:
            print('...there were %s header lines' % ("{0:,d}".format(head), ), file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: count_fasta.py', log_obj)
    log_obj.close()
    return
def main(db, folder):
    """Fetches and analyzes the species names in the ncbi nr fasta database.

    Arguments:
    "db" is the database key (unused here; kept for interface symmetry).
    "folder" is the full path name where "nr.gz" will be.

    No return values. Saves the main lookup dictionary and a summary text
    file that can be loaded into EXCEL or a word processor.
    """
    global min_sequence_count
    print('================================================================')
    print(' nr_get_analyze.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017')
    print('================================================================')

    # create a log file to mirror screen output
    # printing to file=None goes to stdout, so messages hit console and log
    log_obj = open(os.path.join(folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: nr_get_analyze.py', log_obj)

    # make sure the files are present or download if not
    fasta_lib.download_ncbi(folder)

    # make gi_to_taxon object (or reload from disk)
    acc_to_taxon = fasta_lib.AccToTaxon(folder)
    acc_to_taxon.create_or_load(folder)

    # make a dictionary of taxon IDs to species names
    taxon_to_name = fasta_lib.make_taxon_to_sci_name(folder)

    # make the taxon frequency dictionary for the proteins in nr.gz
    # the nr archive is named after its folder (e.g. ".../nr/nr.gz")
    nr_name = os.path.split(folder)[1] + '.gz'
    for obj in write:
        print('...processing %s (this takes a few hours...)' % (nr_name, ), file=obj)
    taxon_freq = {}
    reftax_freq = {}
    prot = 0
    spec_prot = 0
    ref_prot = 0
    undef_gi = 0
    # use a context manager so the gzip handle is always closed
    # (original code leaked the file object)
    with gzip.open(os.path.join(folder, nr_name), mode='rt') as fasta_file:
        while True:
            line = fasta_file.readline()
            if not line:
                break
            else:
                line = line.rstrip()
            if line.startswith('>'):
                prot += 1
                chunk = 1000000
                if (prot % chunk) == 0:
                    print('......(%s proteins read)' % ("{0:,d}".format(prot), ))
                tax_list = []
                reftax_list = []
                # nr entries can carry several headers separated by control-A;
                # need to remove the leading ">" character first
                for header in line[1:].split(chr(1)):
                    acc_ver = header.split()[0]
                    acc = acc_ver.split('.')[0]
                    tax = acc_to_taxon.get(acc, -1)
                    if tax == -1:
                        undef_gi += 1
                    if tax not in tax_list:
                        spec_prot += 1
                        tax_list.append(tax)
                    # according to NCBI, underscore char only in RefSeq accessions
                    if '_' in acc and tax not in reftax_list:
                        ref_prot += 1
                        reftax_list.append(tax)
                for tax in tax_list:
                    fasta_lib.add_or_increment(tax, taxon_freq)
                for reftax in reftax_list:
                    fasta_lib.add_or_increment(reftax, reftax_freq)

    # make the name frequency dictionary from the taxon frequency dictionary
    name_freq = {}
    for (taxon, freq) in taxon_freq.items():
        unknown_name = 'Unknown_taxonID_%s' % (taxon, )
        name_freq[taxon_to_name.get(taxon, unknown_name)] = freq

    # make an inverted name_to_taxon dictionary
    name_to_taxon = {}
    for (number, name) in taxon_to_name.items():
        name_to_taxon[name] = number

    # sort the species names and write to file
    fasta_lib.save_species_info_nr(folder, name_freq, name_to_taxon,
                                   reftax_freq, min_sequence_count)

    # print out some stats and exit
    for obj in write:
        if prot > 0:
            print('...%s contained %s protein entries...' %
                  (os.path.split(folder)[1], "{0:,d}".format(prot)), file=obj)
            print('...there were %s species-expanded entries...' %
                  ("{0:,d}".format(spec_prot), ), file=obj)
            print('...%s were RefSeq entries...' % ("{0:,d}".format(ref_prot), ), file=obj)
            print('...%s entries had undefined taxon ID numbers...' %
                  ("{0:,d}".format(undef_gi), ), file=obj)
            print('...there were', "{0:,d}".format(len(name_freq)),
                  'species names...', file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: nr_get_analyze.py', log_obj)
    log_obj.close()
    return
def main(taxon_dict):
    """Extracts entries by taxon ID from both Sprot and Trembl databases.

    Arguments:
    "taxon_dict" maps taxon ID numbers to short names used to tag the
    extracted FASTA output files.

    No return value. Writes one FASTA file per short name into the
    selected UniProt download folder and mirrors stats to the log file.
    """
    print('=============================================================================')
    print(' uniprot_extract_from_both.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=============================================================================')

    # get the UniProt folder and then get the sprot and trembl database names
    DB = []
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    uniprot_folder = fasta_lib.get_folder(
        default, title_string='Select a UniProt download folder')
    if uniprot_folder == '':
        sys.exit()  # cancel button response
    # folder naming convention carries the release version after the last "_"
    version = uniprot_folder.split('_')[-1]
    uniprot_db = 'uniprot'
    for files in os.listdir(uniprot_folder):
        if files.startswith('uniprot_') and files.endswith('.gz'):
            DB.append(os.path.join(uniprot_folder, files))
    if len(DB) != 2:
        print('WARNING: either sprot or trembl DB was missing')

    # create a log file to mirror screen output
    log_obj = open(os.path.join(uniprot_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile(
        '\n>>> starting: uniprot_extract_from_both.py', log_obj)

    # make the smaller uniprot dictionaries
    (sci_to_taxon, id_to_taxon) = fasta_lib.make_uniprot_to_taxon(uniprot_folder)

    # make the more complete dictionary
    name_to_taxon = fasta_lib.make_all_names_to_taxon(uniprot_folder)

    # print the list of taxon numbers that will be extracted
    # NOTE: Any taxon numbers present in analysis text file will not be expanded.
    taxon_list = list(taxon_dict.items())
    taxon_list.sort()
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' %
                  (i + 1, t[0], t[1]), file=obj)

    # expand any group taxon numbers
    if EXPAND_GROUPS:
        fasta_lib.expand_species(uniprot_folder, 'uniprot', taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT)

    # inititalize dictionaries and counters
    taxon_files, taxon_count, name_count = {}, {}, {}
    for taxon, name in taxon_dict.items():
        fname = uniprot_db + '_' + version + '_' + name + '.fasta'
        fname = os.path.join(uniprot_folder, fname)
        taxon_files[name] = fname
        taxon_count[taxon] = 0
        name_count[name] = 0

    # open the output filenames (replace path strings with file objects)
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # want to count extracted sequences from each database
    name_counter = {}
    number_counter = {}

    # loop over both databases and extract species
    duplicates = {}
    for i in range(len(DB)):
        # per-database counters are reset each pass
        prot_read = 0
        not_found = 0
        for value in taxon_dict.values():
            name_counter[value] = 0
        for key in taxon_dict.keys():
            number_counter[key] = 0

        # create a FastaReader object, initialize counters, and start reading
        uniprot_file = DB[i]
        x = fasta_lib.FastaReader(uniprot_file)
        prot = fasta_lib.Protein()
        for obj in write:
            print('...reading %s and extracting entries...' %
                  (os.path.split(uniprot_file)[1], ), file=obj)

        # NOTE: checking for errors will slow program execution, use if needed
        while x.readNextProtein(prot, check_for_errs=False):
            prot_read += 1
            if (prot_read % 500000) == 0:
                print('......(%s proteins read...)' %
                      ("{0:,d}".format(prot_read), ))
            (spec_id, spec_name) = fasta_lib.uniprot_parse_line(
                prot.accession + ' ' + prot.description)
            taxon = sci_to_taxon.get(spec_name, 0)  # first choice mapping
            taxon2 = name_to_taxon.get(spec_name, 0)  # alternative mapping
            if taxon == 0:  # first choice not present
                if taxon2 == 0:
                    not_found += 1
                else:
                    taxon = taxon2  # use second choice
            else:
                # keep track of multiple taxon numbers
                if (taxon != taxon2) and (taxon2 > 0):
                    duplicates[spec_name] = (taxon, taxon2)
            if taxon_dict.get(taxon, False):
                if CLEAN_ACCESSIONS:
                    prot.parseUniProt()
                # taxon number matches, so write the protein to respective output file(s)
                name = taxon_dict[taxon]
                name_counter[name] += 1
                name_count[name] += 1
                taxon_count[taxon] += 1
                number_counter[taxon] += 1
                f = taxon_files[name]
                prot.printProtein(f)

        # print extraction stats for each database
        # BUGFIX: original printed DB[0] here, so the second pass mislabeled
        # its stats with the first database's name; use the current DB[i]
        db_name = os.path.split(DB[i])[1]
        for obj in write:
            print('...%s protein entries in %s' %
                  ("{0:,d}".format(prot_read), db_name), file=obj)
            print('...%s proteins had unknown taxon numbers' %
                  ("{0:,d}".format(not_found), ), file=obj)
            numbers = list(number_counter.keys())
            numbers.sort()
            if VERBOSE:
                for j, number in enumerate(numbers):
                    if number_counter[number] > 0:
                        print('......(%s) taxon %s had %s proteins' %
                              (j + 1, number,
                               "{0:,d}".format(number_counter[number])), file=obj)
            names = list(name_counter.keys())
            names.sort()
            for j, name in enumerate(names):
                print('......(%s) %s %s proteins extracted' %
                      (j + 1, "{0:,d}".format(name_counter[name]), name), file=obj)

    # close the extracted database files
    for f in taxon_files.values():
        f.close()

    # print list of mis-matched taxon number warnings
    if MISMATCHES:
        for i, (name, pair) in enumerate(duplicates.items()):
            for obj in write:
                print('......(%s) WARNING: %s and %s map to "%s"' %
                      (i + 1, pair[0], pair[1], name), file=obj)

    # print out the final summary stuff
    # NOTE: "names" carries over from the last database loop iteration
    for obj in write:
        if VERBOSE:
            print('...combined taxon counts...', file=obj)
            numbers = list(taxon_count.keys())
            numbers.sort()
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print('......(%s) taxon %s had %s proteins' %
                          (i + 1, number,
                           "{0:,d}".format(taxon_count[number])), file=obj)
        print('...combined output file counts...', file=obj)
        for i, name in enumerate(names):
            print('......(%s) %s total proteins written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]),
                   uniprot_db + '_' + version + '_' + name + '.fasta'), file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: uniprot_extract_from_both.py',
                                 log_obj)
    log_obj.close()
    return
# browse to the database
database = r"C:\Xcalibur\database"
if not os.path.exists(database):
    database = os.getcwd()
file_ext_list = [('FASTA files', '*.fasta'),
                 ('FASTA files', '*.fa'),
                 ('FASTA files', '*.gz')]
fasta_files = fasta_lib.get_files(database, file_ext_list,
                                  'Select a FASTA database')
if not fasta_files:
    sys.exit()  # cancel button response

# create a log file to mirror screen output
# printing to file=None goes to stdout, so messages hit console and log
_folder = os.path.split(fasta_files[0])[0]
log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
write = [None, log_obj]
fasta_lib.time_stamp_logfile('\n>>> starting: check_fasta.py', log_obj)

# process the FASTA files
for fasta_file in fasta_files:
    try:
        fasta_checker(fasta_file, write)
    except FileNotFoundError:
        # keep best-effort behavior (skip missing file) but say so
        # instead of silently passing
        for obj in write:
            print('...WARNING: could not open %s, skipping...' %
                  (fasta_file, ), file=obj)
    for obj in write:
        print(file=obj)

# finish up the log file
fasta_lib.time_stamp_logfile('>>> ending: check_fasta.py', log_obj)
log_obj.close()
# end
def main(fasta_file):
    """Checks entries in a FASTA protein database for identical duplicates.

    Arguments:
    "fasta_file" is the full path of the FASTA file to de-duplicate.

    No return value. Writes a non-redundant FASTA file next to the input
    and appends an analysis report to "duplicates.txt" in the same folder.
    """
    print('====================================================================')
    print(' remove_duplicates.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('====================================================================')
    print('\n Analysis results will be written to "duplicates.txt"')

    # set up the output file names, etc.
    folder = os.path.split(fasta_file)[0]
    out_file = os.path.join(folder, 'duplicates.txt')
    out_obj = open(out_file, 'a')
    # BUGFIX: initialize nr_database so an input with an unexpected
    # extension no longer raises NameError below
    nr_database = ''
    for end in ['.fasta', '.fasta.gz', '.fa.gz']:
        if fasta_file.endswith(end):
            nr_database = fasta_file.replace(end, '_nonredun.fasta')
    if (not nr_database) or (nr_database == fasta_file):
        nr_database = fasta_file + '_nonredun.fasta'
    nr_obj = open(nr_database, 'w')
    write = [None, out_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: check_for_duplicates.py',
                                 out_obj)

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot, head, dup = 0, 0, 0
    candidates = {}  # dictionary of acc:protein lists
    conflicts = {}  # keeps track of seq len and MW

    # first pass: read proteins until EOF, flag (length, MW) collisions
    while f.readNextProtein(p, check_for_errs=False):
        prot += 1
        control_A = p.description.count(chr(1))
        head = head + control_A + 1
        dup_data = (p.seqlenProtein(), p.molwtProtein())
        duplicate = conflicts.get(dup_data, False)
        if duplicate:
            dup += 1
            if candidates.get(duplicate, False):
                candidates[duplicate].append(copy.deepcopy(p))
            else:
                candidates[duplicate] = [copy.deepcopy(p)]
        else:
            conflicts[dup_data] = p.accession

    # get list of proteins to test for identity
    to_test = {}
    for key in candidates.keys():
        to_test[key] = True

    # second pass: copy proteins to "nr" file, checking for duplicates
    dup = 0
    skip = {}
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    print('Processing:', fasta_file, file=out_obj)  # header line to log file
    while f.readNextProtein(p, check_for_errs=False):
        if to_test.get(p.accession, False):
            dup += find_identities(p, candidates, skip, fasta_file, out_obj)
        if skip.get(p.accession, False):
            continue
        p.printProtein(nr_obj)
    # BUGFIX: close the non-redundant output so buffered writes are flushed
    nr_obj.close()
    for obj in [None, out_obj]:
        print('\nThere were', prot, 'total sequences in:',
              os.path.basename(fasta_file), file=obj)
        print('There were', dup, 'identical sequences removed\n\n', file=obj)
        # obj can be None (stdout); only file objects have close()
        try:
            obj.close()
        except AttributeError:
            pass
    return
def fasta_add_extras(extra_file, fasta_file, output_file):
    """Adds contaminants and reverses entries in a FASTA protein database.

    Arguments:
    "extra_file" is a FASTA file of extra sequences to prepend (renumbered
    as "EXTRA_dddd"), "fasta_file" is the main FASTA database, and
    "output_file" is the base name for the output files.

    Reversed DB written to same location.
    Options for separate or concatenated output files (module-level flags
    MAKE_SEPARATE_FORWARD/REVERSED/BOTH control which files are kept).
    """
    decoy_string = 'REV_'  # the string to denote decoy sequences
    print('=========================================================================')
    print(' add_extras_and_reverse.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=========================================================================')

    # open the "forward" and "reversed" output files
    _file = os.path.splitext(output_file)[0] + '.fasta'
    for_name = _file.replace('.fasta', '_for.fasta')
    for_file_obj = open(for_name, 'w')
    rev_name = _file.replace('.fasta', '_rev.fasta')
    rev_file_obj = open(rev_name, 'w')

    # create a log file to mirror screen output
    _folder = os.path.split(fasta_file)[0]
    log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: add_extras_and_reverse.py',
                                 log_obj)

    # create instances of reader object and protein object
    # add the extra sequences, accessions changed to "EXTRA_dddd"
    # NOTE: can only add up to 9999 sequences without code modification...
    prot = fasta_lib.Protein()
    pcount = 0
    f = fasta_lib.FastaReader(extra_file)

    # turn on error checking for extra sequences
    while f.readNextProtein(prot, check_for_errs=True):
        pcount += 1
        # try to clean up original accessions
        if CLEAN_ACCESSIONS:
            if prot.accession.startswith('gi|'):
                prot.parseNCBI(REF_SEQ_ONLY)
            elif prot.accession.startswith('sp|') or prot.accession.startswith('tr|'):
                prot.parseUniProt(KEEP_UNIPROT_ID)
            elif prot.accession.startswith('IPI:'):
                prot.parseIPI(KEEP_IPI_GENE_ID)
        # add old accession to description and make new accession
        # write original sequence to "forward" file and reversed to "reverse"
        prot.new_desc = '[%s] %s' % (prot.new_acc, prot.new_desc)
        prot.new_acc = 'EXTRA_%04d' % (pcount,)
        prot.accession = 'EXTRA_%04d' % (pcount,)
        prot.printProtein(for_file_obj)
        rev = prot.reverseProtein(decoy_string)
        rev.printProtein(rev_file_obj)
    for obj in write:
        print('...there were %s extra sequences in %s' %
              (pcount, os.path.split(extra_file)[1]), file=obj)

    # now add the contaminants; this is best-effort — any failure just warns
    # BUGFIX: narrowed bare "except:" (which also traps SystemExit and
    # KeyboardInterrupt) to "except Exception:"
    try:
        if os.path.exists(CONTAMS):
            contams_file = CONTAMS
        else:
            path = os.path.split(fasta_file)[0]
            contams_file = os.path.join(path, CONTAMS)
        f = fasta_lib.FastaReader(contams_file)
        contams = 0
        while f.readNextProtein(prot, check_for_errs=True):
            pcount += 1
            contams += 1
            if CLEAN_ACCESSIONS:
                prot.parseCONT()
            # write sequences to respective files
            prot.printProtein(for_file_obj)
            rev = prot.reverseProtein(decoy_string)
            rev.printProtein(rev_file_obj)
        for obj in write:
            print('...there were %s contaminant entries in %s' %
                  (contams, contams_file), file=obj)
    except Exception:
        for obj in write:
            print('...WARNING:', CONTAMS, 'not found!', file=obj)

    # read proteins, clean up accessions, descriptions until EOF
    # write proteins to "forward" and "reversed" files
    f = fasta_lib.FastaReader(fasta_file)

    # checking for errors can slow program execution by factor of 3-4
    # Reading and writing sequences will always remove spaces and blank lines
    while f.readNextProtein(prot, check_for_errs=False):
        pcount += 1
        if CLEAN_ACCESSIONS:
            if prot.accession.startswith('gi|'):
                prot.parseNCBI(REF_SEQ_ONLY)
            elif prot.accession.startswith('sp|') or prot.accession.startswith('tr|'):
                prot.parseUniProt(KEEP_UNIPROT_ID)
            elif prot.accession.startswith('IPI:'):
                prot.parseIPI(KEEP_IPI_GENE_ID)
        prot.printProtein(for_file_obj)  # write to "forward" file
        rev = prot.reverseProtein(decoy_string)
        rev.printProtein(rev_file_obj)  # write to "reversed" file

    # make concatenated output file if desired and print summary stats
    if MAKE_SEPARATE_BOTH:
        both_name = _file.replace('.fasta', '_both.fasta')
        both_file_obj = open(both_name, 'w')
        # reopen the just-written files for reading and concatenate them
        for_file_obj.close()
        for_file_obj = open(for_name, 'r')
        rev_file_obj.close()
        rev_file_obj = open(rev_name, 'r')
        while True:
            line = for_file_obj.readline()
            if not line:
                break
            both_file_obj.write(line)
        while True:
            line = rev_file_obj.readline()
            if not line:
                break
            both_file_obj.write(line)
        both_file_obj.close()
        for obj in write:
            print('...%s total proteins written to %s' %
                  (2 * pcount, os.path.split(both_name)[1]), file=obj)
    if MAKE_SEPARATE_FORWARD:
        for obj in write:
            print('...%s proteins written to %s' %
                  (pcount, os.path.split(for_name)[1]), file=obj)
    if MAKE_SEPARATE_REVERSED:
        for obj in write:
            print('...%s proteins reversed and written to %s' %
                  (pcount, os.path.split(rev_name)[1]), file=obj)

    # close files and delete unwanted files
    for_file_obj.close()
    rev_file_obj.close()
    fasta_lib.time_stamp_logfile('>>> ending: add_extras_and_reverse.py',
                                 log_obj)
    log_obj.close()
    if not MAKE_SEPARATE_FORWARD:
        os.remove(for_name)
    if not MAKE_SEPARATE_REVERSED:
        os.remove(rev_name)
    return
def main(string_dict):
    """Main program to extract entries containing strings from databases.

    Simple string search of pattern in combined accession/description
    lines. Logical OR if more than one pattern is mapped to the same
    outfile. Each matching protein is written once per output file with
    possible compound header (nr) of all headers containing matching
    patterns. If "cleaning" of accessions/descriptions is turned on for
    NCBI nr databases, only the first header element will be retained and
    any accession number cross-references will be lost.

    Written by Phil Wilmarth, OHSU, 2009.
    """
    print('=====================================================================')
    print(' extract_by_string.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=====================================================================')

    # set some file paths and names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    # BUGFIX: dialog title said "FASTSA" — corrected to "FASTA"
    db_file = fasta_lib.get_file(default,
                                 [('Zipped files', '*.gz'),
                                  ('Fasta files', '*.fasta')],
                                 title_string='Select a FASTA database')
    if db_file == '':
        sys.exit()  # cancel button response
    db_folder, db_name = os.path.split(db_file)
    base_name = db_name.replace('.gz', '')
    if not base_name.endswith('.fasta'):
        base_name = base_name + '.fasta'

    # create a log file to mirror screen output
    log_obj = open(os.path.join(db_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: extract_by_string.py',
                                 log_obj)

    # print the list of patterns that will be extracted
    string_list = list(string_dict.items())
    string_list.sort()
    for obj in write:
        print('...extracting entries containing these strings:', file=obj)
        for i, t in enumerate(string_list):
            print('......(%s) string "%s" to file ending in "%s"' %
                  (i + 1, t[0], t[1]), file=obj)

    # open the output databases, initialize counters
    string_files = {}
    string_count = {}
    name_count = {}
    for string, name in string_dict.items():
        fname = base_name.replace('.fasta', '_' + name + '.fasta')
        fname = os.path.join(db_folder, fname)
        string_files[name] = fname
        string_count[string] = 0
        name_count[name] = 0
    # replace path strings with open file objects
    for name in string_files.keys():
        string_files[name] = open(string_files[name], 'w')

    # create a FastaReader object, initialize counters, and start reading
    x = fasta_lib.FastaReader(db_file)
    prot = fasta_lib.Protein()
    prot_read = 0
    for obj in write:
        print('...reading %s and extracting entries...' % (db_name, ), file=obj)
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 500000) == 0:
            print('......(%s proteins read...)' %
                  ("{0:,d}".format(prot_read), ))
        written = {}  # make sure protein is written only ONCE per OUTFILE
        header = prot.accession + ' ' + prot.description  # recreate the '>' line
        if not CASE_SENSITIVE:  # convert to uppercase
            header = header.upper()
        for pattern in string_dict.keys():
            new_pattern = pattern
            if not CASE_SENSITIVE:  # case insensitive matching
                new_pattern = new_pattern.upper()
            # nr headers are control-A separated; check each header for matches
            for head in header.split(chr(1)):
                if new_pattern in head:
                    name = string_dict[pattern]
                    name_header = written.get(name, '')
                    if name_header:
                        # accumulate a compound (nr-style) header per out file
                        name_header = name_header + chr(1) + head
                        written[name] = name_header
                    else:
                        written[name] = head
                    string_count[pattern] += 1
        # write any matching proteins to appropriate out file
        for name in written.keys():
            name_count[name] += 1  # output file write counters
            f = string_files[name]  # output file pointers
            header = written[name]  # composite header of name's matches
            # set the accession and description fields before writing
            prot.accession = header.split()[0]
            prot.new_acc = prot.accession
            prot.description = header[(len(prot.accession) + 1):]
            prot.new_desc = prot.description
            if CLEAN_ACCESSIONS:
                if prot.accession.startswith('gi|'):
                    prot.parseNCBI(REF_SEQ_ONLY)
                elif prot.accession.startswith('sp|') or prot.accession.startswith('tr|'):
                    prot.parseUniProt(KEEP_UNIPROT_ID)
            prot.printProtein(f)  # write any matching proteins

    # close files
    for f in string_files.values():
        f.close()

    # print out the summary stuff
    for obj in write:
        print('...%s protein entries in %s' %
              ("{0:,d}".format(prot_read), db_name), file=obj)
        strings = list(string_count.keys())
        strings.sort()
        for i, string in enumerate(strings):
            print('......(%s) pattern "%s" was found in %s proteins' %
                  (i + 1, string, "{0:,d}".format(string_count[string])), file=obj)
        print('...output file summaries...', file=obj)
        names = list(string_files.keys())
        names.sort()
        for i, name in enumerate(names):
            temp = base_name.replace('.fasta', '_' + name + '.fasta')
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]), temp), file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: extract_by_string.py', log_obj)
    log_obj.close()
    return
def main(fasta_file, forward=False, reverse=False, both=True, log_obj=None, contam_path=""):
    """Adds contaminants and reverses entries for a FASTA protein database.

    Call with single fasta file name.
    If "forward", make sequences plus contaminants,
    if "reverse", make reversed sequences with reversed contaminants,
    if "both", make concatenated target/decoy with contaminants.
    "log_obj" is an optional already-open log file; if omitted one is
    opened next to "fasta_file".
    "contam_path" is optional fullpath name of a contaminants database
    to use instead of default.

    NOTE(review): this function closes "log_obj" at the end even when the
    caller supplied it — confirm callers expect their handle to be closed.
    """
    decoy_string = 'REV_'  # the string to denote decoy sequences

    ######################################
    # Change default contaminants file name here:
    CONTAMS = 'Thermo_contams.fasta'  # or pass in a "contams_path"
    ######################################

    # open the "forward" and "reversed" output files
    # strip a trailing ".gz" before removing the FASTA extension
    if fasta_file.lower().endswith('.gz'):
        _file = os.path.splitext(fasta_file[:-3])[0]
    else:
        _file = os.path.splitext(fasta_file)[0]
    for_name = _file + '_for.fasta'
    for_file_obj = open(for_name, 'w')
    rev_name = _file + '_rev.fasta'
    rev_file_obj = open(rev_name, 'w')

    # create the name for the concatenated file (if later needed)
    both_name = _file + '_both.fasta'

    # create a log file to mirror screen output
    # printing to file=None goes to stdout, so messages hit console and log
    _folder = os.path.split(fasta_file)[0]
    if not log_obj:
        log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: reverse_fasta.py', log_obj)

    # create instances protein object and initialize counter
    prot = fasta_lib.Protein()
    p_read = 0
    p_contam = 0

    # try to find the contaminants database file
    # If no contam file path provided, search for it in current directory,
    # then in the FASTA file's folder; "contam_path" may be a file or a
    # folder containing the default CONTAMS file
    # (_file is reused here as the resolved contaminants path)
    _file = None
    if not contam_path:
        if os.path.exists(CONTAMS):
            _file = CONTAMS
        else:
            path = os.path.split(fasta_file)[0]
            if os.path.exists(os.path.join(path, CONTAMS)):
                _file = os.path.join(path, CONTAMS)
    elif os.path.exists(contam_path) and os.path.isfile(contam_path):
        _file = contam_path
    elif os.path.isdir(contam_path) and os.path.exists(os.path.join(contam_path, CONTAMS)):
        _file = os.path.join(contam_path, CONTAMS)

    # create reader and add contaminants (if contams file was found)
    if _file:
        f = fasta_lib.FastaReader(_file)
        # error checking ON for contaminants (small file, worth validating)
        while f.readNextProtein(prot, check_for_errs=True):
            p_contam += 1
            prot.printProtein(for_file_obj)
            rev = prot.reverseProtein(decoy_string)
            rev.printProtein(rev_file_obj)
        for obj in write:
            print('...there were %s contaminant entries in %s' %
                  ("{0:,d}".format(p_contam), os.path.split(_file)[1]), file=obj)
    else:
        for obj in write:
            print('...WARNING: contaminants were not added', file=obj)

    # read proteins until EOF and write proteins to "forward" and "reversed" files
    f = fasta_lib.FastaReader(fasta_file)

    # error checking slows program execution, turn on if needed.
    # Reading and writing sequences always removes spaces and blank lines.
    while f.readNextProtein(prot, check_for_errs=False):
        p_read += 1
        prot.printProtein(for_file_obj)  # write to "forward" file
        rev = prot.reverseProtein(decoy_string)
        rev.printProtein(rev_file_obj)  # write to "reversed" file
    for_file_obj.close()
    rev_file_obj.close()
    for obj in write:
        print('...%s proteins read from %s' %
              ("{0:,d}".format(p_read), os.path.split(fasta_file)[1]), file=obj)

    # make concatenated output file if desired and print summary stats
    if both:
        # reopen the just-written files for reading and concatenate them
        both_file_obj = open(both_name, 'w')
        for_file_obj = open(for_name, 'r')
        rev_file_obj = open(rev_name, 'r')
        while True:
            line = for_file_obj.readline()
            if not line:
                break
            both_file_obj.write(str(line))
        while True:
            line = rev_file_obj.readline()
            if not line:
                break
            both_file_obj.write(str(line))
        both_file_obj.close()
        for obj in write:
            print('...%s total proteins written to %s' %
                  ("{0:,d}".format(2*(p_contam+p_read)),
                   os.path.split(both_name)[1]), file=obj)
    if forward:
        for obj in write:
            print('...%s proteins written to %s' %
                  ("{0:,d}".format(p_contam+p_read),
                   os.path.split(for_name)[1]), file=obj)
    if reverse:
        for obj in write:
            print('...%s proteins reversed and written to %s' %
                  ("{0:,d}".format(p_contam+p_read),
                   os.path.split(rev_name)[1]), file=obj)

    # close files and delete unwanted files
    # (the separate forward/reversed files are only kept if requested)
    for_file_obj.close()
    rev_file_obj.close()
    fasta_lib.time_stamp_logfile('>>> ending: reverse_fasta.py', log_obj)
    log_obj.close()
    if not forward:
        os.remove(for_name)
    if not reverse:
        os.remove(rev_name)
    return
def fasta_counter(fasta_file):
    """Counts entries in a FASTA protein database.

    Arguments:
    "fasta_file" is the full path of the FASTA file to scan.

    Checks for duplicate accessions and valid aa characters.
    Computes protein sequence lengths, molecular weights - writes to
    TXT file (with DB basename). No return value.
    """
    # create a log file to mirror screen output
    # printing to file=None goes to stdout, so messages hit console and log
    _folder = os.path.split(fasta_file)[0]
    log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: count_fasta.py', log_obj)

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot = 0
    head = 0
    conflict = {}  # accession -> MW, used to detect duplicate accessions

    # construct summary file name based on FASTA name
    # BUGFIX: the regex dots were unescaped, so e.g. "proteins_fasta"
    # would be rewritten to "proteins.txt"; escape them so only a literal
    # ".fasta(.gz)" suffix is replaced
    if fasta_file.endswith('.fasta.gz'):
        summary_file = re.sub(r'\.fasta\.gz$', r'.txt', fasta_file)
    else:
        summary_file = re.sub(r'\.fasta$', r'.txt', fasta_file)
    if summary_file == fasta_file:
        summary_file = fasta_file + '.txt'

    # open summary file and write header
    summary_obj = open(summary_file, mode='wt')
    summary_obj.write('Accession\tLength\tMW\n')

    # read proteins until EOF; NOTE: checking for errors slows program by factor of 3-4
    while f.readNextProtein(p, check_for_errs=True):
        # count protein sequences, with a progress message every 500k
        prot += 1
        if (prot % 500000) == 0:
            print('......(%s proteins read...)' % ("{0:,d}".format(prot), ))
        # check for duplicate accession; an equal MW suggests the whole
        # sequence may be duplicated, not just the accession
        dup = conflict.get(p.accession, False)
        if dup:
            for obj in write:
                print('\n...WARNING: %s is already in FASTA database!\n' %
                      (p.accession, ), file=obj)
                if p.molwtProtein(show_errs=False) == conflict[p.accession]:
                    print('......possible duplicated sequence...', file=obj)
        else:
            conflict[p.accession] = p.molwtProtein(show_errs=False)
        # count number of header elements (nr headers are chr(1)-separated)
        control_A = p.description.count(chr(1))
        head = head + control_A + 1
        # add info to summary_file
        print('\t'.join([
            p.accession,
            str(p.seqlenProtein()),
            str(round(p.molwtProtein(), 1))
        ]), file=summary_obj)

    # print results and return
    for obj in write:
        print('...there are %s proteins in %s' %
              ("{0:,d}".format(prot), os.path.split(fasta_file)[1]), file=obj)
        if head > prot:
            print('...there were %s header lines' %
                  ("{0:,d}".format(head), ), file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: count_fasta.py', log_obj)
    log_obj.close()
    summary_obj.close()
    return
def main(taxon_dict):
    """Main program to extract entries by taxon ID from NCBI nr databases.

    Arguments:
        taxon_dict - dictionary of {taxon number: output file tag string}.

    Each accession (of each control-A separated header) is looked up to find
    the associated taxon number for comparison to desired taxon numbers. A
    separate protein entry will be written for each desired taxon number even
    if all taxon numbers are written to the same output file, so at the
    protein level the extracted databases may no longer be non-redundant.
    If "cleaning" of accessions/descriptions is turned off, all headers
    matching the desired taxon numbers will be added to the respective
    protein, preserving the usual NCBI nr formatting structure. If cleaning
    of accessions is turned on during extraction, some information may be
    lost, which could make subsequent database processing (such as extracting
    by text string) fail. Cleaning is best done as a last step
    (i.e. in "reverse_fasta.py"). No return value.
    """
    print('====================================================================')
    print(' nr_extract_taxon.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('====================================================================')

    # set some file paths and names (GUI file picker; default browse location
    # falls back to the current working directory if the Xcalibur path is absent)
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    nr_file = fasta_lib.get_file(default,
                                 [('Zipped files', '*.gz'), ('Fasta files', '*.fasta')],
                                 title_string='Select an NCBI nr database')
    if nr_file == '':
        sys.exit()  # cancel button response
    ncbi_folder, nr_name = os.path.split(nr_file)
    nr_db = os.path.splitext(nr_name)[0]

    # create a log file to mirror screen output ("None" destination = stdout)
    log_obj = open(os.path.join(ncbi_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: nr_extract_taxon.py', log_obj)

    # get the saved accession-to-taxon-number mapping object
    acc_to_taxon = fasta_lib.AccToTaxon(ncbi_folder)
    acc_to_taxon.create_or_load(ncbi_folder)

    # print the list of taxon numbers that will be extracted.
    # NOTE(review): "original_dict" is saved but not used anywhere later in
    # this function - possibly a leftover; confirm before removing.
    original_dict = taxon_dict
    taxon_list = list(taxon_dict.items())
    taxon_list.sort()
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' % (i + 1, t[0], t[1]), file=obj)

    # expand any group taxon numbers. NOTE: if a taxon number appears in
    # "nr_fasta_analyze.txt", it will not be expanded. Either delete the
    # line in "nr_fasta_analyze.txt", or make an expanded "taxon_dict" by hand.
    # (EXPAND_GROUPS and the MIN_*/REF_SEQ_ONLY settings are module-level
    # configuration flags defined elsewhere in this file.)
    if EXPAND_GROUPS:
        fasta_lib.expand_species(ncbi_folder, 'nr', taxon_dict, MIN_SEQUENCE_COUNT,
                                 MIN_GROUP_SEQ_COUNT, REF_SEQ_ONLY)

    # open the output databases, initialize counters, etc.
    taxon_files = {}    # tag name -> output path (replaced below by file object)
    taxon_count = {}    # taxon number -> extracted protein count
    name_count = {}     # tag name -> extracted protein count
    for taxon, name in taxon_dict.items():
        fname = nr_db + '_' + name + '.fasta'
        fname = os.path.join(ncbi_folder, fname)
        taxon_files[name] = fname
        name_count[name] = 0
        taxon_count[taxon] = 0

    # open the output filenames
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # loop over all proteins in nr
    x = fasta_lib.FastaReader(nr_file)
    prot = fasta_lib.Protein()
    prot_read = 0   # total protein entries read
    not_found = 0   # headers whose accession had no known taxon number
    skipped = 0     # headers whose taxon number is not in taxon_dict
    for obj in write:
        print('...reading %s and extracting entries...' % (nr_name, ), file=obj)

    # checking for errors slows down program by about a factor of 3 or 4
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 1000000) == 0:
            print('......(%s proteins read...)' % ("{0:,d}".format(prot_read), ))
        written = {}    # taxon number -> protein copy to write for this entry
        line = prot.accession + ' ' + prot.description
        prot.new_desc = ''
        # extract the accessions from each control-A separated header
        for header in line.split(chr(1)):
            accession_with_version = header.split()[0]
            accession = accession_with_version.split('.')[0]
            if REF_SEQ_ONLY and '_' not in accession:
                continue  # skip proteins without RefSeq entries
            taxon = acc_to_taxon.get(accession, False)
            # see if taxon number for this accession is in our desired list
            if taxon:
                if taxon_dict.get(taxon, False):
                    if written.get(taxon, False):
                        # taxon number already seen: append this header.
                        # NOTE(review): "prot" is rebound to the stored copy
                        # here, so any later headers of this same entry then
                        # operate on that copy rather than on the reader's
                        # protein object. This appears deliberate in this
                        # codebase but is fragile - confirm before
                        # restructuring this loop.
                        prot = written[taxon]
                        prot.description = prot.description + chr(1) + header
                        written[taxon] = copy.deepcopy(prot)
                    else:
                        # first time this taxon number is seen for this entry
                        name = taxon_dict[taxon]
                        prot.accession = header.split()[0]
                        prot.description = header[len(prot.accession) + 1:]
                        prot.description = prot.description.rstrip()
                        taxon_count[taxon] += 1
                        name_count[name] += 1
                        written[taxon] = copy.deepcopy(prot)
                else:
                    skipped += 1
            else:
                not_found += 1
                continue  # redundant (last statement of the loop body) but harmless

        # write a protein sequence for each taxon number it was matched to
        for taxon in written.keys():
            name = taxon_dict[taxon]
            f = taxon_files[name]
            prot = written[taxon]
            prot.new_desc = prot.description
            prot.new_acc = prot.accession
            if CLEAN_ACCESSIONS:
                prot.parseNCBI(REF_SEQ_ONLY)
            prot.printProtein(f)

    # print out number of matches and close files
    for obj in write:
        print('...%s proteins in %s' % ("{0:,d}".format(prot_read), nr_name), file=obj)
        print('...%s accessions did not have known taxon numbers' %
              ("{0:,d}".format(not_found), ), file=obj)
        print('...%s accessions were skipped (not in our taxon list)' %
              ("{0:,d}".format(skipped), ), file=obj)
        if REF_SEQ_ONLY:
            print('...Extracted sequences are RefSeq Only!!!', file=obj)
        if VERBOSE:
            numbers = list(taxon_count.keys())
            numbers.sort()
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print('......(%s) taxon number %s had %s proteins' %
                          (i + 1, number, "{0:,d}".format(taxon_count[number])), file=obj)
        print('...output file summaries...', file=obj)
        names = list(taxon_files.keys())
        names.sort()
        for i, name in enumerate(names):
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]), nr_db + '_' + name + '.fasta'), file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: nr_extract_taxon.py', log_obj)
    log_obj.close()
    for f in taxon_files.values():
        f.close()
    return
def main(taxon_dict):
    """Extract protein entries by taxon ID from a single UniProt database.

    Arguments:
        taxon_dict - dictionary of {taxon number: output file tag string}.

    Prompts for a downloaded Sprot or Trembl FASTA file, maps each entry's
    species name to a taxon number, and writes entries whose taxon number is
    in "taxon_dict" to per-taxon FASTA files. Progress and summary lines are
    echoed to the screen and appended to "fasta_utilities.log". No return value.
    """
    print('============================================================================')
    print(' uniprot_extract_from_one.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('============================================================================')

    # pick the database file; default browse location falls back to the cwd
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    uniprot_file = fasta_lib.get_file(
        default, [('Zipped files', '*.gz'), ('Fasta files', '*.fasta')],
        title_string='Select an Sprot or Trembl database')
    if uniprot_file == '':
        sys.exit()  # user pressed cancel
    uniprot_folder, uniprot_name = os.path.split(uniprot_file)
    version = uniprot_name.split('_')[-1].replace('.fasta.gz', '')
    uniprot_db = uniprot_name.split('_')[1]

    # log file mirrors everything printed to the screen ("None" = stdout)
    log_obj = open(os.path.join(uniprot_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: uniprot_extract_from_one.py', log_obj)

    # species-name lookups: UniProt's own tables plus the fuller NCBI table
    (sci_to_taxon, id_to_taxon) = fasta_lib.make_uniprot_to_taxon(uniprot_folder)
    name_to_taxon = fasta_lib.make_all_names_to_taxon(uniprot_folder)

    # echo the taxon numbers that will be extracted
    for out in write:
        print('...extracting these taxon numbers:', file=out)
        for idx, pair in enumerate(sorted(taxon_dict.items())):
            print('......(%s) taxon %s to file tagged with "%s"' % (idx + 1, pair[0], pair[1]), file=out)

    # optionally expand any group taxon numbers
    # NOTE: taxon numbers already present in the analysis text file are not expanded
    if EXPAND_GROUPS:
        fasta_lib.expand_species(uniprot_folder, uniprot_db, taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT)

    # set up per-taxon output files and counters
    taxon_files = {}    # tag name -> open output file
    taxon_count = {}    # taxon number -> extracted protein count
    name_count = {}     # tag name -> extracted protein count
    for taxon, name in taxon_dict.items():
        out_path = os.path.join(uniprot_folder,
                                uniprot_db + '_' + version + '_' + name + '.fasta')
        taxon_files[name] = open(out_path, 'w')
        taxon_count[taxon] = 0
        name_count[name] = 0

    # stream the database and extract matching entries
    reader = fasta_lib.FastaReader(uniprot_file)
    protein = fasta_lib.Protein()
    prot_read = 0
    not_found = 0
    duplicates = {}     # species names mapping to two different taxon numbers
    for out in write:
        print('...reading %s and extracting entries...' % (uniprot_name, ), file=out)

    # checking for errors in sequences slows program execution, use as needed
    while reader.readNextProtein(protein, check_for_errs=False):
        prot_read += 1
        if (prot_read % 500000) == 0:
            print('......(%s proteins read...)' % ("{0:,d}".format(prot_read), ))
        (spec_id, spec_name) = fasta_lib.uniprot_parse_line(
            protein.accession + ' ' + protein.description)
        taxon = sci_to_taxon.get(spec_name, 0)    # preferred mapping
        taxon2 = name_to_taxon.get(spec_name, 0)  # alternative mapping
        if taxon == 0 and taxon2 == 0:
            not_found += 1              # no mapping at all
        elif taxon == 0:
            taxon = taxon2              # fall back to the alternative mapping
        elif taxon != taxon2 and taxon2 > 0:
            duplicates[spec_name] = (taxon, taxon2)   # remember the conflict
        if taxon_dict.get(taxon, False):
            if CLEAN_ACCESSIONS:
                protein.parseUniProt()
            # taxon number matches: write the protein to its output file
            name = taxon_dict[taxon]
            name_count[name] += 1
            taxon_count[taxon] += 1
            protein.printProtein(taxon_files[name])

    # close the extracted database files
    for handle in taxon_files.values():
        handle.close()

    # report species names with conflicting taxon numbers
    if MISMATCHES:
        for idx, (spec, pair) in enumerate(duplicates.items()):
            for out in write:
                print('......(%s) WARNING: %s and %s map to "%s"' %
                      (idx + 1, pair[0], pair[1], spec), file=out)

    # print the summary information
    for out in write:
        print('...%s protein entries in %s' %
              ("{0:,d}".format(prot_read), uniprot_name), file=out)
        print('...%s proteins had unknown taxon numbers' % (not_found, ), file=out)
        if VERBOSE:
            for idx, number in enumerate(sorted(taxon_count.keys())):
                if taxon_count[number] > 0:
                    print('......(%s) taxon %s had %s proteins' %
                          (idx + 1, number, "{0:,d}".format(taxon_count[number])), file=out)
        print('...output file summaries...', file=out)
        for idx, name in enumerate(sorted(taxon_files.keys())):
            print('......(%s) %s proteins extracted and written to %s' %
                  (idx + 1, "{0:,d}".format(name_count[name]),
                   uniprot_db + '_' + version + '_' + name + '.fasta'), file=out)
    fasta_lib.time_stamp_logfile('>>> ending: uniprot_extract_from_one.py', log_obj)
    log_obj.close()
    return