def fasta_counter(fasta_file):
    """Counts entries in a FASTA protein database.
    Call with FASTA filename.
    Checks for duplicate accessions and (optional) valid characters.
    """
    # create a log file to mirror screen output
    _folder = os.path.split(fasta_file)[0]
    log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: count_fasta.py', log_obj)

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot = 0
    head = 0
    conflict = {}

    # read proteins until EOF; NOTE: checking for errors slows program by factor of 3-4
    while f.readNextProtein(p, check_for_errs=False):

        # count protein sequences
        prot += 1
        if (prot % 500000) == 0:
            print('......(%s proteins read...)' % ("{0:,d}".format(prot),))

##        # check for duplicate accession
##        dup = conflict.get(p.accession, False)
##        if dup:
##            for obj in write:
##                print('\n...WARNING: %s is already in FASTA database!\n' % (p.accession,), file=obj)
##                if p.molwtProtein(show_errs=False) == conflict[p.accession]:
##                    print('......possible duplicated sequence...', file=obj)
##        else:
##            conflict[p.accession] = p.molwtProtein(show_errs=False)

        # count number of header elements
        control_A = p.description.count(chr(1))
        head = head + control_A + 1

    # print results and return
    for obj in write:
        print('...there are %s proteins in %s' %
              ("{0:,d}".format(prot), os.path.split(fasta_file)[1]), file=obj)
        if head > prot:
            print('...there were %s header lines' % ("{0:,d}".format(head),), file=obj)

    fasta_lib.time_stamp_logfile('>>> ending: count_fasta.py', log_obj)
    log_obj.close()
    return
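# Usage sketch (added; not part of the original count_fasta.py): fasta_counter()
# only needs the path to a FASTA file, and the mirrored log output goes to
# "fasta_utilities.log" in the same folder. The path below is a made-up example.
#
#     fasta_counter(r'C:\Xcalibur\database\example_proteome.fasta')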
# browse to the database
database = r"C:\Xcalibur\database"
if not os.path.exists(database):
    database = os.getcwd()
fasta_file = fasta_lib.get_file(database, [('FASTA files', '*.fasta')],
                                'Select a TriTryp FASTA database')
if fasta_file == '':
    sys.exit()  # cancel button response

# build new database name
new_fasta_file = os.path.basename(fasta_file)
new_fasta_file = new_fasta_file.replace('.fasta', '_fixed.fasta')
new_fasta_file = os.path.join(os.path.dirname(fasta_file), new_fasta_file)

# initializations
proteins = []
p = fasta_lib.Protein()
pcount = 0
stop_count = 0
gap_count = 0
no_met = 0

# read the sequences into a list
f = fasta_lib.FastaReader(fasta_file)
while f.readNextProtein(p, check_for_errs=True):
    pcount += 1

    # parse the description string into a dictionary
    try:
        items = [x.strip() for x in p.description.split('|') if x]
        header_dict = {x.split('=')[0]: x.split('=')[1] for x in items}
        new_desc = []
def main(taxon_dict):
    """Extracts entries by taxon ID from both Sprot and Trembl databases.
    """
    print('=============================================================================')
    print(' uniprot_extract_from_both.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=============================================================================')

    # get the UniProt folder and then get the sprot and trembl database names
    DB = []
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    uniprot_folder = fasta_lib.get_folder(default, title_string='Select a UniProt download folder')
    if uniprot_folder == '':
        sys.exit()  # cancel button response

    version = uniprot_folder.split('_')[-1]
    uniprot_db = 'uniprot'
    for files in os.listdir(uniprot_folder):
        if files.startswith('uniprot_') and files.endswith('.gz'):
            DB.append(os.path.join(uniprot_folder, files))
    if len(DB) != 2:
        print('WARNING: either sprot or trembl DB was missing')

    # create a log file to mirror screen output
    log_obj = open(os.path.join(uniprot_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: uniprot_extract_from_both.py', log_obj)

    # make the smaller uniprot dictionaries
    (sci_to_taxon, id_to_taxon) = fasta_lib.make_uniprot_to_taxon(uniprot_folder)

    # make the more complete dictionary
    name_to_taxon = fasta_lib.make_all_names_to_taxon(uniprot_folder)

    # print the list of taxon numbers that will be extracted
    # NOTE: Any taxon numbers present in analysis text file will not be expanded.
    taxon_list = list(taxon_dict.items())
    taxon_list.sort()
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' % (i + 1, t[0], t[1]), file=obj)

    # expand any group taxon numbers
    if EXPAND_GROUPS:
        fasta_lib.expand_species(uniprot_folder, 'uniprot', taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT)

    # initialize dictionaries and counters
    taxon_files, taxon_count, name_count = {}, {}, {}
    for taxon, name in taxon_dict.items():
        fname = uniprot_db + '_' + version + '_' + name + '.fasta'
        fname = os.path.join(uniprot_folder, fname)
        taxon_files[name] = fname
        taxon_count[taxon] = 0
        name_count[name] = 0

    # open the output filenames
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # want to count extracted sequences from each database
    name_counter = {}
    number_counter = {}

    # loop over both databases and extract species
    duplicates = {}
    for i in range(len(DB)):
        prot_read = 0
        not_found = 0
        for value in taxon_dict.values():
            name_counter[value] = 0
        for key in taxon_dict.keys():
            number_counter[key] = 0

        # create a FastaReader object, initialize counters, and start reading
        uniprot_file = DB[i]
        x = fasta_lib.FastaReader(uniprot_file)
        prot = fasta_lib.Protein()
        for obj in write:
            print('...reading %s and extracting entries...' % (os.path.split(uniprot_file)[1],), file=obj)

        # NOTE: checking for errors will slow program execution, use if needed
        while x.readNextProtein(prot, check_for_errs=False):
            prot_read += 1
            if (prot_read % 500000) == 0:
                print('......(%s proteins read...)' % ("{0:,d}".format(prot_read),))
            (spec_id, spec_name) = fasta_lib.uniprot_parse_line(prot.accession + ' ' + prot.description)
            taxon = sci_to_taxon.get(spec_name, 0)      # first choice mapping
            taxon2 = name_to_taxon.get(spec_name, 0)    # alternative mapping
            if taxon == 0:          # first choice not present
                if taxon2 == 0:
                    not_found += 1
                else:
                    taxon = taxon2  # use second choice
            else:
                if (taxon != taxon2) and (taxon2 > 0):  # keep track of multiple taxon numbers
                    duplicates[spec_name] = (taxon, taxon2)
            if taxon_dict.get(taxon, False):
                if CLEAN_ACCESSIONS:
                    prot.parseUniProt()

                # taxon number matches, so write the protein to respective output file(s)
                name = taxon_dict[taxon]
                name_counter[name] += 1
                name_count[name] += 1
                taxon_count[taxon] += 1
                number_counter[taxon] += 1
                f = taxon_files[name]
                prot.printProtein(f)

        # print extraction stats for each database
        for obj in write:
            print('...%s protein entries in %s' %
                  ("{0:,d}".format(prot_read), os.path.split(DB[i])[1]), file=obj)
            print('...%s proteins had unknown taxon numbers' % ("{0:,d}".format(not_found),), file=obj)
            numbers = list(number_counter.keys())
            numbers.sort()
            if VERBOSE:
                for j, number in enumerate(numbers):
                    if number_counter[number] > 0:
                        print('......(%s) taxon %s had %s proteins' %
                              (j + 1, number, "{0:,d}".format(number_counter[number])), file=obj)
            names = list(name_counter.keys())
            names.sort()
            db_name = os.path.split(DB[i])[1]
            for j, name in enumerate(names):
                print('......(%s) %s %s proteins extracted' %
                      (j + 1, "{0:,d}".format(name_counter[name]), name), file=obj)

    # close the extracted database files
    for f in taxon_files.values():
        f.close()

    # print list of mis-matched taxon number warnings
    if MISMATCHES:
        for i, (name, pair) in enumerate(duplicates.items()):
            for obj in write:
                print('......(%s) WARNING: %s and %s map to "%s"' % (i + 1, pair[0], pair[1], name), file=obj)

    # print out the final summary stuff
    for obj in write:
        if VERBOSE:
            print('...combined taxon counts...', file=obj)
            numbers = list(taxon_count.keys())
            numbers.sort()
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print('......(%s) taxon %s had %s proteins' %
                          (i + 1, number, "{0:,d}".format(taxon_count[number])), file=obj)
        print('...combined output file counts...', file=obj)
        for i, name in enumerate(names):
            print('......(%s) %s total proteins written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]),
                   uniprot_db + '_' + version + '_' + name + '.fasta'), file=obj)

    fasta_lib.time_stamp_logfile('>>> ending: uniprot_extract_from_both.py', log_obj)
    log_obj.close()
    return
def make_fasta_files(self, uniprot_dir_path, entry):
    """Uncompresses canonical FASTA file and does some analysis.
    Also combines fasta and additional fasta files with decompression.
    """
    # Get the list of protein fasta files
    temp_files = ["{}_{}".format(self.date, x) for x in entry.ftp_download_list
                  if 'fasta' in x.lower()]
    fasta_files = []
    combined_files = []
    for f in temp_files:
        if not self.banned_file(f):
            fasta_files.append(f)
    fasta_files.sort()

    fasta_file = fasta_files[0].replace('.fasta.gz', '')
    fasta_file = fasta_file + '_' + entry.short_name + '_canonical.fasta'
    combined_files.append(fasta_file)
    fasta_obj_list = [open(os.path.join(uniprot_dir_path, fasta_file), 'w')]
    if len(fasta_files) == 2:
        fasta_file = fasta_files[1].replace('_additional.fasta.gz', '')
        fasta_file = fasta_file + '_' + entry.short_name + '_all.fasta'
        fasta_obj_list.append(open(os.path.join(uniprot_dir_path, fasta_file), 'w'))
        combined_files.append(fasta_file)

    # Set up to read the fasta file entries and init counters
    print('proteome:', entry.proteome_ID, 'species:', entry.species_name)
    p = fasta_lib.Protein()

    # Read entries and write to new file
    for i, fasta in enumerate(fasta_files):
        sp_count = 0
        iso_count = 0
        tr_count = 0
        p_count = 0
        f = fasta_lib.FastaReader(os.path.join(uniprot_dir_path, entry.download_folder_name, fasta))
        while f.readNextProtein(p, False):
            p_count += 1
            if p.accession.startswith('sp|'):
                sp_count += 1
            if p.accession.startswith('tr|'):
                tr_count += 1
            if ('-' in p.accession) or ('Isoform of' in p.description):
                iso_count += 1
            if i == 0:
                for obj in fasta_obj_list:
                    p.printProtein(obj)
            else:
                p.printProtein(fasta_obj_list[i])

        # Print stats
        print('...database:', fasta)
        print('......tot_count: %s, sp count: %s, tr count: %s, isoform count: %s' %
              ("{0:,}".format(p_count), "{0:,}".format(sp_count),
               "{0:,}".format(tr_count), "{0:,}".format(iso_count)))

    # Close output file(s)
    for obj in fasta_obj_list:
        obj.close()

    # chdir into correct folder and make sure all file paths are set up correctly
    uniprot_dir_name = r"UniProt_{}".format(self.date)
    os.chdir(os.path.join(self.abs_download_path, uniprot_dir_name))

    # Add forward/reverse/contams
    for file in combined_files:
        self.database_processing(file, self.contams_database)
def main(fasta_file):
    """Checks entries in a FASTA protein database for identical duplicates.
    Call with FASTA filename, returns a couple of dictionaries
    """
    print('=======================================================================')
    print(' check_for_duplicates.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=======================================================================')

    # set up the output file names, etc.
    folder = os.path.split(fasta_file)[0]
    out_file = os.path.join(folder, 'duplicates.txt')
    out_obj = open(out_file, 'w')

    # create instances of reader object and protein object, initialize counter
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot = 0
    head = 0
    dup = 0
    conflict = {}

    # read proteins until EOF
    while f.readNextProtein(p, check_for_errs=False):
        prot += 1
        control_A = p.description.count(chr(1))
        head = head + control_A + 1
        dup_data = (p.seqlenProtein(), p.molwtProtein())
        value = p.accession
        duplicate = conflict.get(dup_data, False)
        if duplicate:
            dup += 1
            print('...WARNING: (protein no. %s) %s may be same as %s' %
                  (prot, p.accession, duplicate), file=out_obj)
        else:
            conflict[dup_data] = value

    # print result and return
    print('...there are %s proteins in %s' % (prot, os.path.split(fasta_file)[1]))
    if head > prot:
        print('...there were %s header lines...' % (head,))
    print('...there were %s possible duplicates...' % (dup,))
    out_obj.close()

    # rewind out file and build new dictionaries
    out_obj = open(out_file, 'r')
    conflict = {}
    to_save = {}
    while True:
        line = out_obj.readline()
        if not line:
            break
        else:
            line = line.split()
            conflict[line[4]] = line[9]
            to_save[line[4]] = True
            to_save[line[9]] = True
    out_obj.close()

    # read in and save the proteins that might be duplicates of each other
    candidates = []
    i = 0
    dup2 = 0
    index = {}
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    while f.readNextProtein(p, check_for_errs=False):
        if to_save.get(p.accession, False):
            candidates.append(copy.deepcopy(p))
            index[p.accession] = i
            i += 1
    if len(candidates) == 0:
        print('to_save_dictionary:', to_save)
        print('bailing out in middle')
        return (candidates, index)

    # look deeper to see if candidates are actually duplicates
    out_obj = open(out_file, 'a')
    print('\n========================================\n', file=out_obj)
    exact_dup = 0
    accessions = list(conflict.keys())
    accessions.sort()
    for acc in accessions:
        p_dup = candidates[index[acc]]
        dup_acc = conflict[acc]
        p_ref = candidates[index[dup_acc]]
        if p_ref.sequence == p_dup.sequence:
            exact_dup += 1
            print('...(%s) WARNING: %s exact match to %s' %
                  (exact_dup, p_ref.accession, p_dup.accession), file=out_obj)
    print('...number of exact matches was', exact_dup)
    return (candidates, index)
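# Note (added): the duplicate search above is a two-pass design. The first pass
# keys every protein on the cheap (sequence length, molecular weight) pair and only
# records collisions in "duplicates.txt"; the second pass re-reads the file, keeps
# just the flagged candidates, and compares their actual sequences, so full string
# comparisons are limited to a small subset of the database.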
def main(fasta_file):
    """Checks entries in a FASTA protein database for identical duplicates.
    Call with FASTA filename, returns a couple of dictionaries
    """
    print('====================================================================')
    print(' remove_duplicates.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('====================================================================')
    print('\n Analysis results will be written to "duplicates.txt"')

    # set up the output file names, etc.
    folder = os.path.split(fasta_file)[0]
    out_file = os.path.join(folder, 'duplicates.txt')
    out_obj = open(out_file, 'a')
    nr_database = ''    # default if no recognized extension matches
    for end in ['.fasta', '.fasta.gz', '.fa.gz']:
        if fasta_file.endswith(end):
            nr_database = fasta_file.replace(end, '_nonredun.fasta')
    if (not nr_database) or (nr_database == fasta_file):
        nr_database = fasta_file + '_nonredun.fasta'
    nr_obj = open(nr_database, 'w')
    write = [None, out_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: remove_duplicates.py', out_obj)

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot, head, dup = 0, 0, 0
    candidates = {}     # dictionary of acc:protein lists
    conflicts = {}      # keeps track of seq len and MW

    # read proteins until EOF
    while f.readNextProtein(p, check_for_errs=False):
        prot += 1
        control_A = p.description.count(chr(1))
        head = head + control_A + 1
        dup_data = (p.seqlenProtein(), p.molwtProtein())
        duplicate = conflicts.get(dup_data, False)
        if duplicate:
            dup += 1
            if candidates.get(duplicate, False):
                candidates[duplicate].append(copy.deepcopy(p))
            else:
                candidates[duplicate] = [copy.deepcopy(p)]
        else:
            conflicts[dup_data] = p.accession

    # get list of proteins to test for identity
    to_test = {}
    for key in candidates.keys():
        to_test[key] = True

    # copy proteins to "nr" file, checking for duplicates
    dup = 0
    skip = {}
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    print('Processing:', fasta_file, file=out_obj)    # header line to log file
    while f.readNextProtein(p, check_for_errs=False):
        if to_test.get(p.accession, False):
            dup += find_identities(p, candidates, skip, fasta_file, out_obj)
        if skip.get(p.accession, False):
            continue
        p.printProtein(nr_obj)

    for obj in [None, out_obj]:
        print('\nThere were', prot, 'total sequences in:', os.path.basename(fasta_file), file=obj)
        print('There were', dup, 'identical sequences removed\n\n', file=obj)
        try:
            obj.close()
        except AttributeError:
            pass
    return
def fasta_checker(fasta_file, write):
    """Checks FASTA files for non-standard amino acid characters.
    """
    for obj in write:
        print(" database:", os.path.basename(fasta_file), file=obj)

    # initializations
    proteins = []
    p = fasta_lib.Protein()

    # counters
    prot_count = 0
    no_start_met = 0
    stop_count = 0
    stop_end = 0
    gap_count = 0
    B_count = 0
    J_count = 0
    O_count = 0
    U_count = 0
    X_count = 0
    Z_count = 0

    # read the sequences into a list
    f = fasta_lib.FastaReader(fasta_file)
    while f.readNextProtein(p, check_for_errs=True):
        prot_count += 1

        # test for odd amino acids, stop codons, gaps
        if not p.sequence.startswith('M'):
            no_start_met += 1
        if p.sequence.endswith('*'):
            stop_end += 1
        if '*' in p.sequence:
            stop_count += 1
        if '-' in p.sequence:
            gap_count += 1
        if 'B' in p.sequence:
            B_count += 1
        if 'J' in p.sequence:
            J_count += 1
        if 'O' in p.sequence:
            O_count += 1
        if 'U' in p.sequence:
            U_count += 1
        if 'X' in p.sequence:
            X_count += 1
        if 'Z' in p.sequence:
            Z_count += 1

        # save the protein in list
        proteins.append(copy.deepcopy(p))

    # check for duplicates and count
    duplicate_count = 0
    mw_dict = {}
    for i, p in enumerate(proteins):
        if mw_dict.get(str(p.molwtProtein()), False):
            j = mw_dict[str(p.molwtProtein())]
            if p.sequence == proteins[j].sequence:
                duplicate_count += 1
        else:
            mw_dict[str(p.molwtProtein())] = i

    # print out the report of oddball characters
    for obj in write:
        print(" total number of input sequences was:", prot_count, file=obj)
        print(" number of redundant sequences was:", duplicate_count, file=obj)
        print(" translations that do not start with Met:", no_start_met, file=obj)
        print(" translations that ended with a stop codon:", stop_end, file=obj)
        print(" translations that had premature stop codons:", stop_count, file=obj)
        print(" translations that contained gaps:", gap_count, file=obj)
        print(" translations that had B (ambiguous N/D):", B_count, file=obj)
        print(" translations that had J (ambiguous I/L):", J_count, file=obj)
        print(" translations that had O (pyrrolysine):", O_count, file=obj)
        print(" translations that had U (selenocysteine):", U_count, file=obj)
        print(" translations that had X (unknown amino acid):", X_count, file=obj)
        print(" translations that had Z (ambiguous Q/E):", Z_count, file=obj)
    return
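# Usage sketch (added; not part of the original script): the "write" argument is the
# same [None, log_obj] convention used elsewhere in these scripts, where None means
# print to the console. File names below are made-up examples.
#
#     fasta_checker('example_proteome.fasta', [None])                  # screen only
#
#     with open('fasta_utilities.log', 'a') as log_obj:
#         fasta_checker('example_proteome.fasta', [None, log_obj])     # screen and log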
def fasta_add_extras(extra_file, fasta_file, output_file):
    """Adds contaminants and reverses entries in a FASTA protein database.
    Called with FASTA filename. Reversed DB written to same location.
    Options for separate or concatenated output files.
    """
    decoy_string = 'REV_'   # the string to denote decoy sequences

    print('=========================================================================')
    print(' add_extras_and_reverse.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=========================================================================')

    # open the "forward" and "reversed" output files
    _file = os.path.splitext(output_file)[0] + '.fasta'
    for_name = _file.replace('.fasta', '_for.fasta')
    for_file_obj = open(for_name, 'w')
    rev_name = _file.replace('.fasta', '_rev.fasta')
    rev_file_obj = open(rev_name, 'w')

    # create a log file to mirror screen output
    _folder = os.path.split(fasta_file)[0]
    log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: add_extras_and_reverse.py', log_obj)

    # create instances of reader object and protein object
    # add the extra sequences, accessions changed to "EXTRA_dddd"
    # NOTE: can only add up to 9999 sequences without code modification...
    prot = fasta_lib.Protein()
    pcount = 0
    f = fasta_lib.FastaReader(extra_file)

    # turn on error checking for extra sequences
    while f.readNextProtein(prot, check_for_errs=True):
        pcount += 1

        # try to clean up original accessions
        if CLEAN_ACCESSIONS:
            if prot.accession.startswith('gi|'):
                prot.parseNCBI(REF_SEQ_ONLY)
            elif prot.accession.startswith('sp|') or prot.accession.startswith('tr|'):
                prot.parseUniProt(KEEP_UNIPROT_ID)
            elif prot.accession.startswith('IPI:'):
                prot.parseIPI(KEEP_IPI_GENE_ID)
            else:
                pass

        # add old accession to description and make new accession
        # write original sequence to "forward" file and reversed to "reverse"
        prot.new_desc = '[%s] %s' % (prot.new_acc, prot.new_desc)
        prot.new_acc = 'EXTRA_%04d' % (pcount,)
        prot.accession = 'EXTRA_%04d' % (pcount,)
        prot.printProtein(for_file_obj)
        rev = prot.reverseProtein(decoy_string)
        rev.printProtein(rev_file_obj)

    for obj in write:
        print('...there were %s extra sequences in %s' %
              (pcount, os.path.split(extra_file)[1]), file=obj)

    # now add the contaminants
    try:
        if os.path.exists(CONTAMS):
            contams_file = CONTAMS
        else:
            path = os.path.split(fasta_file)[0]
            contams_file = os.path.join(path, CONTAMS)
        f = fasta_lib.FastaReader(contams_file)
        contams = 0
        while f.readNextProtein(prot, check_for_errs=True):
            pcount += 1
            contams += 1
            if CLEAN_ACCESSIONS:
                prot.parseCONT()

            # write sequences to respective files
            prot.printProtein(for_file_obj)
            rev = prot.reverseProtein(decoy_string)
            rev.printProtein(rev_file_obj)
        for obj in write:
            print('...there were %s contaminant entries in %s' % (contams, contams_file), file=obj)
    except:
        for obj in write:
            print('...WARNING:', CONTAMS, 'not found!', file=obj)

    # read proteins, clean up accessions, descriptions until EOF
    # write proteins to "forward" and "reversed" files
    f = fasta_lib.FastaReader(fasta_file)

    # checking for errors can slow program execution by factor of 3-4
    # Reading and writing sequences will always remove spaces and blank lines
    while f.readNextProtein(prot, check_for_errs=False):
        pcount += 1
        if CLEAN_ACCESSIONS:
            if prot.accession.startswith('gi|'):
                prot.parseNCBI(REF_SEQ_ONLY)
            elif prot.accession.startswith('sp|') or prot.accession.startswith('tr|'):
                prot.parseUniProt(KEEP_UNIPROT_ID)
            elif prot.accession.startswith('IPI:'):
                prot.parseIPI(KEEP_IPI_GENE_ID)
            else:
                pass
        prot.printProtein(for_file_obj)     # write to "forward" file
        rev = prot.reverseProtein(decoy_string)
        rev.printProtein(rev_file_obj)      # write to "reversed" file

    # make concatenated output file if desired and print summary stats
    if MAKE_SEPARATE_BOTH:
        both_name = _file.replace('.fasta', '_both.fasta')
        both_file_obj = open(both_name, 'w')
        for_file_obj.close()
        for_file_obj = open(for_name, 'r')
        rev_file_obj.close()
        rev_file_obj = open(rev_name, 'r')
        while True:
            line = for_file_obj.readline()
            if not line:
                break
            both_file_obj.write(str(line))
        while True:
            line = rev_file_obj.readline()
            if not line:
                break
            both_file_obj.write(str(line))
        both_file_obj.close()
        for obj in write:
            print('...%s total proteins written to %s' %
                  (2 * pcount, os.path.split(both_name)[1]), file=obj)

    if MAKE_SEPARATE_FORWARD:
        for obj in write:
            print('...%s proteins written to %s' %
                  (pcount, os.path.split(for_name)[1]), file=obj)
    if MAKE_SEPARATE_REVERSED:
        for obj in write:
            print('...%s proteins reversed and written to %s' %
                  (pcount, os.path.split(rev_name)[1]), file=obj)

    # close files and delete unwanted files
    for_file_obj.close()
    rev_file_obj.close()
    fasta_lib.time_stamp_logfile('>>> ending: add_extras_and_reverse.py', log_obj)
    log_obj.close()
    if not MAKE_SEPARATE_FORWARD:
        os.remove(for_name)
    if not MAKE_SEPARATE_REVERSED:
        os.remove(rev_name)
    return
def main(fasta_file, up_one=False):
    """Processes one Ensembl fasta file - reformats description lines, checks things.
    up_one determines where the new file is written.
    """
    # create the new database name
    original_fasta_file = os.path.basename(fasta_file)
    new_fasta_file = original_fasta_file.replace('.fasta', '_fixed.fasta')
    if new_fasta_file == original_fasta_file:
        new_fasta_file = original_fasta_file.replace('.fa', '_fixed.fasta')
    if new_fasta_file == original_fasta_file:
        print('WARNING! creating new file name failed')
        print('...make sure database is not compressed')
        return False
    if new_fasta_file.endswith('.gz'):
        new_fasta_file = new_fasta_file[:-3]
    if up_one:
        folder_name = os.path.dirname(os.path.dirname(fasta_file))
    else:
        folder_name = os.path.dirname(fasta_file)
    new_fasta_file = os.path.join(folder_name, new_fasta_file)

    # initializations
    proteins = []
    accessions = {}
    p = fasta_lib.Protein()
    pcount = 0      # sequence count
    dup_count = 0   # duplicate accession count
    stop_count = 0  # "*"
    gap_count = 0   # "-"
    no_met = 0      # does not start with M
    X_count = 0     # unknown AA
    B_count = 0     # N or D
    Z_count = 0     # Q or E
    J_count = 0     # I or L
    U_count = 0     # selenocysteine

    # set up the list of possible tags in header lines
    # this should probably be generalized somehow...
    all_tags = ['pep:', 'pep scaffold:', 'pep genescaffold:', 'pep chromosome:',
                'pep contig:', 'pep reftig:', 'pep supercontig:', 'pep ultracontig:',
                'pep group:', 'gene:', 'transcript:', 'gene_biotype:',
                'transcript_biotype:', 'gene_symbol:', 'description:']

    # read the sequences into a list
    f = fasta_lib.FastaReader(fasta_file)
    while f.readNextProtein(p, check_for_errs=True):
        pcount += 1

        # check if accession already seen
        if p.accession in accessions:
            dup_count += 1
            accessions[p.accession] += 1
            print('...WARNING: skipping duplicate accession:', p.accession)
            continue
        else:
            accessions[p.accession] = 1

        # clean up the description string
        p.new_desc = parse_ensembl_header_line(p.description, all_tags)

        # test for odd amino acids, stop codons, gaps
        if not p.sequence.startswith('M'):
            no_met += 1
            p.new_desc = p.new_desc + ' (No starting Met)'
        if '*' in p.sequence:
            stop_count += 1
            cut = p.sequence.index('*')
            string = ' (Premature stop %s/%s)' % (cut, len(p.sequence))
            p.new_desc = p.new_desc + string
            p.sequence = p.sequence[:cut]
        if '-' in p.sequence:
            gap_count += 1
            p.new_desc = p.new_desc + ' (has gaps)'
        if 'B' in p.sequence:
            B_count += 1
            p.new_desc = p.new_desc + ' (has B)'
        if 'Z' in p.sequence:
            Z_count += 1
            p.new_desc = p.new_desc + ' (has Z)'
        if 'J' in p.sequence:
            J_count += 1
            p.new_desc = p.new_desc + ' (has J)'
        if 'U' in p.sequence:
            U_count += 1
            p.new_desc = p.new_desc + ' (has U)'
        if 'X' in p.sequence:
            X_count += 1
            p.new_desc = p.new_desc + ' (has unknown X)'

        # save the protein in list
        proteins.append(copy.deepcopy(p))

    # open the new protein fasta file and write out the proteins
    fixcount = 0
    file_obj = open(new_fasta_file, 'w')
    for p in proteins:
        if len(p.sequence) > 0:
            p.printProtein(file_obj)
        else:
            print(' empty sequence (stop codon at start):', p.accession)
        fixcount += 1
    file_obj.close()

    # print out the report of oddball characters
    print(" Ensembl database:", os.path.basename(fasta_file))
    print(" translations that do not start with Met:", no_met)
    print(" translations that have premature stop codons:", stop_count)
    print(" translations that contain gaps:", gap_count)
    print(" translations that contain X (unknowns):", X_count)
    print(" translations that contain B:", B_count)
    print(" translations that contain Z:", Z_count)
    print(" translations that contain J:", J_count)
    print(" translations that contain U:", U_count)
    print(" total number of input sequences was:", pcount)
    print(" total number of sequences written was:", fixcount)
    print(" number of duplicate accessions was:", dup_count)
    return new_fasta_file
def fasta_digester(fasta_file, enzyme='trypsin', log=[None]):
    """Trypsin digests entries in a FASTA protein database.
    Call with FASTA filename, returns list of proteins
    with theoretical tryptic digest peptide lists
    Checks for duplicate accessions and (optional) valid characters.
    """
    print('==================================================================')
    print(' fasta_digester.py, v 1.1.3, written by Phil Wilmarth, OHSU, 2017 ')
    print('==================================================================')

    # compile the regex for desired digestion
    if enzyme.upper() == 'No_enzyme'.upper():
        regex = re.compile(r".")
    elif enzyme.upper() == 'trypsin'.upper():           # checked
        regex = re.compile(r".(?:(?<![KR](?!P)).)*")
    elif enzyme.upper() == 'trypsin-P'.upper():         # checked
        regex = re.compile(r".(?:(?<![KR]).)*")
    elif enzyme.upper() == 'Lys-C'.upper():             # checked
        regex = re.compile(r".(?:(?<![K](?!P)).)*")
    elif enzyme.upper() == 'Lys-C-P'.upper():           # checked
        regex = re.compile(r".(?:(?<![K]).)*")
    elif enzyme.upper() == 'Lys-N'.upper():             # checked
        regex = re.compile(r".(?:(?![K]).)*")
    elif enzyme.upper() == 'Arg-C'.upper():             # checked
        regex = re.compile(r".(?:(?<![R](?!P)).)*")
    elif enzyme.upper() == 'Asp-N'.upper():             # checked
        regex = re.compile(r".(?:(?![D]).)*")
    elif enzyme.upper() == 'CNBr'.upper():              # checked
        regex = re.compile(r".(?:(?<![M]).)*")
    elif enzyme.upper() == 'Glu-C'.upper():             # checked
        regex = re.compile(r".(?:(?<![DE](?!P)).)*")
    elif enzyme.upper() == 'PepsinA'.upper():           # checked
        regex = re.compile(r".(?:(?<![FL](?!P)).)*")
    elif enzyme.upper() == 'chymotrypsin'.upper():      # checked
        regex = re.compile(r".(?:(?<![FWYL](?!P)).)*")
    else:
        print('...WARNING: Enzyme:', enzyme, 'not recognized')
        regex = None

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot = 0
    head = 0
    all_peptides = {}
    print('starting file reading:', time.ctime())

    # read proteins until EOF; NOTE: checking for errors slows program by factor of 3 or 4
    while f.readNextProtein(p, check_for_errs=False):

        # digest protein sequence (regex expression, low mass cutoff, high mass cutoff,
        # minimum peptide length, maximum number of missed cleavages, type of masses)
        p.enzymaticDigest(regex, 500.0, 5000.0, 7, 2, 'mono')
        for pep in p.peptides:
            # mask I and L residues
            mass_spec_seq = re.sub(r'[IL]', 'j', pep.seq)

            # make dictionary of sequences and counts
            if all_peptides.get(mass_spec_seq):
                all_peptides[mass_spec_seq] += 1
            else:
                all_peptides[mass_spec_seq] = 1

        # count protein sequences
        prot += 1
        if (prot % 500000) == 0:
            print('......(%s proteins read...)' % (prot,))

        # count number of header elements
        control_A = p.description.count(chr(1))
        head = head + control_A + 1

    # print number of proteins/headers and return peptide dictionary
    for obj in log:
        print('There are %s proteins in %s' %
              ("{0:,d}".format(prot), os.path.basename(fasta_file)), file=obj)
        if head > prot:
            print('There were %s header lines' % (head,), file=obj)
    return all_peptides
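# Illustration (added; not part of the original fasta_digester.py): a minimal sketch
# of what the trypsin pattern does on its own, independent of fasta_lib. re.findall()
# returns fully tryptic fragments (cleave after K or R unless the next residue is P).
# The sequence below is a made-up example; p.enzymaticDigest() presumably builds on
# matches like these before applying the mass/length/missed-cleavage filters.
def _demo_trypsin_regex():
    import re
    trypsin = re.compile(r".(?:(?<![KR](?!P)).)*")
    peptides = trypsin.findall('MKTAYIAKQRPLMDK')
    # -> ['MK', 'TAYIAK', 'QRPLMDK']; 'QRPLMDK' stays intact because R is followed by P
    return peptides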
def main(string_dict):
    """Main program to extract entries containing strings from databases.

    Simple string search of pattern in combined accession/description lines.
    Logical OR if more than one pattern is mapped to the same outfile.
    Each matching protein is written once per output file with possible
    compound header (nr) of all headers containing matching patterns.

    If "cleaning" of accessions/descriptions is turned on for NCBI nr databases,
    only the first header element will be retained and any accession number
    cross-references will be lost.

    Written by Phil Wilmarth, OHSU, 2009.
    """
    print('=====================================================================')
    print(' extract_by_string.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=====================================================================')

    # set some file paths and names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    db_file = fasta_lib.get_file(default,
                                 [('Zipped files', '*.gz'), ('Fasta files', '*.fasta')],
                                 title_string='Select a FASTA database')
    if db_file == '':
        sys.exit()  # cancel button response

    db_folder, db_name = os.path.split(db_file)
    base_name = db_name.replace('.gz', '')
    if not base_name.endswith('.fasta'):
        base_name = base_name + '.fasta'

    # create a log file to mirror screen output
    log_obj = open(os.path.join(db_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: extract_by_string.py', log_obj)

    # print the list of patterns that will be extracted
    string_list = list(string_dict.items())
    string_list.sort()
    for obj in write:
        print('...extracting entries containing these strings:', file=obj)
        for i, t in enumerate(string_list):
            print('......(%s) string "%s" to file ending in "%s"' % (i + 1, t[0], t[1]), file=obj)

    # open the output databases, initialize counters
    string_files = {}
    string_count = {}
    name_count = {}
    for string, name in string_dict.items():
        fname = base_name.replace('.fasta', '_' + name + '.fasta')
        fname = os.path.join(db_folder, fname)
        string_files[name] = fname
        string_count[string] = 0
        name_count[name] = 0
    for name in string_files.keys():
        string_files[name] = open(string_files[name], 'w')

    # create a FastaReader object, initialize counters, and start reading
    x = fasta_lib.FastaReader(db_file)
    prot = fasta_lib.Protein()
    prot_read = 0
    for obj in write:
        print('...reading %s and extracting entries...' % (db_name,), file=obj)

    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 500000) == 0:
            print('......(%s proteins read...)' % ("{0:,d}".format(prot_read),))
        written = {}    # make sure protein is written only ONCE per OUTFILE
        header = prot.accession + ' ' + prot.description   # recreate the '>' line
        if not CASE_SENSITIVE:  # convert to uppercase
            header = header.upper()
        for pattern in string_dict.keys():
            new_pattern = pattern
            if not CASE_SENSITIVE:  # case insensitive matching
                new_pattern = new_pattern.upper()
            for head in header.split(chr(1)):   # check each header for matches
                if new_pattern in head:
                    name = string_dict[pattern]
                    name_header = written.get(name, '')
                    if name_header:
                        name_header = name_header + chr(1) + head
                        written[name] = name_header
                    else:
                        written[name] = head
                    string_count[pattern] += 1

        # write any matching proteins to appropriate out file
        for name in written.keys():
            name_count[name] += 1       # output file write counters
            f = string_files[name]      # output file pointers
            header = written[name]      # composite header of name's matches

            # set the accession and description fields before writing
            prot.accession = header.split()[0]
            prot.new_acc = prot.accession
            prot.description = header[(len(prot.accession) + 1):]
            prot.new_desc = prot.description
            if CLEAN_ACCESSIONS:
                if prot.accession.startswith('gi|'):
                    prot.parseNCBI(REF_SEQ_ONLY)
                elif prot.accession.startswith('sp|') or prot.accession.startswith('tr|'):
                    prot.parseUniProt(KEEP_UNIPROT_ID)
            prot.printProtein(f)    # write any matching proteins

    # close files
    for f in string_files.values():
        f.close()

    # print out the summary stuff
    for obj in write:
        print('...%s protein entries in %s' % ("{0:,d}".format(prot_read), db_name), file=obj)
        strings = list(string_count.keys())
        strings.sort()
        for i, string in enumerate(strings):
            print('......(%s) pattern "%s" was found in %s proteins' %
                  (i + 1, string, "{0:,d}".format(string_count[string])), file=obj)
        print('...output file summaries...', file=obj)
        names = list(string_files.keys())
        names.sort()
        for i, name in enumerate(names):
            temp = base_name.replace('.fasta', '_' + name + '.fasta')
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]), temp), file=obj)

    fasta_lib.time_stamp_logfile('>>> ending: extract_by_string.py', log_obj)
    log_obj.close()
    return
def main(fasta_file, forward=False, reverse=False, both=True, log_obj=None, contam_path=""):
    """Adds contaminants and reverses entries for a FASTA protein database.

    Call with single fasta file name.
    If "forward", make sequences plus contaminants,
    if "reverse", make reversed sequences with reversed contaminants,
    if "both", make concatenated target/decoy with contaminants.
    "contam_path" is optional fullpath name of a contaminants database to use instead of default
    """
    decoy_string = 'REV_'   # the string to denote decoy sequences

    ######################################
    # Change default contaminants file name here:
    CONTAMS = 'Thermo_contams.fasta'    # or pass in a "contam_path"
    ######################################

    # open the "forward" and "reversed" output files
    if fasta_file.lower().endswith('.gz'):
        _file = os.path.splitext(fasta_file[:-3])[0]
    else:
        _file = os.path.splitext(fasta_file)[0]
    for_name = _file + '_for.fasta'
    for_file_obj = open(for_name, 'w')
    rev_name = _file + '_rev.fasta'
    rev_file_obj = open(rev_name, 'w')

    # create the name for the concatenated file (if later needed)
    both_name = _file + '_both.fasta'

    # create a log file to mirror screen output
    _folder = os.path.split(fasta_file)[0]
    if not log_obj:
        log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: reverse_fasta.py', log_obj)

    # create instances protein object and initialize counter
    prot = fasta_lib.Protein()
    p_read = 0
    p_contam = 0

    # try to find the contaminants database file
    # If no contam file path provided, search for it in current directory
    _file = None
    if not contam_path:
        if os.path.exists(CONTAMS):
            _file = CONTAMS
        else:
            path = os.path.split(fasta_file)[0]
            if os.path.exists(os.path.join(path, CONTAMS)):
                _file = os.path.join(path, CONTAMS)
    elif os.path.exists(contam_path) and os.path.isfile(contam_path):
        _file = contam_path
    elif os.path.isdir(contam_path) and os.path.exists(os.path.join(contam_path, CONTAMS)):
        _file = os.path.join(contam_path, CONTAMS)

    # create reader and add contaminants (if contams file was found)
    if _file:
        f = fasta_lib.FastaReader(_file)
        while f.readNextProtein(prot, check_for_errs=True):
            p_contam += 1
            prot.printProtein(for_file_obj)
            rev = prot.reverseProtein(decoy_string)
            rev.printProtein(rev_file_obj)
        for obj in write:
            print('...there were %s contaminant entries in %s' %
                  ("{0:,d}".format(p_contam), os.path.split(_file)[1]), file=obj)
    else:
        for obj in write:
            print('...WARNING: contaminants were not added', file=obj)

    # read proteins until EOF and write proteins to "forward" and "reversed" files
    f = fasta_lib.FastaReader(fasta_file)

    # error checking slows program execution, turn on if needed.
    # Reading and writing sequences always removes spaces and blank lines.
    while f.readNextProtein(prot, check_for_errs=False):
        p_read += 1
        prot.printProtein(for_file_obj)     # write to "forward" file
        rev = prot.reverseProtein(decoy_string)
        rev.printProtein(rev_file_obj)      # write to "reversed" file

    for_file_obj.close()
    rev_file_obj.close()
    for obj in write:
        print('...%s proteins read from %s' %
              ("{0:,d}".format(p_read), os.path.split(fasta_file)[1]), file=obj)

    # make concatenated output file if desired and print summary stats
    if both:
        both_file_obj = open(both_name, 'w')
        for_file_obj = open(for_name, 'r')
        rev_file_obj = open(rev_name, 'r')
        while True:
            line = for_file_obj.readline()
            if not line:
                break
            both_file_obj.write(str(line))
        while True:
            line = rev_file_obj.readline()
            if not line:
                break
            both_file_obj.write(str(line))
        both_file_obj.close()
        for obj in write:
            print('...%s total proteins written to %s' %
                  ("{0:,d}".format(2 * (p_contam + p_read)), os.path.split(both_name)[1]), file=obj)

    if forward:
        for obj in write:
            print('...%s proteins written to %s' %
                  ("{0:,d}".format(p_contam + p_read), os.path.split(for_name)[1]), file=obj)
    if reverse:
        for obj in write:
            print('...%s proteins reversed and written to %s' %
                  ("{0:,d}".format(p_contam + p_read), os.path.split(rev_name)[1]), file=obj)

    # close files and delete unwanted files
    for_file_obj.close()
    rev_file_obj.close()
    fasta_lib.time_stamp_logfile('>>> ending: reverse_fasta.py', log_obj)
    log_obj.close()
    if not forward:
        os.remove(for_name)
    if not reverse:
        os.remove(rev_name)
    return
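# Usage sketch (added; not part of the original reverse_fasta.py): main() writes
# "<base>_for.fasta" and "<base>_rev.fasta" next to the input file and, with both=True,
# a concatenated target/decoy "<base>_both.fasta". The paths below are made-up examples.
#
#     main(r'C:\Xcalibur\database\example_proteome.fasta', forward=False,
#          reverse=False, both=True,
#          contam_path=r'C:\Xcalibur\database\Thermo_contams.fasta')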
def fasta_counter(fasta_file):
    """Counts entries in a FASTA protein database.
    Call with FASTA filename.
    Checks for duplicate accessions and valid aa characters.
    Computes protein sequence lengths, molecular weights - writes to TXT file (with DB basename)
    """
    # create a log file to mirror screen output
    _folder = os.path.split(fasta_file)[0]
    log_obj = open(os.path.join(_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: count_fasta.py', log_obj)

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot = 0
    head = 0
    conflict = {}

    # construct summary file name based on FASTA name
    if fasta_file.endswith('.fasta.gz'):
        summary_file = re.sub(r'.fasta.gz$', r'.txt', fasta_file)
    else:
        summary_file = re.sub(r'.fasta$', r'.txt', fasta_file)
    if summary_file == fasta_file:
        summary_file = fasta_file + '.txt'

    # open summary file and write header
    summary_obj = open(summary_file, mode='wt')
    summary_obj.write('Accession\tLength\tMW\n')

    # read proteins until EOF; NOTE: checking for errors slows program by factor of 3-4
    while f.readNextProtein(p, check_for_errs=True):

        # count protein sequences
        prot += 1
        if (prot % 500000) == 0:
            print('......(%s proteins read...)' % ("{0:,d}".format(prot),))

        # check for duplicate accession
        dup = conflict.get(p.accession, False)
        if dup:
            for obj in write:
                print('\n...WARNING: %s is already in FASTA database!\n' % (p.accession,), file=obj)
                if p.molwtProtein(show_errs=False) == conflict[p.accession]:
                    print('......possible duplicated sequence...', file=obj)
        else:
            conflict[p.accession] = p.molwtProtein(show_errs=False)

        # count number of header elements
        control_A = p.description.count(chr(1))
        head = head + control_A + 1

        # add info to summary_file
        print('\t'.join([p.accession,
                         str(p.seqlenProtein()),
                         str(round(p.molwtProtein(), 1))]), file=summary_obj)

    # print results and return
    for obj in write:
        print('...there are %s proteins in %s' %
              ("{0:,d}".format(prot), os.path.split(fasta_file)[1]), file=obj)
        if head > prot:
            print('...there were %s header lines' % ("{0:,d}".format(head),), file=obj)

    fasta_lib.time_stamp_logfile('>>> ending: count_fasta.py', log_obj)
    log_obj.close()
    summary_obj.close()
    return
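# Usage sketch (added; not part of the original script): the per-protein summary is a
# plain tab-separated file with an "Accession\tLength\tMW" header, so it can be re-read
# with the standard csv module. The path below is a made-up example.
#
#     import csv
#     with open(r'C:\Xcalibur\database\example_proteome.txt') as tsv:
#         rows = list(csv.DictReader(tsv, delimiter='\t'))
#         # rows[0] -> {'Accession': ..., 'Length': ..., 'MW': ...}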
def fasta_digester(fasta_file, enzyme='trypsin', low_mass=500.0, high_mass=5000.0,
                   min_length=7, missed_cleavages=2, mass_type='mono', log=None):
    """Trypsin digests entries in a FASTA protein database.
    Call with FASTA filename, returns list of proteins
    with theoretical tryptic digest peptide lists
    Checks for duplicate accessions and (optional) valid characters.
    """
    print('=======================================================================')
    print(' fasta_digest_unique.py, v 1.0, written by Phil Wilmarth, OHSU, 2021 ')
    print('=======================================================================')

    # compile the regex for desired digestion
    if enzyme.upper() == 'No_enzyme'.upper():
        regex = re.compile(r".")
    elif enzyme.upper() == 'trypsin'.upper():           # checked
        regex = re.compile(r".(?:(?<![KR](?!P)).)*")
    elif enzyme.upper() == 'trypsin-P'.upper():         # checked
        regex = re.compile(r".(?:(?<![KR]).)*")
    elif enzyme.upper() == 'Lys-C'.upper():             # checked
        regex = re.compile(r".(?:(?<![K](?!P)).)*")
    elif enzyme.upper() == 'Lys-C-P'.upper():           # checked
        regex = re.compile(r".(?:(?<![K]).)*")
    elif enzyme.upper() == 'Lys-N'.upper():             # checked
        regex = re.compile(r".(?:(?![K]).)*")
    elif enzyme.upper() == 'Arg-C'.upper():             # checked
        regex = re.compile(r".(?:(?<![R](?!P)).)*")
    elif enzyme.upper() == 'Asp-N'.upper():             # checked
        regex = re.compile(r".(?:(?![D]).)*")
    elif enzyme.upper() == 'CNBr'.upper():              # checked
        regex = re.compile(r".(?:(?<![M]).)*")
    elif enzyme.upper() == 'Glu-C'.upper():             # checked
        regex = re.compile(r".(?:(?<![DE](?!P)).)*")
    elif enzyme.upper() == 'PepsinA'.upper():           # checked
        regex = re.compile(r".(?:(?<![FL](?!P)).)*")
    elif enzyme.upper() == 'chymotrypsin'.upper():      # checked
        regex = re.compile(r".(?:(?<![FWYL](?!P)).)*")
    else:
        print('...WARNING: Enzyme:', enzyme, 'not recognized')
        regex = None

    # create instances of reader object and protein object, initialize counters
    f = fasta_lib.FastaReader(fasta_file)
    p = fasta_lib.Protein()
    prot = 0
    proteins = []
    all_peptides = {}
    print('starting file reading:', time.ctime())

    # read proteins until EOF; NOTE: checking for errors slows program by factor of 3 or 4
    while f.readNextProtein(p, check_for_errs=False):

        # digest protein sequence (regex expression, low mass cutoff, high mass cutoff,
        # minimum peptide length, maximum number of missed cleavages, type of masses)
        p.enzymaticDigest(regex, low_mass, high_mass, min_length, missed_cleavages, mass_type)

        # save all proteins that are read
        proteins.append(copy.copy(p))

        # count protein sequences
        prot += 1
        if (prot % 500000) == 0:
            print('......(%s proteins read...)' % (prot,))

    # print number of proteins/headers
    print('There are %s proteins in %s' %
          ("{0:,d}".format(prot), os.path.basename(fasta_file)), file=log)

    # make shared/unique status dictionary
    for p in proteins:
        for pep in p.peptides:
            # mask I and L residues
            mass_spec_seq = re.sub(r'[IL]', 'j', pep.seq)

            # make dictionary of sequences and counts
            if all_peptides.get(mass_spec_seq):
                all_peptides[mass_spec_seq].append(p.accession)
            else:
                all_peptides[mass_spec_seq] = [p.accession]

    keys = list(all_peptides.keys())
    print(keys[0], all_peptides[keys[0]])

    # print table (peptides from each protein, start, end, unique or not, protein list)
    print('\nAccession\tPeptide\tStart\tEnd\tMass\tMissed_Cleavages\tUnique\tOther_Proteins', file=log)
    for p in proteins:
        for pep in p.peptides:
            out_list = [p.accession]
            out_list += [pep.seq, str(pep.beg), str(pep.end), '%0.2f' % pep.mass, str(pep.missed)]
            mass_spec_seq = re.sub(r'[IL]', 'j', pep.seq)
            if len(all_peptides[mass_spec_seq]) == 1:
                out_list.append('TRUE')
            else:
                out_list.append('FALSE')
            acc_list = all_peptides[mass_spec_seq]
            if len(acc_list) == 1:
                acc_list = [' ']
            else:
                acc_list.remove(p.accession)
            out_list += ['; '.join(acc_list)]

            # print table rows
            print('\t'.join(out_list), file=log)
    return
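# Usage sketch (added; not part of the original fasta_digest_unique.py): the log
# argument is any writable text file object (or None for the console) and receives
# the tab-delimited peptide table. File names below are made-up examples.
#
#     with open('digest_table.txt', 'w') as log:
#         fasta_digester('example_proteome.fasta', enzyme='trypsin',
#                        low_mass=500.0, high_mass=5000.0, min_length=7,
#                        missed_cleavages=2, mass_type='mono', log=log)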
def main(taxon_dict):
    """Main program to extract entries by taxon ID from NCBI nr databases.

    Each gi number (of each header) is looked up to find associated taxon
    number for comparison to desired taxon numbers. A separate protein entry
    will be written for each desired taxon number even if all taxon numbers
    are written to the same output file. At the protein level, the extracted
    databases may no longer be non-redundant.

    If "cleaning" of accessions/descriptions is turned off, all headers
    matching the desired taxon numbers will be added to the respective
    protein preserving the usual NCBI nr formatting structure.

    If cleaning of accessions is turned on during extraction, some
    information may be lost. This could make subsequent database processing
    (such as extracting by text string) fail. Cleaning is best done as a
    last step (i.e. in "reverse_fasta.py").
    """
    print('====================================================================')
    print(' nr_extract_taxon.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('====================================================================')

    # set some file paths and names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    nr_file = fasta_lib.get_file(default,
                                 [('Zipped files', '*.gz'), ('Fasta files', '*.fasta')],
                                 title_string='Select an NCBI nr database')
    if nr_file == '':
        sys.exit()  # cancel button response

    ncbi_folder, nr_name = os.path.split(nr_file)
    nr_db = os.path.splitext(nr_name)[0]

    # create a log file to mirror screen output
    log_obj = open(os.path.join(ncbi_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: nr_extract_taxon.py', log_obj)

    # get the saved gi number to taxon number {int:int} dictionary
    acc_to_taxon = fasta_lib.AccToTaxon(ncbi_folder)
    acc_to_taxon.create_or_load(ncbi_folder)

    # print the list of taxon numbers that will be extracted
    original_dict = taxon_dict
    taxon_list = list(taxon_dict.items())
    taxon_list.sort()
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' % (i + 1, t[0], t[1]), file=obj)

    # expand any group taxon numbers. NOTE: if a taxon number appears in
    # "nr_fasta_analyze.txt", it will not be expanded. Either delete the
    # line in "nr_fasta_analyze.txt", or make an expanded "taxon_dict" by hand.
    if EXPAND_GROUPS:
        fasta_lib.expand_species(ncbi_folder, 'nr', taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT, REF_SEQ_ONLY)

    # open the output databases, initialize counters, etc.
    taxon_files = {}
    taxon_count = {}
    name_count = {}
    for taxon, name in taxon_dict.items():
        fname = nr_db + '_' + name + '.fasta'
        fname = os.path.join(ncbi_folder, fname)
        taxon_files[name] = fname
        name_count[name] = 0
        taxon_count[taxon] = 0

    # open the output filenames
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # loop over all proteins in nr
    x = fasta_lib.FastaReader(nr_file)
    prot = fasta_lib.Protein()
    prot_read = 0
    not_found = 0
    skipped = 0
    for obj in write:
        print('...reading %s and extracting entries...' % (nr_name,), file=obj)

    # checking for errors slows down program by about a factor of 3 or 4
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 1000000) == 0:
            print('......(%s proteins read...)' % ("{0:,d}".format(prot_read),))
        written = {}
        line = prot.accession + ' ' + prot.description
        prot.new_desc = ''

        # extract the gi numbers for each header
        for header in line.split(chr(1)):
            accession_with_version = header.split()[0]
            accession = accession_with_version.split('.')[0]
            if REF_SEQ_ONLY and '_' not in accession:
                continue    # skip proteins without RefSeq entries
            taxon = acc_to_taxon.get(accession, False)

            # see if taxon number for this gi is in our desired list
            if taxon:
                if taxon_dict.get(taxon, False):
                    if written.get(taxon, False):
                        # if taxon number already seen, add to header
                        prot = written[taxon]
                        prot.description = prot.description + chr(1) + header
                        written[taxon] = copy.deepcopy(prot)
                    else:
                        # first time taxon number seen
                        name = taxon_dict[taxon]
                        prot.accession = header.split()[0]
                        prot.description = header[len(prot.accession) + 1:]
                        prot.description = prot.description.rstrip()
                        taxon_count[taxon] += 1
                        name_count[name] += 1
                        written[taxon] = copy.deepcopy(prot)
                else:
                    skipped += 1
            else:
                not_found += 1
                continue

        # write a protein sequence for each taxon number it was matched to
        for taxon in written.keys():
            name = taxon_dict[taxon]
            f = taxon_files[name]
            prot = written[taxon]
            prot.new_desc = prot.description
            prot.new_acc = prot.accession
            if CLEAN_ACCESSIONS:
                prot.parseNCBI(REF_SEQ_ONLY)
            prot.printProtein(f)

    # print out number of matches and close files
    for obj in write:
        print('...%s proteins in %s' % ("{0:,d}".format(prot_read), nr_name), file=obj)
        print('...%s accessions did not have known taxon numbers' %
              ("{0:,d}".format(not_found),), file=obj)
        print('...%s accessions were skipped (not in our taxon list)' %
              ("{0:,d}".format(skipped),), file=obj)
        if REF_SEQ_ONLY:
            print('...Extracted sequences are RefSeq Only!!!', file=obj)
        if VERBOSE:
            numbers = list(taxon_count.keys())
            numbers.sort()
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print('......(%s) taxon number %s had %s proteins' %
                          (i + 1, number, "{0:,d}".format(taxon_count[number])), file=obj)
        print('...output file summaries...', file=obj)
        names = list(taxon_files.keys())
        names.sort()
        for i, name in enumerate(names):
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]), nr_db + '_' + name + '.fasta'), file=obj)

    fasta_lib.time_stamp_logfile('>>> ending: nr_extract_taxon.py', log_obj)
    log_obj.close()
    for f in taxon_files.values():
        f.close()
    return
def main(taxon_dict):
    """Main program to extract entries by taxon ID from uniprot databases.
    Extraction is from a single downloaded Sprot or Trembl database.
    """
    print('============================================================================')
    print(' uniprot_extract_from_one.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('============================================================================')

    # set some file paths and names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    uniprot_file = fasta_lib.get_file(default,
                                      [('Zipped files', '*.gz'), ('Fasta files', '*.fasta')],
                                      title_string='Select an Sprot or Trembl database')
    if uniprot_file == '':
        sys.exit()  # cancel button response

    uniprot_folder, uniprot_name = os.path.split(uniprot_file)
    version = uniprot_name.split('_')[-1]
    version = version.replace('.fasta.gz', '')
    uniprot_db = uniprot_name.split('_')[1]

    # create a log file to mirror screen output
    log_obj = open(os.path.join(uniprot_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]
    fasta_lib.time_stamp_logfile('\n>>> starting: uniprot_extract_from_one.py', log_obj)

    # make the smaller uniprot dictionaries
    (sci_to_taxon, id_to_taxon) = fasta_lib.make_uniprot_to_taxon(uniprot_folder)

    # make the more complete dictionary
    name_to_taxon = fasta_lib.make_all_names_to_taxon(uniprot_folder)

    # print the list of taxon numbers that will be extracted
    taxon_list = list(taxon_dict.items())
    taxon_list.sort()
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' % (i + 1, t[0], t[1]), file=obj)

    # expand any group taxon numbers
    # NOTE: Any taxon numbers present in analysis text file will not be expanded.
    if EXPAND_GROUPS:
        fasta_lib.expand_species(uniprot_folder, uniprot_db, taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT)

    # initialize dictionaries and counters
    taxon_files, taxon_count, name_count = {}, {}, {}
    for taxon, name in taxon_dict.items():
        fname = uniprot_db + '_' + version + '_' + name + '.fasta'
        fname = os.path.join(uniprot_folder, fname)
        taxon_files[name] = fname
        taxon_count[taxon] = 0
        name_count[name] = 0

    # open the output filenames
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # create a FastaReader object, initialize counters, and start reading
    x = fasta_lib.FastaReader(uniprot_file)
    prot = fasta_lib.Protein()
    prot_read = 0
    not_found = 0
    duplicates = {}
    for obj in write:
        print('...reading %s and extracting entries...' % (uniprot_name,), file=obj)

    # checking for errors in sequences slows program execution, use as needed
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 500000) == 0:
            print('......(%s proteins read...)' % ("{0:,d}".format(prot_read),))
        (spec_id, spec_name) = fasta_lib.uniprot_parse_line(prot.accession + ' ' + prot.description)
        taxon = sci_to_taxon.get(spec_name, 0)      # first choice mapping
        taxon2 = name_to_taxon.get(spec_name, 0)    # alternative mapping
        if taxon == 0:          # first choice not present
            if taxon2 == 0:
                not_found += 1
            else:
                taxon = taxon2  # use second choice
        else:
            if (taxon != taxon2) and (taxon2 > 0):  # keep track of multiple taxon numbers
                duplicates[spec_name] = (taxon, taxon2)
        if taxon_dict.get(taxon, False):
            if CLEAN_ACCESSIONS:
                prot.parseUniProt()

            # taxon number matches, so write the protein to the respective file
            name = taxon_dict[taxon]
            name_count[name] += 1
            taxon_count[taxon] += 1
            f = taxon_files[name]
            prot.printProtein(f)

    # close the extracted database files
    for f in taxon_files.values():
        f.close()

    # print list of mis-matching taxon number warnings
    if MISMATCHES:
        for i, (name, pair) in enumerate(duplicates.items()):
            for obj in write:
                print('......(%s) WARNING: %s and %s map to "%s"' % (i + 1, pair[0], pair[1], name), file=obj)

    # print out the summary stuff
    for obj in write:
        print('...%s protein entries in %s' % ("{0:,d}".format(prot_read), uniprot_name), file=obj)
        print('...%s proteins had unknown taxon numbers' % (not_found,), file=obj)
        if VERBOSE:
            numbers = list(taxon_count.keys())
            numbers.sort()
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print('......(%s) taxon %s had %s proteins' %
                          (i + 1, number, "{0:,d}".format(taxon_count[number])), file=obj)
        print('...output file summaries...', file=obj)
        names = list(taxon_files.keys())
        names.sort()
        for i, name in enumerate(names):
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]),
                   uniprot_db + '_' + version + '_' + name + '.fasta'), file=obj)

    fasta_lib.time_stamp_logfile('>>> ending: uniprot_extract_from_one.py', log_obj)
    log_obj.close()
    return