# NOTE(review): this "return" is the tail of a function defined above this chunk.
    return (candidates, index)
    # end


# setup stuff: check for command line args, etc.
if __name__ == '__main__':
    # check if database name passed on command line
    if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
        fasta_file = sys.argv[1]
    # if not, browse to database file
    else:
        if len(sys.argv) > 1:
            # a path was given but does not exist; warn before browsing
            print('...WARNING: %s not found...' % (sys.argv[1], ))
        database = r'C:\Xcalibur\database'   # preferred default location
        if not os.path.exists(database):
            database = os.getcwd()           # fall back to working directory
        fasta_file = fasta_lib.get_file(database,
                                        [('FASTA files', '*.fasta'),
                                         ('Zipped FASTA files', '*.gz'),
                                         ('All files', '*.*')],
                                        'Select a FASTA database')
        if fasta_file == '':
            sys.exit()  # cancel button response

    # call main function
    candidates, index = main(fasta_file)
    # end
# updated for Python 3 -PW 7/6/2017

import os
import sys   # FIX: sys was not imported, so sys.exit() below raised NameError
import copy

import fasta_lib

# print program name and version
print('============================================================')
print(' program TriTryp_fixer.py, v1.0.2, Phil Wilmarth, OHSU 2017 ')
print('============================================================')

# browse to the database (preferred location first, else working directory)
database = r"C:\Xcalibur\database"
if not os.path.exists(database):
    database = os.getcwd()
fasta_file = fasta_lib.get_file(database,
                                [('FASTA files', '*.fasta')],
                                'Select a TriTryp FASTA database')
if fasta_file == '':
    sys.exit()  # cancel button response

# build new database name: "<original>_fixed.fasta" in the same folder
new_fasta_file = os.path.basename(fasta_file)
new_fasta_file = new_fasta_file.replace('.fasta', '_fixed.fasta')
new_fasta_file = os.path.join(os.path.dirname(fasta_file), new_fasta_file)

# initializations (counters are incremented below this chunk;
# names suggest: interior stop codons, gap characters, missing start Met — TODO confirm)
proteins = []
p = fasta_lib.Protein()
pcount = 0
stop_count = 0
gap_count = 0
no_met = 0
def select_defaults_and_load(self):
    """Prompt for a default species list file, then load it.

    Stores the chosen path in ``self.selected_default`` and delegates
    parsing to ``self.load_defaults``.
    """
    extensions = [('Text files', '*.txt')]
    title = 'Select a default species list file'
    chosen = fasta_lib.get_file(self.script_path, extensions, title)
    self.selected_default = chosen
    self.load_defaults()
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.

Written by Phil Wilmarth, OHSU, 2016
"""

import os
import sys
import time
import copy

import fasta_lib

# control flags
# presumably True keeps contaminant entries in the subset DB — the logic
# that reads this flag is below this chunk; verify there.
KEEP_CONTAMS = False

# get the files, etc.
# first: the text file listing the accessions to extract
list_file_name = fasta_lib.get_file(os.getcwd(), [('Text files', '*.txt')],
                                    'Browse to accession list text file')
if not list_file_name:
    sys.exit()  # user cancelled
# results are written next to the accession list
results_location = os.path.split(list_file_name)[0]

# second: the FASTA database to subset
database_name = fasta_lib.get_file(r'C:\Xcalibur\database',
                                   [('FASTA files', '*.fasta')],
                                   'Select the database')
if not database_name:
    sys.exit()  # user cancelled

# third: where to save the subset database (default "<db>_subset.fasta")
new_name = os.path.split(database_name)[1]
new_name = os.path.splitext(new_name)[0]
subset_DB_name = fasta_lib.save_file(results_location,
                                     [('FASTA files', '*.fasta')],
                                     default_file=new_name + '_subset.fasta',
                                     title_string='Name of subset database')
if not subset_DB_name:
    sys.exit()  # user cancelled
# make sure the chosen name has a .fasta extension
if os.path.splitext(subset_DB_name)[1] == '':
    subset_DB_name += '.fasta'
def main(node_taxon):
    """Scan a species analysis file and report members of a taxonomy node.

    node_taxon: integer NCBI taxonomy number of the node of interest.

    Prompts for a "*_fasta_analyze.txt"-style file, builds a taxon-to-parent
    map from the local "taxdump.tar.gz" archive, then writes every analysis
    line whose lineage passes through node_taxon to a new
    "<analysis>_<node>.txt" file.  Returns None.
    """
    print('=======================================================================')
    print(' taxon_group_analyzer.py, v1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=======================================================================')

    # get the name of the database analysis text file
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    ext_list = [('Text files', '*.txt'), ('All files', '*.*')]
    analysis_file = fasta_lib.get_file(default, ext_list,
                                       'Select a species analysis file')
    if analysis_file == '':
        sys.exit()  # cancel button response
    analysis_folder, short_file = os.path.split(analysis_file)

    # build taxon -> parent-taxon mapping from NCBI "nodes.dmp"
    # (fields in nodes.dmp are separated by "\t|\t"; col 0 is taxon,
    # col 1 is its parent taxon)
    print('...making taxonomy nodes dictionary...')
    archive_name = os.path.join(analysis_folder, 'taxdump.tar.gz')
    archive = tarfile.open(archive_name)
    nodes = archive.extractfile('nodes.dmp')
    taxon_to_parent = {}
    while True:
        line = nodes.readline()
        line = line.decode('utf-8')
        line = line.rstrip()
        if not line:   # EOF (readline returned empty)
            break
        item = line.split('\t|\t')
        taxon_to_parent[int(item[0])] = int(item[1])
    nodes.close()
    archive.close()   # FIX: archive was left open in the original

    # open the fasta_analysis.txt file and find group members
    print('...scanning %s file...' % (short_file, ))
    fasta_analyze = open(analysis_file, 'r')
    out_name = analysis_file.replace('.txt', '_' + str(node_taxon) + '.txt')
    out_file = open(out_name, 'w')
    line = fasta_analyze.readline().rstrip()
    print('Analysis of node:', node_taxon, file=out_file)
    line = line.replace('A2:', 'A3:')   # adjust header label for extra column
    print(line, file=out_file)
    member = 0
    while True:
        line = fasta_analyze.readline()  # read analyze text file line
        if not line:
            break
        line = line.rstrip()
        tree = []                        # lineage: taxon numbers up to the root
        parent = line.split('\t')[1]
        try:
            parent = int(parent)
        except ValueError:               # FIX: was a bare except
            continue                     # skip rows without a numeric taxon column
        while parent != 1:               # all lineages end with taxon=1 (root)
            tree.append(parent)
            try:
                parent = taxon_to_parent[parent]
            except KeyError:
                break                    # unknown taxon: stop climbing this lineage
        tree.append(1)                   # add last lineage item
        if node_taxon in tree:           # desired node anywhere in the lineage?
            member += 1
            print(line, file=out_file)   # write lines of node members

    # close files (FIX: the input file close was commented out in the original)
    fasta_analyze.close()
    out_file.close()
    print('...taxonomy node %s had %s members...' % (node_taxon, member))
    return
def browse_contams(self):
    """Let the user pick a non-default contaminants FASTA file.

    Stores the chosen path in ``self.contams_database`` and updates the
    contaminants label widget with the file's base name.
    """
    file_types = [('Fasta files', '*.fasta')]
    picked = fasta_lib.get_file(self.script_path, file_types,
                                "Select a contaminants FASTA file")
    self.contams_database = picked
    base_name = os.path.split(self.contams_database)[1]
    self.contams_label.config(text=base_name)
if os.path.exists(sys.argv[3]): output_file = sys.argv[3] # if not, get database files with dialog boxes else: if len(sys.argv) > 1: for i, db in enumerate([extra_file, fasta_file, output_file]): if not db: print('...WARNING: %s not found...' % (sys.argv[i+1],)) database = r'C:\Xcalibur\database' if not os.path.exists(database): database = os.getcwd() print('Select the FASTA file with extra sequences') extra_file = fasta_lib.get_file(database, [('FASTA files', '*.fasta'), ('All files', '*.*')], 'Select Extra Sequences (FASTA format)') if extra_file == '': sys.exit() # cancel button response extra_name = os.path.split(extra_file)[1] extra_name = extra_name.split('.fasta')[0] print('Select the main FASTA file') fasta_file = fasta_lib.get_file(database, [('FASTA files', '*.fasta'), ('GZipped files', '*.gz'), ('All files', '*.*')], 'Select FASTA database file') if fasta_file == '': sys.exit() # cancel button response default = os.path.split(fasta_file)[0] fasta_name = os.path.split(fasta_file)[1] default_file = extra_name + '_' + fasta_name
def main(string_dict):
    """Main program to extract entries containing strings from databases.

    string_dict: maps search pattern -> output-file name tag.

    Simple string search of pattern in combined accession/description
    lines.  Logical OR if more than one pattern is mapped to the same
    outfile.  Each matching protein is written once per output file with
    possible compound header (nr) of all headers containing matching
    patterns.  If "cleaning" of accessions/descriptions is turned on for
    NCBI nr databases, only the first header element will be retained and
    any accession number cross-references will be lost.

    Written by Phil Wilmarth, OHSU, 2009.
    """
    print('=====================================================================')
    print(' extract_by_string.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('=====================================================================')

    # set some file paths and names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    # FIX: dialog title typo "FASTSA" corrected to "FASTA"
    db_file = fasta_lib.get_file(default,
                                 [('Zipped files', '*.gz'),
                                  ('Fasta files', '*.fasta')],
                                 title_string='Select a FASTA database')
    if db_file == '':
        sys.exit()  # cancel button response
    db_folder, db_name = os.path.split(db_file)
    base_name = db_name.replace('.gz', '')
    if not base_name.endswith('.fasta'):
        base_name = base_name + '.fasta'

    # create a log file to mirror screen output
    log_obj = open(os.path.join(db_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]   # print targets: console (None) and log file
    fasta_lib.time_stamp_logfile('\n>>> starting: extract_by_string.py', log_obj)

    # print the list of patterns that will be extracted
    string_list = list(string_dict.items())
    string_list.sort()
    for obj in write:
        print('...extracting entries containing these strings:', file=obj)
        for i, t in enumerate(string_list):
            print('......(%s) string "%s" to file ending in "%s"' %
                  (i + 1, t[0], t[1]), file=obj)

    # open the output databases, initialize counters
    string_files = {}   # name tag -> output path, then (below) open file object
    string_count = {}   # pattern -> number of matching headers seen
    name_count = {}     # name tag -> number of proteins written
    for string, name in string_dict.items():
        fname = base_name.replace('.fasta', '_' + name + '.fasta')
        fname = os.path.join(db_folder, fname)
        string_files[name] = fname
        string_count[string] = 0
        name_count[name] = 0
    for name in string_files.keys():
        string_files[name] = open(string_files[name], 'w')

    # create a FastaReader object, initialize counters, and start reading
    x = fasta_lib.FastaReader(db_file)
    prot = fasta_lib.Protein()
    prot_read = 0
    for obj in write:
        print('...reading %s and extracting entries...' % (db_name, ), file=obj)
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 500000) == 0:
            print('......(%s proteins read...)' %
                  ("{0:,d}".format(prot_read), ))
        written = {}   # make sure protein is written only ONCE per OUTFILE
        header = prot.accession + ' ' + prot.description   # recreate '>' line
        if not CASE_SENSITIVE:   # convert to uppercase
            header = header.upper()
        for pattern in string_dict.keys():
            new_pattern = pattern
            if not CASE_SENSITIVE:   # case insensitive matching
                new_pattern = new_pattern.upper()
            # nr headers are chr(1)-separated; check each sub-header
            for head in header.split(chr(1)):
                if new_pattern in head:
                    name = string_dict[pattern]
                    name_header = written.get(name, '')
                    if name_header:
                        # append: build a compound (chr(1)-separated) header
                        name_header = name_header + chr(1) + head
                        written[name] = name_header
                    else:
                        written[name] = head
                    string_count[pattern] += 1
        # write any matching proteins to appropriate out file
        for name in written.keys():
            name_count[name] += 1        # output file write counters
            f = string_files[name]       # output file pointers
            header = written[name]       # composite header of name's matches
            # set the accession and description fields before writing
            prot.accession = header.split()[0]
            prot.new_acc = prot.accession
            prot.description = header[(len(prot.accession) + 1):]
            prot.new_desc = prot.description
            if CLEAN_ACCESSIONS:
                if prot.accession.startswith('gi|'):
                    prot.parseNCBI(REF_SEQ_ONLY)
                elif prot.accession.startswith('sp|') or prot.accession.startswith('tr|'):
                    prot.parseUniProt(KEEP_UNIPROT_ID)
            prot.printProtein(f)         # write any matching proteins

    # close files
    for f in string_files.values():
        f.close()

    # print out the summary stuff
    for obj in write:
        print('...%s protein entries in %s' %
              ("{0:,d}".format(prot_read), db_name), file=obj)
        strings = list(string_count.keys())
        strings.sort()
        for i, string in enumerate(strings):
            print('......(%s) pattern "%s" was found in %s proteins' %
                  (i + 1, string, "{0:,d}".format(string_count[string])), file=obj)
        print('...output file summaries...', file=obj)
        names = list(string_files.keys())
        names.sort()
        for i, name in enumerate(names):
            temp = base_name.replace('.fasta', '_' + name + '.fasta')
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]), temp), file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: extract_by_string.py', log_obj)
    log_obj.close()
    return
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.""" import os import sys import copy import fasta_lib # bit scores of 75-100 seem to be above random scores MIN_BIT = 100.0 # get the two PAW results databases to blast against each other default = os.getcwd() print('Select the BLAST mapping file file') anno_file = fasta_lib.get_file(default, [('TXT files', '*.txt')], 'Select mapping file') if not anno_file: sys.exit() # cancel button was hit default = os.path.dirname(anno_file) print('Select the FASTA file') orig_database = fasta_lib.get_file(default, [('FASTA files', '*.fasta')], 'Select database') if not orig_database: sys.exit() # cancel button was hit new_database = orig_database.replace('.fasta', '_fixed.fasta') # echo database names to console output print('Mapping file:', os.path.basename(anno_file)) print('Original database:', os.path.basename(orig_database)) print('New database:', os.path.basename(new_database))
def main(taxon_dict):
    """Main program to extract entries by taxon ID from NCBI nr databases.

    taxon_dict: maps taxonomy number (int) -> output-file name tag (str).

    Each gi number (of each header) is looked up to find associated
    taxon number for comparison to desired taxon numbers.  A separate
    protein entry will be written for each desired taxon number even if
    all taxon numbers are written to the same output file.  At the
    protein level, the extracted databases may no longer be
    non-redundant.  If "cleaning" of accessions/descriptions is turned
    off, all headers matching the desired taxon numbers will be added to
    the respective protein preserving the usual NCBI nr formatting
    structure.  If cleaning of accessions is turned on during
    extraction, some information may be lost.  This could make
    subsequent database processing (such as extracting by text string)
    fail.  Cleaning is best done as a last step (i.e. in
    "reverse_fasta.py").
    """
    print('====================================================================')
    print(' nr_extract_taxon.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('====================================================================')

    # set some file paths and names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    nr_file = fasta_lib.get_file(default,
                                 [('Zipped files', '*.gz'),
                                  ('Fasta files', '*.fasta')],
                                 title_string='Select an NCBI nr database')
    if nr_file == '':
        sys.exit()  # cancel button response
    ncbi_folder, nr_name = os.path.split(nr_file)
    nr_db = os.path.splitext(nr_name)[0]

    # create a log file to mirror screen output
    log_obj = open(os.path.join(ncbi_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]   # print targets: console (None) and log file
    fasta_lib.time_stamp_logfile('\n>>> starting: nr_extract_taxon.py', log_obj)

    # get the saved gi number to taxon number {int:int} dictionary
    acc_to_taxon = fasta_lib.AccToTaxon(ncbi_folder)
    acc_to_taxon.create_or_load(ncbi_folder)

    # print the list of taxon numbers that will be extracted
    # NOTE(review): original_dict is never read again in this function —
    # possibly a leftover; confirm before removing.
    original_dict = taxon_dict
    taxon_list = list(taxon_dict.items())
    taxon_list.sort()
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' %
                  (i + 1, t[0], t[1]), file=obj)

    # expand any group taxon numbers.  NOTE: if a taxon number appears in
    # "nr_fasta_analyze.txt", it will not be expanded.  Either delete the
    # line in "nr_fasta_analyze.txt", or make an expanded "taxon_dict" by hand.
    if EXPAND_GROUPS:
        fasta_lib.expand_species(ncbi_folder, 'nr', taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT,
                                 REF_SEQ_ONLY)

    # open the output databases, initialize counters, etc.
    taxon_files = {}    # name tag -> output path, then (below) open file object
    taxon_count = {}    # taxon number -> proteins matched
    name_count = {}     # name tag -> proteins written
    for taxon, name in taxon_dict.items():
        fname = nr_db + '_' + name + '.fasta'
        fname = os.path.join(ncbi_folder, fname)
        taxon_files[name] = fname
        name_count[name] = 0
        taxon_count[taxon] = 0

    # open the output filenames
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # loop over all proteins in nr
    x = fasta_lib.FastaReader(nr_file)
    prot = fasta_lib.Protein()
    prot_read = 0
    not_found = 0   # accessions with no known taxon number
    skipped = 0     # accessions whose taxon is not in taxon_dict
    for obj in write:
        print('...reading %s and extracting entries...' % (nr_name, ), file=obj)
    # checking for errors slows down program by about a factor of 3 or 4
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 1000000) == 0:
            print('......(%s proteins read...)' %
                  ("{0:,d}".format(prot_read), ))
        written = {}   # taxon number -> deep-copied Protein to write
        line = prot.accession + ' ' + prot.description
        prot.new_desc = ''
        # extract the gi numbers for each header (nr headers are chr(1)-separated)
        for header in line.split(chr(1)):
            accession_with_version = header.split()[0]
            accession = accession_with_version.split('.')[0]
            if REF_SEQ_ONLY and '_' not in accession:
                continue   # skip proteins without RefSeq entries
            taxon = acc_to_taxon.get(accession, False)
            # see if taxon number for this gi is in our desired list
            if taxon:
                if taxon_dict.get(taxon, False):
                    if written.get(taxon, False):
                        # taxon number already seen: append header to its copy.
                        # NOTE(review): this rebinds the loop's "prot" to the
                        # stored copy, so later headers of the same entry and
                        # the next readNextProtein call operate on that copy —
                        # order-sensitive; do not reorder these statements.
                        prot = written[taxon]
                        prot.description = prot.description + chr(1) + header
                        written[taxon] = copy.deepcopy(prot)
                    else:
                        # first time taxon number seen
                        name = taxon_dict[taxon]
                        prot.accession = header.split()[0]
                        prot.description = header[len(prot.accession) + 1:]
                        prot.description = prot.description.rstrip()
                        taxon_count[taxon] += 1
                        name_count[name] += 1
                        written[taxon] = copy.deepcopy(prot)
                else:
                    skipped += 1
            else:
                not_found += 1
                continue
        # write a protein sequence for each taxon number it was matched to
        for taxon in written.keys():
            name = taxon_dict[taxon]
            f = taxon_files[name]
            prot = written[taxon]
            prot.new_desc = prot.description
            prot.new_acc = prot.accession
            if CLEAN_ACCESSIONS:
                prot.parseNCBI(REF_SEQ_ONLY)
            prot.printProtein(f)

    # print out number of matches and close files
    for obj in write:
        print('...%s proteins in %s' %
              ("{0:,d}".format(prot_read), nr_name), file=obj)
        print('...%s accessions did not have known taxon numbers' %
              ("{0:,d}".format(not_found), ), file=obj)
        print('...%s accessions were skipped (not in our taxon list)' %
              ("{0:,d}".format(skipped), ), file=obj)
        if REF_SEQ_ONLY:
            print('...Extracted sequences are RefSeq Only!!!', file=obj)
        if VERBOSE:
            numbers = list(taxon_count.keys())
            numbers.sort()
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print('......(%s) taxon number %s had %s proteins' %
                          (i + 1, number,
                           "{0:,d}".format(taxon_count[number])), file=obj)
        print('...output file summaries...', file=obj)
        names = list(taxon_files.keys())
        names.sort()
        for i, name in enumerate(names):
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]),
                   nr_db + '_' + name + '.fasta'), file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: nr_extract_taxon.py', log_obj)
    log_obj.close()
    for f in taxon_files.values():
        f.close()
    return
def main(taxon_dict):
    """Main program to extract entries by taxon ID from uniprot databases.

    taxon_dict: maps taxonomy number (int) -> output-file name tag (str).

    Extraction is from a single downloaded Sprot or Trembl database.
    """
    print('============================================================================')
    print(' uniprot_extract_from_one.py, v.1.1.0, written by Phil Wilmarth, OHSU, 2017 ')
    print('============================================================================')

    # set some file paths and names
    default = r'C:\Xcalibur\database'
    if not os.path.exists(default):
        default = os.getcwd()
    uniprot_file = fasta_lib.get_file(
        default, [('Zipped files', '*.gz'), ('Fasta files', '*.fasta')],
        title_string='Select an Sprot or Trembl database')
    if uniprot_file == '':
        sys.exit()  # cancel button response
    uniprot_folder, uniprot_name = os.path.split(uniprot_file)
    # database names look like "uniprot_sprot_<version>.fasta.gz";
    # pull out the release version and the db type ("sprot"/"trembl")
    version = uniprot_name.split('_')[-1]
    version = version.replace('.fasta.gz', '')
    uniprot_db = uniprot_name.split('_')[1]

    # create a log file to mirror screen output
    log_obj = open(os.path.join(uniprot_folder, 'fasta_utilities.log'), 'a')
    write = [None, log_obj]   # print targets: console (None) and log file
    fasta_lib.time_stamp_logfile('\n>>> starting: uniprot_extract_from_one.py',
                                 log_obj)

    # make the smaller uniprot dictionaries (sci_to_taxon, id_to_taxon)
    (sci_to_taxon, id_to_taxon) = fasta_lib.make_uniprot_to_taxon(uniprot_folder)
    # make the more complete dictionary
    name_to_taxon = fasta_lib.make_all_names_to_taxon(uniprot_folder)

    # print the list of taxon numbers that will be extracted
    taxon_list = list(taxon_dict.items())
    taxon_list.sort()
    for obj in write:
        print('...extracting these taxon numbers:', file=obj)
        for i, t in enumerate(taxon_list):
            print('......(%s) taxon %s to file tagged with "%s"' %
                  (i + 1, t[0], t[1]), file=obj)

    # expand any group taxon numbers
    # NOTE: Any taxon numbers present in analysis text file will not be expanded.
    if EXPAND_GROUPS:
        fasta_lib.expand_species(uniprot_folder, uniprot_db, taxon_dict,
                                 MIN_SEQUENCE_COUNT, MIN_GROUP_SEQ_COUNT)

    # inititalize dictionaries and counters
    taxon_files, taxon_count, name_count = {}, {}, {}
    for taxon, name in taxon_dict.items():
        fname = uniprot_db + '_' + version + '_' + name + '.fasta'
        fname = os.path.join(uniprot_folder, fname)
        taxon_files[name] = fname
        taxon_count[taxon] = 0
        name_count[name] = 0

    # open the output filenames
    for name in taxon_files.keys():
        taxon_files[name] = open(taxon_files[name], 'w')

    # create a FastaReader object, initialize counters, and start reading
    x = fasta_lib.FastaReader(uniprot_file)
    prot = fasta_lib.Protein()
    prot_read = 0
    not_found = 0     # proteins with unknown taxon numbers
    duplicates = {}   # species name -> (taxon, taxon2) when the two maps disagree
    for obj in write:
        print('...reading %s and extracting entries...' %
              (uniprot_name, ), file=obj)
    # checking for errors in sequences slows program execution, use as needed
    while x.readNextProtein(prot, check_for_errs=False):
        prot_read += 1
        if (prot_read % 500000) == 0:
            print('......(%s proteins read...)' %
                  ("{0:,d}".format(prot_read), ))
        # parse the species name out of the UniProt header line
        (spec_id, spec_name) = fasta_lib.uniprot_parse_line(
            prot.accession + ' ' + prot.description)
        taxon = sci_to_taxon.get(spec_name, 0)    # first choice mapping
        taxon2 = name_to_taxon.get(spec_name, 0)  # alternative mapping
        if taxon == 0:        # first choice not present
            if taxon2 == 0:
                not_found += 1
            else:
                taxon = taxon2   # use second choice
        else:
            if (taxon != taxon2) and (taxon2 > 0):
                # keep track of multiple taxon numbers
                duplicates[spec_name] = (taxon, taxon2)
        if taxon_dict.get(taxon, False):
            if CLEAN_ACCESSIONS:
                prot.parseUniProt()
            # taxon number matches, so write the protein to the respective file
            name = taxon_dict[taxon]
            name_count[name] += 1
            taxon_count[taxon] += 1
            f = taxon_files[name]
            prot.printProtein(f)

    # close the extracted database files
    for f in taxon_files.values():
        f.close()

    # print list of mis-matching taxon number warnings
    if MISMATCHES:
        for i, (name, pair) in enumerate(duplicates.items()):
            for obj in write:
                print('......(%s) WARNING: %s and %s map to "%s"' %
                      (i + 1, pair[0], pair[1], name), file=obj)

    # print out the summary stuff
    for obj in write:
        print('...%s protein entries in %s' %
              ("{0:,d}".format(prot_read), uniprot_name), file=obj)
        print('...%s proteins had unknown taxon numbers' %
              (not_found, ), file=obj)
        if VERBOSE:
            numbers = list(taxon_count.keys())
            numbers.sort()
            for i, number in enumerate(numbers):
                if taxon_count[number] > 0:
                    print('......(%s) taxon %s had %s proteins' %
                          (i + 1, number,
                           "{0:,d}".format(taxon_count[number])), file=obj)
        print('...output file summaries...', file=obj)
        names = list(taxon_files.keys())
        names.sort()
        for i, name in enumerate(names):
            print('......(%s) %s proteins extracted and written to %s' %
                  (i + 1, "{0:,d}".format(name_count[name]),
                   uniprot_db + '_' + version + '_' + name + '.fasta'), file=obj)
    fasta_lib.time_stamp_logfile('>>> ending: uniprot_extract_from_one.py',
                                 log_obj)
    log_obj.close()
    return