def get_db_filename_for_query_from_db_csv(taxon, main_data_dir):
    """Take a database name/species abbreviation or taxon name extracted from
    a query filename, and if there is a corresponding database file name in
    the database directory information CSV file specified by
    DataPaths(main_data_dir), return it. Otherwise, return '-'.
    """
    # Define database file name as not applicable, by default.
    dbfn = '-'

    # Try loading the database info CSV file as a pandas dataframe.
    try:
        df = pd.read_csv(DataPaths(main_data_dir).db_info_csv,
                         encoding='utf-8')
    except Exception:
        # Print an error message and exit.
        print("""Error: Could not load contents of csv file as pandas
dataframe:\n\n\t%s\n\nCheck that the file was saved properly in comma
separated value format (UTF-8 encoding).""" \
        % DataPaths(main_data_dir).db_info_csv)
        print('Quitting script.')
        sys.exit(1)

    # If the given "taxon" name appears in the 'Taxon' column, get the
    # filename for the corresponding database.
    if taxon in df['Taxon'].tolist():
        df.set_index('Taxon', inplace=True)
        dbfn = df.loc[taxon]['Filename']

    # Return database file name, or '-' if none was found.
    return dbfn
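# Example usage (a minimal sketch; the data directory path and CSV contents
# below are hypothetical, but the 'Taxon' and 'Filename' column headers match
# those expected by the function above):
#
#     # db_info.csv:
#     #     Taxon,Filename
#     #     Athaliana,Athaliana.faa
#
#     dbfn = get_db_filename_for_query_from_db_csv('Athaliana', '/path/to/data')
#     # dbfn is now 'Athaliana.faa', or '-' if 'Athaliana' were not listed.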
def get_query_title_from_csv(query_filename, main_data_dir):
    """Take a query filename, look up the corresponding query title in the
    query info CSV file specified by DataPaths(main_data_dir), and return it.
    """
    # Parse query info CSV file.
    query_info_csv_path = DataPaths(main_data_dir).query_info_csv
    df = pd.read_csv(query_info_csv_path, encoding='utf-8')

    # Get query title.
    df.set_index('Filename', inplace=True)
    query_title = df.loc[query_filename]['Query title']

    # If the lookup returned more than one value (a Series rather than a
    # single string), then the filename is duplicated in the CSV file.
    assert not isinstance(query_title, pd.Series), """Could not properly parse
query info file (%s); there may be duplicate entries in this file.""" \
    % query_info_csv_path

    # Check that the query title is a string.
    assert isinstance(query_title, str), """Could not properly identify query
title for query file %s in CSV file %s. Please ensure that the query title
contains some alphabetic characters. Query title identified: %s""" \
    % (query_filename, query_info_csv_path, query_title)

    # Return query title.
    return query_title
def get_subseq_from_fasta_db(db_name, acc, subseq_coord, main_data_dir):
    """Return a SeqRecord object corresponding to the subsequence with the
    given coordinates in the sequence with the given accession in the given
    database file.

    Note: the input subsequence coordinates ('subseq_coord') are start and
    end residue numbers for the subsequence (with inclusive ends), not
    python-style slices.
    """
    # Get database directory from DataPaths(main_data_dir).
    db_dir = DataPaths(main_data_dir).dbdirpath

    # Get database filepath.
    db_path = os.path.join(db_dir, os.path.basename(db_name))
    assert os.path.isfile(db_path), """Path is not a file: %s""" % db_path

    # Use esl-sfetch to retrieve the sequence and write it to a temporary
    # file.
    temp_fa_path = db_path + '_TEMP_FASTA.fa'
    if os.path.isfile(temp_fa_path):
        os.remove(temp_fa_path)
    with open(temp_fa_path, 'a') as o:
        # Get sequence as text.
        subprocess.call(['esl-sfetch', db_path, acc], stdout=o)

    # Parse the temporary fasta file to get a SeqRecord object.
    seq_obj = SeqIO.read(temp_fa_path, 'fasta')
    seq_obj.description = seq_obj.description.rstrip('\"') + ' ' + str(
        subseq_coord) + '\"'

    # ***Re-use code from search_scaffolds to verify validity of input
    # subseq_coord...?
    #...

    # Construct the new subsequence by concatenating the specified segments.
    # (Local import; Seq may already be imported at the module level.)
    from Bio.Seq import Seq
    new_seq = ''
    for subseq in subseq_coord:
        start = subseq[0]
        end = subseq[1]
        # Note: this assumes coordinates with inclusive end positions, hence
        # the '+ 1' in the slice; double-check against the convention used
        # by get_hmmer_hit_seq_coord.
        new_seq = new_seq + str(seq_obj.seq[start:end + 1])
    seq_obj.seq = Seq(new_seq)

    # Remove the temporary fasta file.
    os.remove(temp_fa_path)

    # Return the sequence object.
    return seq_obj
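# Illustration of the coordinate handling above (hypothetical values): for
# subseq_coord of [[0, 4], [10, 14]], the loop concatenates seq[0:5] and
# seq[10:15], i.e., two five-residue segments with inclusive end positions:
#
#     coords = [[0, 4], [10, 14]]
#     segments = [str(seq_obj.seq[s:e + 1]) for s, e in coords]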
def get_query_taxon_from_csv(query_filename, main_data_dir):
    """Take a query filename, look up the corresponding query taxon in the
    query info CSV file specified by DataPaths(main_data_dir), and return it.
    """
    # Parse query info CSV file.
    df = pd.read_csv(DataPaths(main_data_dir).query_info_csv,
                     encoding='utf-8')

    # Return query taxon.
    df.set_index('Filename', inplace=True)
    return df.loc[query_filename]['Query taxon (species if applicable)']
def get_species_for_db_filename(db_filename, main_data_dir):
    """Take a database filename, and return the species name that appears in
    the database info CSV file (may be '-' if not applicable).
    """
    df = pd.read_csv(DataPaths(main_data_dir).db_info_csv, encoding='utf-8')
    df.set_index('Filename', inplace=True)
    sp = df.loc[db_filename]['Species (if applicable)']

    # Check that a single string was retrieved; otherwise the filename is
    # duplicated in the CSV file.
    assert isinstance(sp, str), """There is more than one entry (row) for the
filename %s in the file %s.""" \
    % (db_filename, DataPaths(main_data_dir).db_info_csv)

    # Return the species name from the spreadsheet.
    return sp
def get_species_from_db_csv(taxon, main_data_dir):
    """Take a database name/species abbreviation or taxon name extracted from
    a query filename. If there is a corresponding species name in the
    database directory information CSV file specified by
    DataPaths(main_data_dir), return it. Otherwise, return '-'.
    """
    df = pd.read_csv(DataPaths(main_data_dir).db_info_csv, encoding='utf-8')

    # Species name to return is not applicable by default.
    sp = '-'

    # Compare the given taxon to the file name stem (extension removed) of
    # each database file listed in the CSV file.
    for f in list(df['Filename']):
        if f.rsplit('.', 1)[0] == taxon:
            df.set_index('Filename', inplace=True)
            sp = df.loc[f]['Species (if applicable)']
            break

    return sp
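# For example (hypothetical row): given taxon 'Athaliana' and a CSV row with
# Filename 'Athaliana.faa', the stem comparison above matches, and the value
# in the 'Species (if applicable)' column of that row is returned.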
def hit_sequence(self, hit_rank):
    """Return a SeqRecord object for the full sequence of the subject
    sequence with the given hit rank.
    """
    # Get path for databases directory.
    dbdir_path = DataPaths(self.main_data_dir).dbdirpath

    # Check that the database file exists.
    seq_id = self.hit_id(hit_rank)
    db_path = os.path.join(dbdir_path, self.db_file)
    assert os.path.isfile(db_path), """Given path is not a file: %s""" \
    % db_path

    # Get sequence object.
    seq_obj = get_seqs_from_fasta_db(self.db_file, [seq_id],
                                     self.main_data_dir)[0]

    # Check that it worked.
    assert seq_obj is not None, """Could not retrieve sequence for hit."""

    # Return sequence object.
    return seq_obj
                    stdout=o, stderr=subprocess.STDOUT)


if __name__ == '__main__':
    # Parse command line input.
    command_line_list = sys.argv
    query_faa = str(command_line_list[1])
    target_fna_name = str(command_line_list[2])
    target_seq_id = str(command_line_list[3])
    target_subseq_start = str(command_line_list[4])
    target_subseq_end = str(command_line_list[5])
    genetic_code = str(command_line_list[6])

    # Get filepath for specified query FASTA filename.
    query_dir = DataPaths(main_data_dir).querydirpath
    query_faa_path = os.path.join(query_dir, query_faa)
    assert os.path.isfile(query_faa_path), """Specified query file path is
not a file: %s""" % query_faa_path

    # Get filepath for specified subject FASTA filename.
    db_dir = DataPaths(main_data_dir).dbdirpath
    target_fna_path = os.path.join(db_dir, target_fna_name)
    assert os.path.isfile(target_fna_path), """Specified database file path
is not a file: %s""" % target_fna_path

    # Define path to FASTA file with subsequence of interest from target
    # nucleotide sequence.
    subseq_fasta_path = query_faa.rsplit('.', 1)[0] + '_subject_subseq.fna'

    # Extract relevant subsequence from input target sequence (region
    # identified in
def __init__(self, filepath, main_data_dir):
    # Check that the input file path exists.
    assert os.path.isfile(filepath), """Input filepath does not exist:
%s""" % filepath
    self.filepath = filepath
    self.main_data_dir = main_data_dir

    # Get basic info from file.
    info = get_srch_file_info(filepath)

    # Determine program used to generate input file.
    self.program = info[0]
    assert self.program is not None, """Could not determine the name of the
program that produced the similarity search result file: %s"""\
    % filepath

    # Determine version of program used to generate input file.
    self.version = info[1]
    assert self.version is not None, """Could not determine the version of
the program that produced the similarity search result file: %s"""\
    % filepath

    # Determine format type of input file.
    self.format = info[2]
    assert self.format is not None, """Could not determine the name of the
format type of the similarity search result file: %s"""\
    % filepath
    assert self.format != 'hmmer3-tab', """Does not work with tabular
format."""

    # Check that the file contains results for a search with a single query
    # only.
    if not self.format == 'hhsearch':
        assert len(list(SearchIO.parse(filepath, self.format))) == 1, \
        """More than one search result contained in input file:
%s""" % filepath

    # Define a SearchIO QueryResult object with the hits.
    self.hits = None
    if not self.format == 'hhsearch':
        self.hits = SearchIO.read(self.filepath, self.format)

    # Determine number of hits in input file.
    self.num_hits = None
    if not self.format == 'hhsearch':
        self.num_hits = len(self.hits)
        assert self.num_hits is not None, """Could not determine the number
of hits listed in the similarity search result file: %s"""\
        % filepath

    # Get the database file name.
    self.db_file = None
    if not self.format == 'hhsearch':
        self.db_file = os.path.basename(self.hits.target)
    assert self.db_file is not None, """Could not determine database file
name listed in the similarity search result file: %s"""\
    % filepath

    # Define full path to database file, and check that it is an actual
    # file.
    self.db_file_path = os.path.join(
        DataPaths(self.main_data_dir).dbdirpath, self.db_file)
    assert os.path.isfile(self.db_file_path), \
        """Path to database is not a file: %s""" % self.db_file_path

    # Handle hmmsearch results differently.
    if self.format == 'hmmer3-text':
        # Re-order hits by ascending E-value of the best single domain
        # (otherwise sequences with multiple repetitive domains may be
        # retrieved with lower full-sequence E-values despite low sequence
        # similarity of each of the constituent domains with the query
        # HMM). For this, the minimum of the per-domain E-values for each
        # hit is used as the sort key: for example, a hit whose best domain
        # has E-value 1e-20 will rank above a hit whose three repetitive
        # domains each have E-value 1e-3.
        self.hits = list(self.hits)
        self.hits.sort(key=lambda x: min([y.evalue for y in list(x)]))
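# Example usage (a sketch: the name of the class enclosing the methods above
# and below is not shown in this excerpt, so 'SrchResFile' is an assumed name,
# and the file paths are hypothetical):
#
#     res = SrchResFile('query1_Athaliana.faa_blastp.xml', '/path/to/data')
#     print(res.program, res.version, res.format, res.num_hits)
#     top_hit_seq = res.hit_sequence(0)                  # Full top-hit SeqRecord.
#     subseq, coords = res.hit_subsequence_and_coord(0)  # Aligned region only.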
def hit_subsequence_and_coord(self, hit_rank, max_gap=10000):
    """Return a SeqRecord object (and coordinates) for the subsequence of
    the subject sequence that actually aligns to the query sequence/profile.
    """
    subseq_obj = None
    subseq_coord = None

    # Get hit object with SearchIO parser.
    searchio_hit_obj = None
    hit_num = -1
    for hit in self.hits:
        hit_num += 1
        if hit_num == hit_rank:
            searchio_hit_obj = hit
            break

    # Check that the hit object was retrieved.
    assert searchio_hit_obj is not None, """Could not retrieve Bio.SearchIO
hit object from file."""

    # Process SearchIO hit object to get subsequence object and coordinates,
    # depending on the format.
    if self.format == 'blast-xml':
        # Need to concatenate HSPs in a logical manner, and differently for
        # blastp vs. tblastn.
        if self.program == 'blastp':
            # Use the search_scaffolds module.
            xlist = get_blastp_hit_seq_obj_and_coord(searchio_hit_obj,
                                                     max_gap)
            subseq_obj = xlist[0]
            subseq_coord = xlist[1]
        elif self.program == 'tblastn':
            # Use the search_scaffolds module.
            xlist = get_tblastn_hit_seq_obj_and_coord(searchio_hit_obj,
                                                      max_gap)
            subseq_obj = xlist[0]
            subseq_coord = xlist[1]

    elif self.format == 'hmmer3-text':
        # More straightforward, because the sequences can be taken directly
        # from the database file.

        # Get path for databases directory.
        dbdir_path = DataPaths(self.main_data_dir).dbdirpath

        # Get coordinates.
        subseq_coord = get_hmmer_hit_seq_coord(searchio_hit_obj,
                                               self.db_file,
                                               self.main_data_dir)

        # Get sequence object.
        seq_id = self.hit_id(hit_rank)
        db_path = os.path.join(dbdir_path, self.db_file)
        subseq_obj = get_subseq_from_fasta_db(db_path, seq_id, subseq_coord,
                                              self.main_data_dir)

    # Check that it worked.
    assert subseq_obj is not None, """Could not retrieve sequence for
hit."""
    assert subseq_coord is not None, """Could not retrieve sequence
coordinates for hit."""

    # Return sequence object and coordinates.
    return [subseq_obj, subseq_coord]
def get_seqs_from_fasta_db(db_name, accs, main_data_dir, slow=False):
    """Return a list of SeqRecord objects corresponding to the given
    accessions in the given database file.
    """
    # Get database directory from DataPaths(main_data_dir).
    db_dir = DataPaths(main_data_dir).dbdirpath

    # Get database filepath.
    db_path = os.path.join(db_dir, os.path.basename(db_name))
    assert os.path.isfile(db_path), """Path is not a file: %s""" % db_path

    # Retrieve sequences from the fasta file and write them to a temporary
    # file.
    temp_fa_path = db_path + '_TEMP_FASTA.fa'
    if os.path.isfile(temp_fa_path):
        os.remove(temp_fa_path)

    if not slow:
        # Use esl-sfetch to retrieve the sequences.
        with open(temp_fa_path, 'a') as o:
            for acc in accs:
                # Get sequence as text.
                subprocess.call(['esl-sfetch', db_path, acc], stdout=o)
    else:
        # Retrieve sequences using a slower method that does not make use of
        # esl-sfetch.
        with open(temp_fa_path, 'a') as o:
            for acc in accs:
                # Compile a list of all sequence IDs in the database file.
                with open(db_path) as db_handle:
                    all_seq_ids = [x.id for x in
                                   SeqIO.parse(db_handle, 'fasta')]

                if acc in all_seq_ids:
                    # Write the sequence with an exactly matching ID to the
                    # temporary fasta file.
                    with open(db_path) as db_handle:
                        for x in SeqIO.parse(db_handle, 'fasta'):
                            if x.id == acc:
                                SeqIO.write([x], o, 'fasta')
                                break
                else:
                    # Fall back on prefix matching.
                    accs_that_start_with_acc = []
                    with open(db_path) as db_handle:
                        for x in SeqIO.parse(db_handle, 'fasta'):
                            if x.id.startswith(acc):
                                accs_that_start_with_acc.append(x.id)

                    if len(accs_that_start_with_acc) < 1:
                        print("No accessions start with %s" % acc)
                    elif len(accs_that_start_with_acc) > 1:
                        print("More than one accession starts with %s" % acc)

                    if len(accs_that_start_with_acc) >= 1:
                        # Write the first sequence whose ID starts with the
                        # given accession to the temporary fasta file.
                        with open(db_path) as db_handle:
                            for x in SeqIO.parse(db_handle, 'fasta'):
                                if x.id.startswith(acc):
                                    SeqIO.write([x], o, 'fasta')
                                    break

    # Parse the temporary fasta file to get SeqRecord objects.
    seq_objs = list(SeqIO.parse(temp_fa_path, 'fasta'))

    # Remove the temporary fasta file.
    os.remove(temp_fa_path)

    # Return the list of sequence objects.
    return seq_objs
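# Example usage (a sketch with hypothetical file and accession names; note
# that the fast path shells out to the Easel tool esl-sfetch, which requires
# an SSI index built beforehand with 'esl-sfetch --index <fasta>'):
#
#     seqs = get_seqs_from_fasta_db('Athaliana.faa', ['NP_001234.1'],
#                                   '/path/to/data')
#     seqs_slow = get_seqs_from_fasta_db('Athaliana.faa', ['NP_001234.1'],
#                                        '/path/to/data', slow=True)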
def run_all_searches(query_file_list, db_file_list, outdir,
                     blast_report_evalue_cutoff, blast_max_target_seqs,
                     hmmer_report_evalue_cutoff, hmmer_report_score_cutoff,
                     num_threads_similarity_searching, main_data_dir,
                     query_dir=None):
    """Search with every query file in one given list against every database
    file in another given list, using the appropriate method for each
    query-database pair.
    """
    # Current time.
    start_time = time.time()

    # Get query and database directories from DataPaths(main_data_dir).
    if query_dir is None:
        query_dir = DataPaths(main_data_dir).querydirpath
    db_dir = DataPaths(main_data_dir).dbdirpath

    # Write a query file list to the output directory.
    out_query_file = get_out_query_list_path(outdir)
    with open(out_query_file, 'w') as o:
        for q in query_file_list:
            o.write(q + '\n')

    # Write a database file list to the output directory.
    out_db_file = get_out_db_list_path(outdir)
    with open(out_db_file, 'w') as o:
        for d in db_file_list:
            o.write(d + '\n')

    # Create a log file.
    logfile = os.path.join(outdir, '0_search_log.txt')

    # Loop over each query-database pair.
    with open(logfile, 'w') as o:
        srch_num = 0
        # Loop over query files.
        for q in query_file_list:
            # Loop over database files.
            for d in db_file_list:
                # Check that the database file name does not contain a
                # redundant extension (e.g., 'X.faa.fna'), which may result
                # from a file parsing error.
                assert '.faa' not in d.rsplit('.', 1)[0], """The database
file name %s does not appear to be formatted correctly. This may have
resulted from a file parsing error.""" % d

                # Do not search with alignment (profile) queries in
                # nucleotide data.
                if q.rsplit('.', 1)[1] == 'afaa' \
                        and d.rsplit('.', 1)[1] == 'fna':
                    warning_text = """\nWARNING: Not searching with profile \
query %s in nucleotide data %s\n\n""" % (q, d)
                    print(warning_text)
                    o.write(warning_text)
                else:
                    srch_num += 1

                    # Get name of output file.
                    outfile = search_result_filepath(q, d, outdir)

                    # Get full filepaths, and verify existence.
                    qfull = os.path.join(query_dir, q)
                    assert os.path.isfile(qfull), """Specified query file
path is not a file: %s""" % qfull

                    dfull = os.path.join(db_dir, d)
                    assert os.path.isfile(dfull), """Specified database file
path is not a file: %s\n Please ensure that a FASTA file with the filename
%s exists in the input data.""" % (dfull, d)

                    # Search start time.
                    search_start_time = time.time()

                    # Run the similarity search and get a description of the
                    # search command.
                    command_descr = run_any_search(
                        qfull, dfull, outfile, blast_report_evalue_cutoff,
                        blast_max_target_seqs, hmmer_report_evalue_cutoff,
                        hmmer_report_score_cutoff,
                        num_threads_similarity_searching)

                    # Write description of search to log file.
                    o.write(command_descr + '\n')

                    # Record time elapsed for this search.
                    search_end_time = time.time()
                    search_elapsed = search_end_time - search_start_time
                    o.write('Run time: ' + str(
                        datetime.timedelta(seconds=search_elapsed)) + '\n')

        # Record total time elapsed.
        end_time = time.time()
        elapsed = end_time - start_time
        o.write('Total run time: ' + str(
            datetime.timedelta(seconds=elapsed)) + '\n')
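# Example usage (a sketch; file names and cutoff values are hypothetical):
#
#     run_all_searches(['query1.faa', 'query2.afaa'],
#                      ['Athaliana.faa', 'Scerevisiae.fna'],
#                      'results_dir',
#                      blast_report_evalue_cutoff=0.05,
#                      blast_max_target_seqs=500,
#                      hmmer_report_evalue_cutoff=0.05,
#                      hmmer_report_score_cutoff=10,
#                      num_threads_similarity_searching=4,
#                      main_data_dir='/path/to/data')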
def run_any_search(queryfile, dbfile, outfile, blast_report_evalue_cutoff,
                   blast_max_target_seqs, hmmer_report_evalue_cutoff,
                   hmmer_report_score_cutoff,
                   num_threads_similarity_searching):
    """Run a similarity search using a method appropriate to the input file
    types.

    ***Consider importing options for running external software (e.g., number
    of threads) from DataPaths(main_data_dir).

    ***May need to refactor functions in the module_nhmmer_search module.
    """
    # Determine method to use based on the input file types.
    query_exten = queryfile.rsplit('.', 1)[1]
    dbfile_exten = dbfile.rsplit('.', 1)[1]
    method = determine_search_method(query_exten, dbfile_exten)

    # Get cutoffs for recording hits.
    blast_evalcut = str(blast_report_evalue_cutoff)
    blast_max_target_seqs = str(blast_max_target_seqs)
    hmmer_evalcut = str(hmmer_report_evalue_cutoff)
    hmmer_scorecut = str(hmmer_report_score_cutoff)

    # Get number of threads to use.
    num_threads = str(num_threads_similarity_searching)

    # Construct search command.
    run_command = []
    if method == 'blastp':
        run_command = [method, '-query', queryfile, '-db', dbfile,
                       '-out', outfile, '-num_threads', num_threads,
                       '-outfmt', '5', '-evalue', blast_evalcut,
                       '-max_target_seqs', blast_max_target_seqs]
    elif method == 'tblastn':
        # Set the genetic code. Note: main_data_dir is not a parameter of
        # this function, so this relies on a module-level definition;
        # otherwise the fallback to the standard code ('1') applies.
        try:
            tblastn_ncbi_gen_code = \
                DataPaths(main_data_dir).tblastn_ncbi_gen_code
        except Exception:
            tblastn_ncbi_gen_code = '1'
        run_command = [method, '-query', queryfile, '-db', dbfile,
                       '-out', outfile, '-num_threads', num_threads,
                       '-outfmt', '5', '-evalue', blast_evalcut,
                       '-max_target_seqs', blast_max_target_seqs,
                       '-db_gencode', tblastn_ncbi_gen_code]
    elif method == 'blastx':
        run_command = [method, '-query', queryfile, '-db', dbfile,
                       '-out', outfile, '-num_threads', num_threads,
                       '-outfmt', '5', '-evalue', blast_evalcut,
                       '-max_target_seqs', blast_max_target_seqs]
    elif method == 'blastn':
        run_command = [method, '-query', queryfile, '-db', dbfile,
                       '-out', outfile, '-num_threads', num_threads,
                       '-outfmt', '5', '-evalue', blast_evalcut,
                       '-max_target_seqs', blast_max_target_seqs]
    elif method == 'hmmsearch':
        # Use the HMM file rather than the '.afaa' alignment file.
        actual_queryfile = get_out_hmm_path(queryfile)
        run_command = [method, '-T', hmmer_scorecut, '--cpu', num_threads,
                       '-o', outfile, actual_queryfile, dbfile]
    elif method == 'hmmscan':
        # Note: hmmscan takes the HMM database first, then the sequence
        # file.
        run_command = [method, '-T', hmmer_scorecut, '--cpu', num_threads,
                       '-o', outfile, dbfile, queryfile]
    elif method == 'nhmmer':
        # Use the HMM file rather than the '.afna' alignment file.
        actual_queryfile = get_out_hmm_path(queryfile)
        run_command = [method, '-T', hmmer_scorecut, '--cpu', num_threads,
                       '-o', outfile, actual_queryfile, dbfile]

    # Prepend the program name with a directory path if necessary and
    # specified in the DataPaths module (a work-around for a particular
    # remote server):
    #server_program_dirpath = DataPaths(main_data_dir).server_program_dirpath
    #run_command = [os.path.join(server_program_dirpath, run_command[0])] +\
    #run_command[1:]

    # Run the search command.
    subprocess.call(run_command)

    # Check that the output file is not empty.
    assert os.path.getsize(outfile) != 0, """Error: Search output file is
empty. This may be due to low memory available on this system.\nEmpty
file: %s""" % outfile

    # Return a string with the command used to run the search.
    search_descr = ' '.join(run_command)
    return search_descr
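# For reference, the command list constructed by run_any_search() for a
# blastp search is equivalent to running the following at a shell (file names
# and cutoff values here are hypothetical examples):
#
#     blastp -query query1.faa -db Athaliana.faa \
#         -out query1_Athaliana.faa_blastp.xml -num_threads 4 \
#         -outfmt 5 -evalue 0.05 -max_target_seqs 500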