def test_length_load(self):
    """Exercise SSfasta.length_srs on the ATP5MC1 fasta, with and without an id filter.

    The unfiltered call must report 29 records, including lengths 136 and 138
    and record 9544_0:0008ab. Restricting the call to three record ids must
    return exactly three entries, without length 138 or record 9544_0:0008ab.
    """
    fasta_path = "{0}/ODB/{1}.fasta".format(test_data_dir, 'ATP5MC1')

    # No id filter: every record in the file is reported.
    _, all_lens = SSfasta.length_srs(fasta_path)
    all_len_values = all_lens.unique()
    self.assertIn(136, all_len_values)
    self.assertIn(138, all_len_values)
    self.assertIn('9544_0:0008ab', all_lens.index)
    self.assertEqual(len(all_lens), 29)

    # Same file restricted to three record ids.
    keep_ids = ['10090_0:0034c4', '43179_0:00103c', '9606_0:00415a']
    _, kept_lens = SSfasta.length_srs(fasta_path, keep_ids)
    kept_len_values = kept_lens.unique()
    self.assertIn(136, kept_len_values)
    self.assertNotIn(138, kept_len_values)
    self.assertNotIn('9544_0:0008ab', kept_lens.index)
    self.assertEqual(len(kept_lens), 3)
def select_NCBI_record(ODB_fasta_fpath, NCBI_fasta_fpath, taxid_dict,
                       ODB_final_input_df, compare_taxids):
    """Select the best NCBI record by max identity to the OrthoDB records of compare_taxids.

    If the NCBI fasta holds more than one record, all NCBI records are aligned
    together with the accepted OrthoDB records and the NCBI record with minimum
    distance to the compare_taxids records is kept. Otherwise the (zero or one)
    NCBI records are appended unchanged.

    :param ODB_fasta_fpath: Fasta path for ODB records
    :param NCBI_fasta_fpath: Fasta path to NCBI records (should only contain records from one species)
    :param taxid_dict: Maps species names to taxids, used by load_NCBI_fasta_df
    :param ODB_final_input_df: DataFrame containing accepted records from OrthoDB data, returned from
    ODBfilter.process_input
    :param (collection) compare_taxids: tax_ids against which distance should be calculated to determine
    minimum distance NCBI record
    :return: combined_df, DataFrame containing rows from ODB_final_input_df followed by the minimum
    distance row from NCBI_fasta_fpath; index is named "record_id"
    """
    ncbi_df = load_NCBI_fasta_df(NCBI_fasta_fpath, taxid_dict)

    if len(ncbi_df) > 1:
        # Align all unfiltered NCBI records against ODB_final_input records
        combined_unaln_fpath = "tmp/ODB_NCBI_unaln.fasta"
        combined_aln_fpath = "tmp/ODB_NCBI_aln.fasta"
        unaln_generator = SSfasta.ODB_NCBI_generator(
            ODB_fasta_fpath, NCBI_fasta_fpath,
            odb_subset=ODB_final_input_df.index)
        SeqIO.write(unaln_generator, combined_unaln_fpath, "fasta")

        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        combined_df = pd.concat([ODB_final_input_df, ncbi_df], sort=False)
        id_dm, align_srs = SSfasta.construct_id_dm(
            combined_df, combined_unaln_fpath, align_outpath=combined_aln_fpath)

        spec_record_ids = ncbi_df.index
        compare_record_ids = ODB_final_input_df.loc[
            ODB_final_input_df['organism_taxid'].isin(compare_taxids)].index
        md_row, min_dist = min_dist_spec_record(
            id_dm, align_srs.index, spec_record_ids, compare_record_ids,
            combined_df)

        # DataFrame.append accepted a Series as a single row; pd.concat needs
        # a one-row frame, so convert when min_dist_spec_record returns a Series.
        if isinstance(md_row, pd.Series):
            md_row = md_row.to_frame().T.infer_objects()
        selected_ncbi = md_row
    else:
        # Zero or one NCBI record: nothing to choose between.
        selected_ncbi = ncbi_df

    final_combined_df = pd.concat([ODB_final_input_df, selected_ncbi],
                                  sort=False)
    final_combined_df.index.name = "record_id"
    return final_combined_df
def test_filter_infile(self):
    """Check record order written by SSfasta.filter_fasta_infile for ordered=False/True.

    The id lists read back from the two output files are compared as whole
    lists, so a missing, extra or misplaced record fails the test (a
    record-by-record loop would pass vacuously on an empty output file).
    """
    from Bio import SeqIO

    test_fpath = "{0}/ODB/{1}.fasta".format(test_data_dir, 'ATP5MC1')
    ordered_test_ids = ["10090_0:0034c4", "43179_0:00103c", "9606_0:00415a",
                        "10116_0:00386d", "42254_0:001ba2", "9986_0:0033f5"]
    # Expected order when ordered=False (presumably the order in which these
    # records occur in the source fasta).
    unordered_test_ids = [ordered_test_ids[i] for i in [0, 3, 4, 1, 2, 5]]

    unordered_fpath = "{0}/ATP5MC1_unordered.fasta".format(test_tmp_dir)
    ordered_fpath = "{0}/ATP5MC1_ordered.fasta".format(test_tmp_dir)
    SSfasta.filter_fasta_infile(ordered_test_ids, test_fpath,
                                outfile_path=unordered_fpath, ordered=False)
    SSfasta.filter_fasta_infile(ordered_test_ids, test_fpath,
                                outfile_path=ordered_fpath, ordered=True)

    # Check order
    unordered_ids = [record.id for record in SeqIO.parse(unordered_fpath, 'fasta')]
    ordered_ids = [record.id for record in SeqIO.parse(ordered_fpath, 'fasta')]
    self.assertEqual(unordered_ids, unordered_test_ids)
    self.assertEqual(ordered_ids, ordered_test_ids)
def test_ksr_record_select(self):
    """Run known-species record selection for several symbols and check the result size.

    For every symbol except CD151 one record per taxid in ks_tids is expected.
    For CALM1 the selection is repeated with stdout captured to verify that the
    cached manual selection message is printed.
    """
    import contextlib
    import io

    # `import SSfilter.ODBfilter` would bind only the name `SSfilter`; import
    # the submodule by name so the `ODBfilter.` references below resolve here.
    from SSfilter import ODBfilter

    test_symbol_list = ['ATP5MC1', 'CALM1', 'ATPIF1', 'CD151']
    tax_subset = ['10090_0', '43179_0', '9606_0', '10116_0', '42254_0', '9601_0']
    errors_fpath = "cDNAscreen_041020/summary/errors.tsv"
    ks_tids = ['10090_0', '43179_0', '9606_0']
    display_match_data = False
    display_ksr = True
    tmp_manual_selections = "{0}/manual_record_selections.tsv".format(test_tmp_dir)

    with pd.option_context('display.max_columns', None, 'display.max_colwidth', 500):
        for symbol in test_symbol_list:
            tsv_inpath = "{0}/ODB/{1}.tsv".format(test_data_dir, symbol)
            unfiltered_tsv = SSfasta.load_tsv_table(tsv_inpath, tax_subset=tax_subset)
            unfiltered_fasta = "{0}/ODB/{1}.fasta".format(test_data_dir, symbol)
            am_idx, exact_matches = ODBfilter.find_alias_matches(
                symbol, unfiltered_tsv, errors_fpath)
            am_df = unfiltered_tsv.loc[am_idx]
            em_df = ODBfilter.exact_match_df(unfiltered_tsv, exact_matches)
            if display_match_data:
                print("alias_match")
                display(am_df)
                print("exact_match")
                display(em_df)

            final_ksr_df = ODBfilter.select_known_species_records(
                symbol, em_df, am_df, ks_tids, unfiltered_fasta,
                manual_selections_fpath=tmp_manual_selections)
            if display_ksr:
                print("Final known species records: ")
                display(final_ksr_df)

            if symbol == 'CD151':
                # Ensure non alias match/ exact match sequences not present in final ksr
                self.assertNotEqual(len(final_ksr_df), len(ks_tids))
            else:
                self.assertEqual(len(final_ksr_df), len(ks_tids))

            # Test manual selections cache
            if symbol == 'CALM1':
                print("Repeating CALM1 record selection. Should not ask for input (check this yourself)")
                out_buf = io.StringIO()
                print("Checking for cached selection output...")
                with contextlib.redirect_stdout(out_buf):
                    final_ksr_df = ODBfilter.select_known_species_records(
                        symbol, em_df, am_df, ks_tids, unfiltered_fasta,
                        manual_selections_fpath=tmp_manual_selections)
                cached_msg = 'To clear selections, either delete corresponding row in file at ' \
                             '{0}'.format(tmp_manual_selections)
                self.assertIn(cached_msg, out_buf.getvalue())
                print("Cached selection output found.")
def final_ksr_df_QC(gene_symbol, matches, final_ksr_df, ks_taxids, ts_taxid,
                    seq_qc_fpath, seq_fpath, length_warning=False):
    """Log quality checks on final accepted record sequences for species in ks_taxids.

    Entries which fail QC checks will still be run in analysis, but the quality checks for sequence
    consistency logged may serve as warnings by which the user can elect to remove results for those
    gene symbols. The only QC which will prevent analysis is absence of test species tax_id in the
    final accepted records dataframe.

    :param gene_symbol: Gene symbol for which data is being used
    :param matches: List of acceptable exact symbol matches for gene_symbol
    :param final_ksr_df: DataFrame returned by select_known_species_records
    :param ks_taxids: list of tax_ids expected to be in final_ksr_df
    :param ts_taxid: tax_id of the test species; must be present in final_ksr_df
    :param seq_qc_fpath: Log file for quality checks
    :param seq_fpath: Path to fasta with sequences
    :param length_warning: If True, also log records whose length differs from the median record
    length by 10% or more
    :raises SequenceDataError: (code 2) if final_ksr_df has fewer records than ks_taxids and none of
    them belongs to ts_taxid
    :return: N/A
    """
    present_taxids = set(final_ksr_df["organism_taxid"])
    if len(final_ksr_df) < len(ks_taxids):
        if ts_taxid not in present_taxids:
            msg = "No alias matched sequence could be found for test species taxid: {0}".format(
                ts_taxid)
            raise SequenceDataError(2, msg)
        for tax_id in ks_taxids:
            if tax_id not in present_taxids:
                message_txt = "No reference sequence for tax_id: {0}".format(
                    tax_id)
                write_ref_seq_QC(seq_qc_fpath, gene_symbol, message_txt)

    if length_warning:
        # SSfasta.length_srs returns a (sequences, lengths) pair; only the
        # lengths are needed here.
        _, length_srs = SSfasta.length_srs(seq_fpath, final_ksr_df.index)
        median_len = length_srs.median()
        for record_id in final_ksr_df.index:
            id_len = length_srs[record_id]
            if (np.abs(id_len - median_len) / median_len) >= 0.1:
                message_txt = "Record_id {0} has length {1} which is greater than 10% different from the median ({2})".format(
                    record_id, id_len, median_len)
                write_ref_seq_QC(seq_qc_fpath, gene_symbol, message_txt)

    # A pub_gene_id is accepted if any symbol occurs at its end or directly
    # before a ';' separator. Symbols are escaped so that they are matched
    # literally rather than as regex syntax.
    alternatives = []
    for match in matches:
        symbol_pat = re.escape(match.upper())
        alternatives.append(symbol_pat + "$|" + symbol_pat + "[;]")
    symbol_re = re.compile("|".join(alternatives))

    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for record_id, pgid in final_ksr_df["pub_gene_id"].items():
        # str() keeps a missing (non-string) pub_gene_id from crashing the QC;
        # it is reported as a mismatch instead.
        if not symbol_re.search(str(pgid).upper()):
            message_txt = "Record_id {0} has pub_gene_id {1} which doesn't match gene_symbol ({2})".format(
                record_id, pgid, gene_symbol)
            write_ref_seq_QC(seq_qc_fpath, gene_symbol, message_txt)
def test_exact_match_df(self):
    """Check that exact_match_df keeps the ATP5G1/ATP5MC1 symbols and drops LOC100519871."""
    from SSfilter.ODBfilter import exact_match_df

    tsv_path = "{0}/ODB/ATP5MC1.tsv".format(test_data_dir)
    tsv_df = SSfasta.load_tsv_table(tsv_path)

    # Sanity-check the unfiltered table first.
    raw_symbols = tsv_df['pub_gene_id'].unique()
    for expected_symbol in ('ATP5G1;ATP5MC1', 'ATP5G1', 'ATP5MC1'):
        self.assertIn(expected_symbol, raw_symbols)
    self.assertIn('9823_0:003c30', tsv_df.index)
    self.assertIn('LOC100519871', raw_symbols)

    # After filtering, only records matching one of the accepted symbols remain.
    accepted_symbols = ['ATP5G1', 'ATP5MC1']
    matched_df = exact_match_df(tsv_df, accepted_symbols)
    matched_symbols = matched_df['pub_gene_id'].unique()
    for expected_symbol in ('ATP5G1;ATP5MC1', 'ATP5G1', 'ATP5MC1'):
        self.assertIn(expected_symbol, matched_symbols)
    self.assertNotIn('LOC100519871', matched_symbols)
def test_outgroup_selection(self):
    """Run the full OrthoDB selection for IRF2BP2 and expect one record per taxid in tax_subset."""
    # `import SSfilter.ODBfilter` would bind only the name `SSfilter`; import
    # the submodule by name so the `ODBfilter.` references below resolve here.
    from SSfilter import ODBfilter

    tax_subset = ['10090_0', '43179_0', '9606_0', '10116_0', '42254_0', '9601_0']
    symbol = 'IRF2BP2'
    errors_fpath = "{0}/outgroup_errors.tsv".format(test_tmp_dir)
    ks_tids = ['10090_0', '43179_0', '9606_0']

    tsv_inpath = "{0}/ODB/{1}.tsv".format(test_data_dir, symbol)
    unfiltered_tsv = SSfasta.load_tsv_table(tsv_inpath, tax_subset=tax_subset)
    unfiltered_fasta = "{0}/ODB/{1}.fasta".format(test_data_dir, symbol)

    am_idx, exact_matches = ODBfilter.find_alias_matches(symbol, unfiltered_tsv, errors_fpath)
    am_df = unfiltered_tsv.loc[am_idx]
    em_df = ODBfilter.exact_match_df(unfiltered_tsv, exact_matches)
    final_ksr_df = ODBfilter.select_known_species_records(
        symbol, em_df, am_df, ks_tids, unfiltered_fasta)
    final_dict = ODBfilter.select_outgrup_records(
        em_df, am_df, ks_tids, final_ksr_df, unfiltered_fasta)
    final_df = final_dict['final_df']

    # A bare `assert` is stripped under `python -O`; use the unittest
    # assertion so the check always runs and reports both values on failure.
    self.assertEqual(len(final_df), len(tax_subset))
# os.chdir("..") import sys sys.path.append(os.getcwd()) import unittest from SSutility import SSfasta from IPython.display import display import pandas as pd import numpy as np pd.options.display.max_columns = None from SSanalysis import aas, blosum62_bg, blos_df, sim_matrix #Test sequence data test_outdir = "tests/test_data/output/ATP5MC1/" test_msa = "{0}/ATP5MC1_msa.fasta".format(test_outdir) align_df = SSfasta.align_fasta_to_df(test_msa) ODB_df = align_df.drop(index="XP_026242723.1") test_idx = pd.Index(["43179_0:00103c"]) ncbi_idx = pd.Index(["XP_026242723.1"]) class testCalculations(unittest.TestCase): def test_unique_pos(self): from SSanalysis.SSanalysiscalc import find_uniques #Identify uniques from ODB_df for ATP5MC1 filtered = ODB_df sub_freq = 1 uniques = find_uniques(filtered, sub_freq, test_idx, False) for expr in [ 16 in uniques.columns,
def combined_records_processing(config, am_df, em_df, combined_df, symbol,
                                odb_fasta="", ncbi_fasta="",
                                out_unaln_fasta="", out_aln_fasta="",
                                out_tsv_fpath=""):
    """Write the final record set to disk and return it annotated with distance/source info.

    Writes the final input dataset (both OrthoDB and NCBI records) to 1) an unaligned fasta,
    2) an aligned fasta and 3) a records table (tsv) corresponding to the processed DataFrame.

    :param config: configparser object; RUN/RunName and NCBI/NCBITaxID are read
    :param am_df: alias_match DataFrame (see ODBfilter); records of combined_df found in its index
    are treated as OrthoDB records, all others as NCBI records
    :param em_df: exact match DataFrame
    :param combined_df: ODB and NCBI combined record DataFrame as returned by select_NCBI_record
    :param symbol: Gene symbol
    :param odb_fasta,ncbi_fasta: Source fastas for the final dataset sequences. If empty, default to
    the unfiltered OrthoDB and NCBI fasta paths for the run name and symbol
    :param out_unaln_fasta,out_aln_fasta: Output paths for the unaligned/aligned final sequence set.
    If empty, the default output directory paths for symbol are used
    :param out_tsv_fpath: Output path for the records table. If empty, the default output directory
    path for symbol is used
    :return: processed_df: copy of combined_df with added record distance ('dist') and the
    annotations added by annotate_source_and_filter
    """
    run_name = config['RUN']['RunName']
    ncbi_taxid = config['NCBI']['NCBITaxID']

    # Fall back to the run's default layout for every path left empty.
    odb_fasta = odb_fasta or "{0}/input/ODB/{1}.fasta".format(run_name, symbol)
    ncbi_fasta = ncbi_fasta or "{0}/input/NCBI/{1}/{2}.fasta".format(
        run_name, ncbi_taxid, symbol)
    out_unaln_fasta = out_unaln_fasta or "{0}/output/{1}/{1}.fasta".format(
        run_name, symbol)
    out_aln_fasta = out_aln_fasta or "{0}/output/{1}/{1}_msa.fasta".format(
        run_name, symbol)
    out_tsv_fpath = out_tsv_fpath or "{0}/output/{1}/{1}_records.tsv".format(
        run_name, symbol)

    SSdirectory.create_directory("{0}/output/{1}".format(run_name, symbol))
    manual_selections_fpath = "{0}/manual_record_selections.tsv".format(
        run_name)

    # Final unaligned and aligned fasta writing
    record_index = combined_df.index
    in_alias_matches = record_index.isin(am_df.index)
    odb_records_idx = record_index[in_alias_matches]
    ncbi_records_idx = record_index[~in_alias_matches]
    record_generator = SSfasta.ODB_NCBI_generator(
        odb_fasta, ncbi_fasta,
        odb_subset=odb_records_idx,
        ncbi_subset=ncbi_records_idx,
        ordered=True)
    SeqIO.write(record_generator, out_unaln_fasta, 'fasta')

    # Internal distance calculation (also writes the alignment to out_aln_fasta)
    id_dm, aln_srs = SSfasta.construct_id_dm(
        combined_df, out_unaln_fasta, align_outpath=out_aln_fasta)
    avg_dists = SSfasta.avg_dist_srs(record_index, id_dm)
    processed_df = combined_df.copy()
    processed_df.loc[:, 'dist'] = avg_dists

    # Add source_db and selection_type information, drop redundant columns
    # (default is level_taxid, pub_og_id).
    processed_df = annotate_source_and_filter(
        config, symbol, processed_df, am_df, em_df, manual_selections_fpath)

    # Write records table to file
    processed_df.to_csv(out_tsv_fpath, sep='\t')
    return processed_df
def annotate_source_and_filter(
        config,
        symbol,
        combined_df,
        am_df,
        em_df,
        manual_selections_fpath="tmp/manual_record_selections.tsv",
        drop_cols=['pub_og_id', 'level_taxid']):
    """Populate db_source and selection_type in a copy of combined_df, reorder columns, return it.

    :param config: configparser object; RUN/RunName and NCBI/NCBITaxID are read to locate the
    unfiltered NCBI fasta for symbol
    :param symbol: Gene symbol
    :param combined_df: DataFrame containing final record set from OrthoDB and NCBI (not modified)
    :param am_df: alias matched OrthoDB records; combined_df records found in its index are labelled
    "OrthoDB", all others "NCBI"
    :param em_df: exact symbol matched OrthoDB records
    :param manual_selections_fpath: tsv file containing records which were manually selected. If file
    exists at provided path, changes selection_type value in corresponding record row
    :param (array-like) drop_cols: If provided, drops columns from processed DataFrame. Used currently
    to filter some redundant OrthoDB tsv information. The default list is only read, never mutated.
    :return: Returns edited copy of combined_df
    """
    # Leave original dataframe untouched
    combined_df = combined_df.copy()
    run_name, ncbi_taxid = config['RUN']['RunName'], config['NCBI'][
        'NCBITaxID']
    selection_col = 'selection_type'

    is_odb_record = combined_df.index.isin(am_df.index)
    odb_records_idx = combined_df.index[is_odb_record]
    ncbi_records_idx = combined_df.index[~is_odb_record]
    combined_df.loc[odb_records_idx, 'db_source'] = "OrthoDB"
    combined_df.loc[ncbi_records_idx, 'db_source'] = "NCBI"

    # NCBI selection type depends on how many records the unfiltered NCBI
    # input held for this symbol.
    unfiltered_NCBI_fasta = "{0}/input/NCBI/{1}/{2}.fasta".format(
        run_name, ncbi_taxid, symbol)
    unf_ncbi_srs = SSfasta.fasta_to_srs(unfiltered_NCBI_fasta)
    if len(unf_ncbi_srs) > 1:
        ncbi_filt = "NCBI min dist"
    else:
        ncbi_filt = "NCBI single record"
    combined_df.loc[ncbi_records_idx, selection_col] = ncbi_filt

    # Records per species, counted once instead of re-filtering em_df/am_df
    # for every row.
    em_counts = em_df['organism_taxid'].value_counts()
    am_counts = am_df['organism_taxid'].value_counts()
    odb_taxids = combined_df.loc[odb_records_idx, 'organism_taxid']
    for record_id, taxid in odb_taxids.items():
        n_em = em_counts.get(taxid, 0)
        n_am = am_counts.get(taxid, 0)
        if n_em == 0:
            if n_am == 1:
                selection = "alias match single record"
            else:
                selection = "alias match min dist"
        elif n_em == 1:
            selection = "symbol match single record"
        else:
            selection = "symbol match min dist"
        combined_df.loc[record_id, selection_col] = selection

    # Read manual selection information, fix selection_type if exists for symbol
    if os.path.exists(manual_selections_fpath):
        manual_selections_df = pd.read_csv(manual_selections_fpath,
                                           sep='\t',
                                           dtype=str,
                                           index_col='gene_symbol')
        if symbol in manual_selections_df.index:
            # .loc[[symbol], ...] always yields a Series, even when the symbol
            # appears on several rows of the selections file.
            selected_ids = manual_selections_df.loc[[symbol], 'record_id']
            # Only touch records that are really in the final set: assigning
            # through .loc with an unknown label would silently append a new,
            # otherwise empty row.
            present_ids = combined_df.index[combined_df.index.isin(selected_ids)]
            combined_df.loc[present_ids, selection_col] = 'manual selection'

    # Drop columns from drop_cols
    if drop_cols:
        combined_df.drop(columns=drop_cols, inplace=True)

    # Reorder columns so the annotation columns come last, with seq at the
    # very end. Labels missing from the frame are skipped.
    reorder_labels = ['db_source', 'selection_type', 'length', 'dist', 'seq']
    trailing_cols = [
        label for label in reorder_labels if label in combined_df.columns
    ]
    leading_cols = [
        col for col in combined_df.columns if col not in trailing_cols
    ]
    combined_df = combined_df[leading_cols + trailing_cols]
    return combined_df
def process_ODB_input(symbol, config, tax_subset):
    """Build the final OrthoDB input record DataFrame for one gene symbol.

    :param symbol: Gene symbol. Used to find appropriate ODB input files (fasta/ tsv)
    :param config: Contains run info (RUN/RunName, RUN/QCFileName, RUN/ErrorsFileName and
    ODB/ODBTestTaxID are read)
    :param tax_subset: Subset of IDs from species list file, used to limit analyzed sequences to only
    taxids present in tax_subset
    :raises SequenceDataError: re-raised after being logged with write_errors
    :raises ValueError: re-raised after printing the symbol and displaying the unfiltered table
    :return (dictionary) results: Contains final_df, em_df, am_df.
    final_df: Final ODB input record dataframe. Contains columns from tsv_files (indexed on
    int_prot_id OrthoDB internal record IDs), as well as record length and sequence information.
    em_df, am_df as returned by find_alias_matches and exact_match_df
    """
    run_section, odb_section = config['RUN'], config['ODB']
    run_name = run_section['RunName']
    test_tid = odb_section['ODBTestTaxID']

    # Raw OrthoDB inputs for this symbol
    raw_tsv_fpath = "{0}/input/ODB/{1}.tsv".format(run_name, symbol)
    raw_fa_fpath = "{0}/input/ODB/{1}.fasta".format(run_name, symbol)

    # Run-level log files
    seq_qc_fname = config['RUN']['QCFileName']
    error_fname = config['RUN']['ErrorsFileName']
    seq_qc_fpath = "{0}/{1}".format(run_name, seq_qc_fname)
    errors_fpath = "{0}/{1}".format(run_name, error_fname)
    manual_selections_fpath = "{0}/manual_record_selections.tsv".format(
        run_name)

    # NOTE(review): the known-species list is fixed here while the QC step is
    # given the configured test_tid - confirm they are meant to agree.
    ks_taxids = ['10090_0', '43179_0', '9606_0']

    unfiltered_tsv = SSfasta.load_tsv_table(raw_tsv_fpath,
                                            tax_subset=tax_subset)
    try:
        # Filter by alias matches, exact pub_gene_id matches
        am_ids, exact_matches = find_alias_matches(symbol, unfiltered_tsv,
                                                   errors_fpath)
        am_df = unfiltered_tsv.loc[am_ids]
        em_df = exact_match_df(unfiltered_tsv, exact_matches)

        # Known-species records first, QC them, then add outgroup records
        final_ksr_df = select_known_species_records(
            symbol,
            em_df,
            am_df,
            ks_taxids,
            raw_fa_fpath,
            manual_selections_fpath=manual_selections_fpath)
        final_ksr_df_QC(symbol, exact_matches, final_ksr_df, ks_taxids,
                        test_tid, seq_qc_fpath, raw_fa_fpath)
        outgroup_result = select_outgrup_records(em_df, am_df, ks_taxids,
                                                 final_ksr_df, raw_fa_fpath)
        final_input_df = outgroup_result['final_df']

        # Attach sequence and length columns for the selected records
        seq_srs, length_srs = SSfasta.length_srs(raw_fa_fpath,
                                                 final_input_df.index)
        final_input_df['length'] = length_srs
        final_input_df['seq'] = seq_srs

        results = {
            'final_df': final_input_df,
            'em_df': em_df,
            'am_df': am_df,
        }
    except SequenceDataError as sde:
        # Log errors, raise error for handling in calling function
        write_errors(errors_fpath, symbol, sde)
        raise sde
    except ValueError as e:
        print("=====")
        print(symbol)
        display(unfiltered_tsv)
        raise e
    return results
def select_outgrup_records(em_df, am_df, ks_taxids, final_ksr_df, seqs_fpath,
                           provide_dist_srs=False, print_skips=False):
    """Select records for remaining OrthoDB outgroup species in analysis that are not in ks_taxids.

    Selection is based on maximum identity to accepted records in final_ksr_df (ie accepted
    human/mouse/13LGS); best (max identity) alias-matched records for a species will still be dropped
    if they do not meet an identity threshold, currently set to 1.5 * (average distance of sequences
    in final_ksr_df against each other).

    :param (DataFrame) em_df: DataFrame of records with exact pub_gene_id match to accepted gene
    symbols (not used by the selection itself)
    :param (DataFrame) am_df: DataFrame of records matching aliases for symbol
    :param (array-like) ks_taxids: list of taxonomy ids to use as accepted species records
    :param final_ksr_df: DataFrame of accepted records from well-annotated species (max one per
    species).
    :param seqs_fpath: Fasta file path containing at least all records in am_df. Can be safely set to
    unfiltered fasta input, records will be automatically filtered down appropriately using am_df.
    :param (boolean) provide_dist_srs: If true, calculates internal average distances of each record
    against rest of input records, maps to a Series indexed on record_id, and stores in returned
    final_dict.
    :param (boolean) print_skips: If true, prints a message for every species whose best record
    exceeds the distance threshold
    :return final_dict: Dictionary mapping 'final_df' to final_df (DataFrame containing all selected
    OrthoDB records with final_ksr_df records first) and optionally 'dist_srs' to a Series of average
    distances of each record against rest of input set
    """
    am_taxid_col = am_df["organism_taxid"]
    am_non_ksr_taxids = [
        tax_id for tax_id in am_taxid_col.unique() if tax_id not in ks_taxids
    ]

    # Distance calculations for final set of known species records - check internal identity values
    # Set identity threshold - other species sequences above this value will not be included
    am_dm_fpath = "tmp/am_dm_ka.fasta"
    am_id_dm, am_align_srs = SSfasta.construct_id_dm(am_df, seqs_fpath,
                                                     am_dm_fpath)
    am_record_idx = am_align_srs.index
    ksr_record_idx = final_ksr_df.index
    ksr_pos = [
        am_record_idx.get_loc(record_id) for record_id in ksr_record_idx
    ]
    n_ksr = len(ksr_pos)

    # ksr_sub_dm: n_ksr x n_ksr distance matrix consisting of values of ksr records against each other
    ksr_sub_dm = am_id_dm[:, ksr_pos]
    ksr_sub_dm = ksr_sub_dm[ksr_pos, :]

    # Ignore diagonal (0 values for record identity against itself); set identity threshold which is
    # used to exclude dissimilar records.
    # NOTE(review): with a single known-species record this divides by zero;
    # presumably the threshold then becomes NaN and every outgroup record is
    # skipped - confirm that this is the intended behaviour.
    non_diagonal_avg = ksr_sub_dm.sum(axis=0) / (n_ksr - 1)
    identity_threshold = np.mean(non_diagonal_avg) * 1.5

    # Add in min dist records for other outgroup species (am_non_ksr_taxids) if they meet the
    # identity threshold. Frames are collected and concatenated once
    # (DataFrame.append was removed in pandas 2.0).
    selected_frames = [final_ksr_df.copy()]
    for taxid in am_non_ksr_taxids:
        # Pick this species' records through the organism_taxid column. A
        # substring search on the record ids would also pick up records of any
        # species whose taxid merely contains this one (e.g. 9606_0 / 19606_0).
        taxid_record_ids = am_df.index[am_taxid_col == taxid]
        tax_records = am_record_idx[am_record_idx.isin(taxid_record_ids)]
        md_row, md = min_dist_spec_record(am_id_dm, am_record_idx,
                                          tax_records, ksr_record_idx, am_df)
        if md <= identity_threshold:
            if isinstance(md_row, pd.Series):
                # A Series stands for one record; turn it into a one-row
                # frame, as DataFrame.append used to do internally.
                md_row = (md_row.to_frame().T.infer_objects().rename_axis(
                    final_ksr_df.index.names))
            selected_frames.append(md_row)
        elif print_skips:
            print(
                "Min dist record for tax_id {0} does not meet distance threshold {1}"
                .format(taxid, identity_threshold))
            print("Skipping records for this species.")
    final_df = pd.concat(selected_frames, sort=False)

    final_dict = {}
    final_dict['final_df'] = final_df
    if provide_dist_srs:
        # Reorder and filter distmat down to final_df records order, calculate non-diagonal avg
        # distances
        dm_pos = [
            am_record_idx.get_loc(final_idx) for final_idx in final_df.index
        ]
        final_ordered_dm = am_id_dm[dm_pos, :]
        final_ordered_dm = final_ordered_dm[:, dm_pos]
        dist_srs = SSfasta.avg_dist_srs(final_df.index, final_ordered_dm)
        final_dict['dist_srs'] = dist_srs
    return final_dict
def _stack_records(records, columns):
    """Concatenate selected records in order; return an empty frame with *columns* if there are none.

    Each element of *records* is either a DataFrame of rows or a Series
    standing for a single record (converted to a one-row frame).
    """
    frames = []
    for record in records:
        if isinstance(record, pd.Series):
            record = record.to_frame().T.infer_objects()
        if not record.empty:
            frames.append(record)
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, sort=False)


def select_known_species_records(
        gene_symbol,
        em_df,
        am_df,
        ks_taxids,
        ks_refseqs_fpath,
        manual_selections_fpath='tmp/manual_record_selections.tsv'):
    """Return a dataframe of at most one representative record per species in ks_taxids.

    ks_taxids will by default be set to include well-annotated species (human/mouse) and the test
    species from the config file (by default 13LGS). Representative sequences are interpreted as being
    the only available record from a species or the sequence most similar (max identity) to other
    ks_taxids rep. sequences if a species has multiple records. Sequences which are not present in
    am_df (ie no alias matches) will not be returned even if present for a species in the unfiltered
    input. Sequences will be searched for first in em_df but in am_df if no records for that species
    are in em_df. If em_df or am_df have exactly one record per species, those records will be assumed
    to be correct and no distance calculations or filtering will be done. Returned DataFrame can
    possibly have no records present for test_species, in which case the function calling
    select_known_species_records must check for test_species record presence.

    :param gene_symbol: String HGNC identifier for gene of input data
    :param (DataFrame) em_df: DataFrame of exact symbol matched records
    :param (DataFrame) am_df: DataFrame of alias matched records
    :param (array-like) ks_taxids: Taxonomy IDs for well-annotated species and test species (human/
    mouse/ 13LGS)
    :param ks_refseqs_fpath: Fasta file path for alias-match filtered OrthoDB input
    :param manual_selections_fpath: tsv path passed on to the manual selection prompt, which is used
    when no species has exactly one candidate record
    :raises SequenceDataError: (code 1) if neither em_df nor am_df holds a record for any ks_taxids
    species
    :return: DataFrame of selected records, in ks_taxids order
    """
    # Filter em_df and am_df down to taxonomy IDs in ks_taxids
    ksr_am_df = am_df.loc[am_df["organism_taxid"].isin(ks_taxids), :]
    ksr_em_df = em_df.loc[em_df["organism_taxid"].isin(ks_taxids), :]
    am_taxid_uniques = ksr_am_df["organism_taxid"].unique()
    em_taxid_uniques = ksr_em_df["organism_taxid"].unique()
    result_columns = ksr_em_df.columns
    n_ks = len(ks_taxids)

    # If exactly one record from each ks_taxids species (exact matches first,
    # then alias matches), reorder in order of ks_taxids and return. Saves
    # extra alignment steps.
    for candidate_df, candidate_taxids in ((ksr_em_df, em_taxid_uniques),
                                           (ksr_am_df, am_taxid_uniques)):
        if len(candidate_df) == n_ks and len(candidate_taxids) == n_ks:
            return _stack_records([
                candidate_df.loc[candidate_df["organism_taxid"] == tax_id, :]
                for tax_id in ks_taxids
            ], result_columns)

    # Set selection df to be the smallest available set for which at least one record present from
    # ks_taxids species
    if ksr_em_df.empty:
        if ksr_am_df.empty:
            raise SequenceDataError(
                1,
                "No GeneCards alias matched sequence records for human/mouse/test species"
            )
        selection_df = ksr_am_df
    else:
        selection_df = ksr_em_df

    # Collect records of species that have exactly one record in selection_df
    single_records = []
    for tax_id in ks_taxids:
        tax_df = selection_df.loc[selection_df['organism_taxid'] == tax_id, :]
        if len(tax_df) == 1:
            single_records.append(tax_df)

    if not single_records:
        # If no species have single record, take manual input (or read from cached selections if
        # previously entered), use selected record as seed input for determining best records from
        # other species.
        selection_fapath = 'tmp/filtered_selection_intput.fasta'
        SSfasta.filter_fasta_infile(selection_df.index, ks_refseqs_fpath,
                                    selection_fapath)
        display_df = selection_df.copy().drop(
            columns=['pub_og_id', 'og_name', 'level_taxid'])
        display_df.loc[:, 'seq'] = SSfasta.fasta_to_srs(selection_fapath)
        selection_row = __parse_manual_selection_input(
            gene_symbol, selection_df, display_df, manual_selections_fpath)
        single_records.append(selection_row)

    single_avail_ksr = _stack_records(single_records, result_columns)
    sa_record_ids = single_avail_ksr.index
    sa_taxid_uniques = single_avail_ksr['organism_taxid'].unique()
    am_id_dm, am_align_srs = SSfasta.construct_id_dm(ksr_am_df,
                                                     ks_refseqs_fpath,
                                                     ordered=False)

    final_records = []
    for ks_id in ks_taxids:
        if ks_id in sa_taxid_uniques:
            final_records.append(single_avail_ksr.loc[
                single_avail_ksr['organism_taxid'] == ks_id, :])
            continue

        # Use em_df or am_df depending on if ks_id is present
        if ks_id in em_taxid_uniques:
            spec_record_ids = ksr_em_df.loc[ksr_em_df['organism_taxid'] ==
                                            ks_id, :].index
        elif ks_id in am_taxid_uniques:
            spec_record_ids = ksr_am_df.loc[ksr_am_df['organism_taxid'] ==
                                            ks_id, :].index
        else:
            # If no records for taxid in either em or am dfs, skip ksr selection
            continue

        # Maximum identity = minimum id_dm value based on AlignIO implementation
        md_row, min_dist = min_dist_spec_record(am_id_dm, am_align_srs.index,
                                                spec_record_ids,
                                                sa_record_ids, ksr_am_df)
        final_records.append(md_row)

    return _stack_records(final_records, result_columns)