Esempio n. 1
0
 def test_length_load(self):
     """Check SSfasta.length_srs on the ATP5MC1 fasta, unfiltered and filtered.

     The unfiltered call must report 29 records including lengths 136 and 138
     and record 9544_0:0008ab; restricting to three record ids must leave
     only those three records (length 136 present, 138 absent).
     """
     fasta_path = "{0}/ODB/{1}.fasta".format(test_data_dir,'ATP5MC1')

     # Unfiltered: every record in the fasta file is reported.
     _, all_lens = SSfasta.length_srs(fasta_path)
     all_distinct_lens = all_lens.unique()
     self.assertIn(136, all_distinct_lens)
     self.assertIn(138, all_distinct_lens)
     self.assertIn('9544_0:0008ab', all_lens.index)
     self.assertEqual(len(all_lens), 29)

     # Filtered: only the requested record ids are reported.
     keep_ids = ['10090_0:0034c4','43179_0:00103c','9606_0:00415a']
     _, kept_lens = SSfasta.length_srs(fasta_path,keep_ids)
     kept_distinct_lens = kept_lens.unique()
     self.assertIn(136, kept_distinct_lens)
     self.assertNotIn(138, kept_distinct_lens)
     self.assertNotIn('9544_0:0008ab', kept_lens.index)
     self.assertEqual(len(kept_lens), 3)
Esempio n. 2
0
def select_NCBI_record(ODB_fasta_fpath, NCBI_fasta_fpath, taxid_dict,
                       ODB_final_input_df, compare_taxids):
    """Selects best NCBI record from NCBI fasta fpath by max identity to the OrthoDB records represented by compare_taxids.

    If the NCBI fasta holds at most one record, it is combined with the OrthoDB records directly and no alignment
    is performed. Otherwise all NCBI records are aligned together with the accepted OrthoDB records and only the
    minimum distance NCBI record is kept.

    :param ODB_fasta_fpath: Fasta path for ODB records
    :param NCBI_fasta_fpath: Fasta path to NCBI records (should only contain records from one species)
    :param taxid_dict: Maps species names to taxids, used by load_NCBI_fasta_df
    :param ODB_final_input_df: DataFrame containing accepted records from OrthoDB data, returned from
    ODBfilter.process_input
    :param (collection) compare_taxids: tax_ids against which distance should be calculated to determine minimum
    distance NCBI record
    :return: combined_df, DataFrame containing rows from ODB_final_input and the minimum distance row from
    NCBI_fasta_fpath; its index is named "record_id"
    """
    # Local import keeps this block self-contained; harmless if pandas is already imported at module level.
    import pandas as pd

    def _append_rows(base_df, rows):
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement. A Series is treated as a
        # single record row (named by its record id), which is what DataFrame.append used to do.
        if isinstance(rows, pd.Series):
            rows = rows.to_frame().T.infer_objects()
        return pd.concat([base_df, rows], sort=False)

    ncbi_df = load_NCBI_fasta_df(NCBI_fasta_fpath, taxid_dict)
    if len(ncbi_df) > 1:
        #Align all unfiltered NCBI records against ODB_final_input records
        combined_unaln_fpath, combined_aln_fpath = "tmp/ODB_NCBI_unaln.fasta", "tmp/ODB_NCBI_aln.fasta"
        unaln_generator = SSfasta.ODB_NCBI_generator(
            ODB_fasta_fpath,
            NCBI_fasta_fpath,
            odb_subset=ODB_final_input_df.index)
        SeqIO.write(unaln_generator, combined_unaln_fpath, "fasta")
        combined_df = _append_rows(ODB_final_input_df, ncbi_df)
        id_dm, align_srs = SSfasta.construct_id_dm(
            combined_df,
            combined_unaln_fpath,
            align_outpath=combined_aln_fpath)
        spec_record_ids = ncbi_df.index
        compare_record_ids = ODB_final_input_df.loc[
            ODB_final_input_df['organism_taxid'].isin(compare_taxids)].index
        md_row, min_dist = min_dist_spec_record(id_dm, align_srs.index,
                                                spec_record_ids,
                                                compare_record_ids,
                                                combined_df)
        #Only the minimum distance NCBI record joins the accepted OrthoDB records
        final_combined_df = _append_rows(ODB_final_input_df, md_row)
        final_combined_df.index.name = "record_id"
        return final_combined_df
    else:
        #Zero or one NCBI record: nothing to choose between, so skip alignment
        combined_df = _append_rows(ODB_final_input_df, ncbi_df)
        combined_df.index.name = "record_id"
        return combined_df
Esempio n. 3
0
    def test_filter_infile(self):
        """Check record order written by SSfasta.filter_fasta_infile.

        With ordered=False the output is expected in the order hard-coded
        below (positions 0, 3, 4, 1, 2, 5 of the requested ids); with
        ordered=True it is expected in the requested id order.
        """
        from Bio import SeqIO
        source_fasta = "{0}/ODB/{1}.fasta".format(test_data_dir,'ATP5MC1')
        requested_ids = ["10090_0:0034c4","43179_0:00103c","9606_0:00415a","10116_0:00386d","42254_0:001ba2",
                         "9986_0:0033f5"]

        unordered_fpath = "{0}/ATP5MC1_unordered.fasta".format(test_tmp_dir)
        ordered_fpath = "{0}/ATP5MC1_ordered.fasta".format(test_tmp_dir)
        SSfasta.filter_fasta_infile(requested_ids, source_fasta, outfile_path=unordered_fpath, ordered=False)
        SSfasta.filter_fasta_infile(requested_ids, source_fasta, outfile_path=ordered_fpath, ordered=True)

        # Expected order of the same ids when ordered=False
        unordered_expected = [requested_ids[pos] for pos in (0, 3, 4, 1, 2, 5)]
        expectations = (
            (unordered_fpath, unordered_expected),
            (ordered_fpath, requested_ids),
        )
        for written_fpath, expected_ids in expectations:
            for pos, record in enumerate(SeqIO.parse(written_fpath, 'fasta')):
                self.assertEqual(record.id, expected_ids[pos])
Esempio n. 4
0
    def test_ksr_record_select(self):
        """Exercise ODBfilter.select_known_species_records for several gene symbols.

        For each symbol the alias-match and exact-match tables are built from the test tsv, known species
        records are selected, and the number of selected records is compared against len(ks_tids) (expected
        to differ only for CD151). For CALM1 the selection is repeated with stdout captured to verify that the
        cached-selection message is printed.

        NOTE(review): the first CALM1 selection appears to prompt for manual input (see the message printed
        below); the test is therefore probably interactive on a fresh test_tmp_dir -- confirm.
        """
        # NOTE(review): this import binds the name SSfilter, not ODBfilter; ODBfilter as used below is
        # presumably imported at module level -- confirm.
        import SSfilter.ODBfilter
        test_symbol_list = ['ATP5MC1','CALM1','ATPIF1','CD151']
        tax_subset = ['10090_0','43179_0','9606_0','10116_0','42254_0','9601_0']
        # Errors log path handed to find_alias_matches (hard-coded, outside the test directories).
        errors_fpath =  "cDNAscreen_041020/summary/errors.tsv"
        ks_tids = ['10090_0','43179_0','9606_0']
        # errors_fpath = 'tmp/'
        # Toggles for printing intermediate tables while the test runs.
        display_match_data = False
        display_ksr = True

        # Manual selections are cached in the test tmp directory.
        tmp_manual_selections = "{0}/manual_record_selections.tsv".format(test_tmp_dir)

        with pd.option_context('display.max_columns',None,'display.max_colwidth',500):
            for symbol in test_symbol_list:
                tsv_inpath = "{0}/ODB/{1}.tsv".format(test_data_dir,symbol)
                unfiltered_tsv = SSfasta.load_tsv_table(tsv_inpath,tax_subset=tax_subset)
                unfiltered_fasta = "{0}/ODB/{1}.fasta".format(test_data_dir,symbol)
                am_idx,exact_matches = ODBfilter.find_alias_matches(symbol,unfiltered_tsv,errors_fpath)
                am_df = unfiltered_tsv.loc[am_idx]
                em_df = ODBfilter.exact_match_df(unfiltered_tsv,exact_matches)
                if display_match_data:
                    print("alias_match")
                    display(am_df)
                    print("exact_match")
                    display(em_df)
                final_ksr_df = ODBfilter.select_known_species_records(symbol,em_df,am_df,ks_tids,unfiltered_fasta,
                                                                      manual_selections_fpath=tmp_manual_selections)
                if display_ksr:
                    print("Final known species records: ")
                    display(final_ksr_df)
                if symbol == 'CD151':
                    #Ensure non alias match/ exact match sequences not present in final ksr
                    self.assertFalse(len(final_ksr_df) == len(ks_tids))
                else:
                    self.assertTrue(len(final_ksr_df) == len(ks_tids))

                #Test manual selections cache
                if symbol == 'CALM1':
                    import contextlib
                    import io
                    print("Repeating CALM1 record selection. Should not ask for input (check this yourself)")

                    # Capture stdout of the repeated selection so the cached-selection message can be asserted.
                    out_buf = io.StringIO()
                    print("Checking for cached selection output...")
                    with contextlib.redirect_stdout(out_buf):

                        final_ksr_df = ODBfilter.select_known_species_records(symbol, em_df, am_df, ks_tids,
                                                                              unfiltered_fasta,
                                                                              manual_selections_fpath=tmp_manual_selections)
                        cached_msg = 'To clear selections, either delete corresponding row in file at ' \
                                     '{0}'.format(tmp_manual_selections)
                        self.assertIn(cached_msg,out_buf.getvalue())
                    print("Cached selection output found.")
Esempio n. 5
0
def final_ksr_df_QC(gene_symbol,
                    matches,
                    final_ksr_df,
                    ks_taxids,
                    ts_taxid,
                    seq_qc_fpath,
                    seq_fpath,
                    length_warning=False):
    """Log quality checks on final accepted record sequences for species in ks_taxids.

    Entries which fail QC checks will still be run in analysis, but the quality checks for sequence consistency logged
    may serve as warnings by which the user can elect to remove results for those gene symbols. The only QC which will
    prevent analysis is absence of test species tax_id in the final accepted records dataframe.

    :param gene_symbol: Gene symbol for which data is being used
    :param matches: List of acceptable exact symbol matches for gene_symbol
    :param final_ksr_df: DataFrame returned by select_known_species_records
    :param ks_taxids: list of tax_ids expected to be in final_ksr_df
    :param ts_taxid: tax_id of the test species; must be present in final_ksr_df
    :param seq_qc_fpath: Log file for quality checks
    :param seq_fpath: Path to fasta with sequences
    :param length_warning: If True, also log records whose length differs from the median record length by
    10% or more
    :raises SequenceDataError: (code 2) if final_ksr_df has fewer records than ks_taxids and none for ts_taxid
    :return: N/A
    """
    if len(final_ksr_df) < len(ks_taxids):
        present_taxids = final_ksr_df["organism_taxid"].unique()
        if ts_taxid not in present_taxids:
            msg = "No alias matched sequence could be found for test species taxid: {0}".format(
                ts_taxid)
            raise SequenceDataError(2, msg)
        for tax_id in ks_taxids:
            if tax_id not in present_taxids:
                message_txt = "No reference sequence for tax_id: {0}".format(
                    tax_id)
                write_ref_seq_QC(seq_qc_fpath, gene_symbol, message_txt)
    if length_warning:
        # SSfasta.length_srs returns (sequence Series, length Series); only the lengths are needed here.
        _, length_srs = SSfasta.length_srs(seq_fpath, final_ksr_df.index)
        median_len = length_srs.median()
        for record_id in final_ksr_df.index:
            id_len = length_srs[record_id]
            if (np.abs(id_len - median_len) / median_len) >= 0.1:
                message_txt = "Record_id {0} has length {1} which is greater than 10% different from the median ({2})".format(
                    record_id, id_len, median_len)
                write_ref_seq_QC(seq_qc_fpath, gene_symbol, message_txt)
    # A pub_gene_id passes if any accepted symbol occurs in it directly followed by ';' or the end of the string.
    # Symbols are escaped so regex metacharacters in a symbol are matched literally.
    symbol_pats = [
        "{0}$|{0}[;]".format(re.escape(match.upper())) for match in matches
    ]
    pat = re.compile("|".join(symbol_pats))
    # Series.iteritems was removed in pandas 2.0; items() is available in all versions.
    for record_id, pgid in final_ksr_df["pub_gene_id"].items():
        if not pat.search(pgid.upper()):
            message_txt = "Record_id {0} has pub_gene_id {1} which doesn't match gene_symbol ({2})".format(
                record_id, pgid, gene_symbol)
            write_ref_seq_QC(seq_qc_fpath, gene_symbol, message_txt)
Esempio n. 6
0
 def test_exact_match_df(self):
     """Check that exact_match_df keeps only rows whose pub_gene_id matches the accepted symbols.

     The unfiltered ATP5MC1 table must contain the three accepted pub_gene_id values plus the unrelated
     LOC100519871; after filtering, LOC100519871 must be gone and the accepted values must remain.
     """
     from SSfilter.ODBfilter import exact_match_df
     tsv_path = "{0}/ODB/ATP5MC1.tsv".format(test_data_dir)
     full_table = SSfasta.load_tsv_table(tsv_path)
     accepted_pgids = ('ATP5G1;ATP5MC1', 'ATP5G1', 'ATP5MC1')

     # Preconditions on the unfiltered table
     all_pgids = full_table['pub_gene_id'].unique()
     for pgid in accepted_pgids:
         self.assertIn(pgid, all_pgids)
     self.assertIn('9823_0:003c30', full_table.index)
     self.assertIn('LOC100519871', all_pgids)

     # Filtered table keeps the accepted ids and drops the rest
     filtered_table = exact_match_df(full_table,['ATP5G1','ATP5MC1'])
     kept_pgids = filtered_table['pub_gene_id'].unique()
     for pgid in accepted_pgids:
         self.assertIn(pgid, kept_pgids)
     self.assertNotIn('LOC100519871', kept_pgids)
Esempio n. 7
0
    def test_outgroup_selection(self):
        """Check that outgroup selection for IRF2BP2 yields one record per species in tax_subset.

        Known species records are selected first; select_outgrup_records then adds records for the remaining
        species, and the final table is expected to have as many rows as tax_subset has entries.
        """
        import SSfilter.ODBfilter
        tax_subset = ['10090_0', '43179_0', '9606_0', '10116_0', '42254_0', '9601_0']
        symbol = 'IRF2BP2'
        errors_fpath = "{0}/outgroup_errors.tsv".format(test_tmp_dir)
        ks_tids = ['10090_0', '43179_0', '9606_0']
        tsv_inpath = "{0}/ODB/{1}.tsv".format(test_data_dir,symbol)
        unfiltered_tsv = SSfasta.load_tsv_table(tsv_inpath, tax_subset=tax_subset)
        unfiltered_fasta = "{0}/ODB/{1}.fasta".format(test_data_dir,symbol)
        am_idx, exact_matches = ODBfilter.find_alias_matches(symbol, unfiltered_tsv, errors_fpath)
        am_df = unfiltered_tsv.loc[am_idx]
        em_df = ODBfilter.exact_match_df(unfiltered_tsv, exact_matches)
        final_ksr_df = ODBfilter.select_known_species_records(symbol, em_df, am_df, ks_tids, unfiltered_fasta)

        final_dict = ODBfilter.select_outgrup_records(em_df,am_df,ks_tids,final_ksr_df,unfiltered_fasta)
        final_df = final_dict['final_df']
        # unittest assertion instead of a bare assert: still checked under `python -O` and reports both values
        self.assertEqual(len(final_df), len(tax_subset))
Esempio n. 8
0
# os.chdir("..")
import sys
sys.path.append(os.getcwd())
import unittest
from SSutility import SSfasta
from IPython.display import display
import pandas as pd
import numpy as np
pd.options.display.max_columns = None

from SSanalysis import aas, blosum62_bg, blos_df, sim_matrix

#Test sequence data
# Module-level fixtures built once at import time.
# NOTE(review): test_outdir already ends with "/", so test_msa contains a double slash -- harmless on POSIX
# paths, confirm for other platforms.
test_outdir = "tests/test_data/output/ATP5MC1/"
test_msa = "{0}/ATP5MC1_msa.fasta".format(test_outdir)
# Alignment loaded into a DataFrame indexed by record id (presumably one row per record -- confirm against
# SSfasta.align_fasta_to_df).
align_df = SSfasta.align_fasta_to_df(test_msa)
# Alignment without record XP_026242723.1 (presumably the NCBI record, given ncbi_idx below).
ODB_df = align_df.drop(index="XP_026242723.1")
# Record ids used by the tests to pick out single rows.
test_idx = pd.Index(["43179_0:00103c"])
ncbi_idx = pd.Index(["XP_026242723.1"])


class testCalculations(unittest.TestCase):
    def test_unique_pos(self):
        from SSanalysis.SSanalysiscalc import find_uniques
        #Identify uniques from ODB_df for ATP5MC1
        filtered = ODB_df
        sub_freq = 1

        uniques = find_uniques(filtered, sub_freq, test_idx, False)
        for expr in [
                16 in uniques.columns,
Esempio n. 9
0
def combined_records_processing(config,
                                am_df,
                                em_df,
                                combined_df,
                                symbol,
                                odb_fasta="",
                                ncbi_fasta="",
                                out_unaln_fasta="",
                                out_aln_fasta="",
                                out_tsv_fpath=""):
    """Write the final record set (OrthoDB plus NCBI) to disk and return the annotated records table.

    Three files are written: an unaligned fasta, an aligned fasta and a tab-separated records table. The
    returned table is a copy of combined_df with an average distance column ('dist') and the annotations added
    by annotate_source_and_filter.

    :param config: configparser object; RUN/RunName and NCBI/NCBITaxID are read
    :param am_df: alias_match DataFrame (see ODBfilter); records of combined_df found in its index are treated
    as OrthoDB records, all others as NCBI records
    :param em_df: exact match DataFrame
    :param combined_df: ODB and NCBI combined record DataFrame as returned by select_NCBI_record
    :param symbol: Gene symbol
    :param odb_fasta,ncbi_fasta: Source fasta files for the final sequences. Empty values default to the
    unfiltered OrthoDB and NCBI fasta paths for run_name and symbol
    :param out_unaln_fasta,out_aln_fasta: Output paths for the unaligned/aligned final sequence sets. Empty
    values default to paths in the output directory for symbol
    :param out_tsv_fpath: Output path for the records table. An empty value defaults to a path in the output
    directory for symbol
    :return: processed_df: DataFrame modified to include record distance, db_source, and filter_type
    """
    run_name = config['RUN']['RunName']
    ncbi_taxid = config['NCBI']['NCBITaxID']

    #Fall back to the run's default locations for any path not supplied
    odb_fasta = odb_fasta or "{0}/input/ODB/{1}.fasta".format(
        run_name, symbol)
    ncbi_fasta = ncbi_fasta or "{0}/input/NCBI/{1}/{2}.fasta".format(
        run_name, ncbi_taxid, symbol)
    out_unaln_fasta = out_unaln_fasta or "{0}/output/{1}/{1}.fasta".format(
        run_name, symbol)
    out_aln_fasta = out_aln_fasta or "{0}/output/{1}/{1}_msa.fasta".format(
        run_name, symbol)
    out_tsv_fpath = out_tsv_fpath or "{0}/output/{1}/{1}_records.tsv".format(
        run_name, symbol)

    SSdirectory.create_directory("{0}/output/{1}".format(run_name, symbol))
    manual_selections_fpath = "{0}/manual_record_selections.tsv".format(
        run_name)

    #Split the combined records by source and write them, in order, to the unaligned fasta
    from_odb = combined_df.index.isin(am_df.index)
    record_stream = SSfasta.ODB_NCBI_generator(
        odb_fasta,
        ncbi_fasta,
        odb_subset=combined_df.index[from_odb],
        ncbi_subset=combined_df.index[~from_odb],
        ordered=True)
    SeqIO.write(record_stream, out_unaln_fasta, 'fasta')

    #Align (written to out_aln_fasta) and compute each record's average distance to the others
    id_dm, aln_srs = SSfasta.construct_id_dm(combined_df,
                                             out_unaln_fasta,
                                             align_outpath=out_aln_fasta)
    annotated_df = combined_df.copy()
    annotated_df.loc[:, 'dist'] = SSfasta.avg_dist_srs(combined_df.index,
                                                       id_dm)

    #Add source_db and selection_type information, drop redundant columns (default is level_taxid, pub_og_id).
    annotated_df = annotate_source_and_filter(config, symbol, annotated_df,
                                              am_df, em_df,
                                              manual_selections_fpath)
    #Persist the records table
    annotated_df.to_csv(out_tsv_fpath, sep='\t')
    return annotated_df
Esempio n. 10
0
def annotate_source_and_filter(
        config,
        symbol,
        combined_df,
        am_df,
        em_df,
        manual_selections_fpath="tmp/manual_record_selections.tsv",
        drop_cols=('pub_og_id', 'level_taxid')):
    """Populates source_db and filter_type information into combined_df, reorders cols and returns modified DataFrame.

    :param config: configparser object; RUN/RunName and NCBI/NCBITaxID are read to locate the unfiltered NCBI
    fasta for symbol
    :param symbol: Gene symbol
    :param combined_df: DataFrame containing final record set from OrthoDB and NCBI. Must contain the columns
    'length', 'dist' and 'seq' (a missing one raises ValueError during column reordering)
    :param am_df: alias matched OrthoDB records; records of combined_df found in its index are labelled
    OrthoDB, all others NCBI
    :param em_df: exact symbol matched OrthoDB records
    :param manual_selections_fpath: tsv file containing records which were manually selected. If file exists at provided
    path, changes selection_type value in corresponding record row
    :param (array-like) drop_cols: If provided, drops columns from processed DataFrame. Used currently to filter
    some redundant OrthoDB tsv information.
    :return: Returns edited copy of combined_df; the input DataFrame is left untouched
    """
    #Leave original dataframe untouched
    combined_df = combined_df.copy()
    run_name, ncbi_taxid = config['RUN']['RunName'], config['NCBI'][
        'NCBITaxID']
    selection_col = 'selection_type'

    is_odb_record = combined_df.index.isin(am_df.index)
    odb_records_idx = combined_df.index[is_odb_record]
    ncbi_records_idx = combined_df.index[~is_odb_record]
    combined_df.loc[odb_records_idx, 'db_source'] = "OrthoDB"
    combined_df.loc[ncbi_records_idx, 'db_source'] = "NCBI"

    #NCBI selection type depends on how many records the unfiltered NCBI fasta offered
    unfiltered_NCBI_fasta = "{0}/input/NCBI/{1}/{2}.fasta".format(
        run_name, ncbi_taxid, symbol)
    unf_ncbi_srs = SSfasta.fasta_to_srs(unfiltered_NCBI_fasta)
    if len(unf_ncbi_srs) > 1:
        ncbi_filt = "NCBI min dist"
    else:
        ncbi_filt = "NCBI single record"
    combined_df.loc[ncbi_records_idx, selection_col] = ncbi_filt

    #Records available per species, counted once instead of re-filtering both tables for every record
    em_counts = em_df['organism_taxid'].value_counts()
    am_counts = am_df['organism_taxid'].value_counts()
    odb_taxids = combined_df.loc[odb_records_idx, 'organism_taxid']
    for record_id, taxid in odb_taxids.items():
        n_em, n_am = em_counts.get(taxid, 0), am_counts.get(taxid, 0)
        if n_em == 0:
            #No exact symbol match for this species: record came from the alias matches
            if n_am == 1:
                selection = "alias match single record"
            else:
                selection = "alias match min dist"
        elif n_em == 1:
            selection = "symbol match single record"
        else:
            selection = "symbol match min dist"
        combined_df.loc[record_id, selection_col] = selection
    #Read manual selection information, fix selection_type if exists for symbol
    if os.path.exists(manual_selections_fpath):
        manual_selections_df = pd.read_csv(manual_selections_fpath,
                                           sep='\t',
                                           dtype=str,
                                           index_col='gene_symbol')
        if symbol in manual_selections_df.index:
            #List indexer always yields a Series, even when symbol has a single cached row
            manual_ids = manual_selections_df.loc[[symbol], 'record_id']
            #Boolean mask only touches rows that exist; a cached id missing from combined_df would otherwise
            #make .loc silently append an all-NaN row
            manually_selected = combined_df.index.isin(manual_ids)
            combined_df.loc[manually_selected,
                            selection_col] = 'manual selection'
    #Drop columns from drop_cols
    if drop_cols:
        combined_df.drop(columns=list(drop_cols), inplace=True)
    #Reorder columns so seq is last.
    reordered_cols = combined_df.columns.tolist()
    reorder_labels = ['db_source', 'selection_type', 'length', 'dist', 'seq']
    for label in reorder_labels:
        #remove() raises ValueError if an expected column is missing
        reordered_cols.remove(label)
        reordered_cols.append(label)
    combined_df = combined_df[reordered_cols]
    return combined_df
Esempio n. 11
0
def process_ODB_input(symbol, config, tax_subset):
    """Build the final OrthoDB input record table for one gene symbol.

    Pipeline: load the raw tsv, find alias and exact symbol matches, select known species records, log QC
    checks on them, add outgroup records, then attach sequence and length columns.

    :param symbol: Gene symbol. Used to find appropriate ODB input files (fasta/ tsv)
    :param config: Contains run info (specifically run_name and ODB test species tax id)
    :param tax_subset: Subset of IDs from species list file, used to limit analyzed sequences to only taxids present in
    tax_subset
    :return (dictionary) results: Contains final_df, em_df, am_df. final_df: Final ODB input record dataframe.
    Contains columns from tsv_files (indexed on int_prot_id OrthoDB internal record IDs), as well as record length
    and sequence information. em_df, am_df as returned by find_alias_matches and exact_match_df
    :raises SequenceDataError: re-raised after being written to the run's errors file
    :raises ValueError: re-raised after printing the symbol and displaying the unfiltered table
    """
    run_section = config['RUN']
    run_name = run_section['RunName']
    test_tid = config['ODB']['ODBTestTaxID']

    #Raw OrthoDB inputs for this symbol
    raw_tsv_fpath = "{0}/input/ODB/{1}.tsv".format(run_name,symbol)
    raw_fa_fpath = "{0}/input/ODB/{1}.fasta".format(run_name,symbol)

    #Run-level log files
    seq_qc_fpath = "{0}/{1}".format(run_name, config['RUN']['QCFileName'])
    errors_fpath = "{0}/{1}".format(run_name,
                                    config['RUN']['ErrorsFileName'])
    manual_selections_fpath = "{0}/manual_record_selections.tsv".format(
        run_name)

    #Species whose records are accepted first (hard-coded taxonomy ids)
    ks_taxids = ['10090_0', '43179_0', '9606_0']
    unfiltered_tsv = SSfasta.load_tsv_table(raw_tsv_fpath,
                                            tax_subset=tax_subset)
    try:
        results = {}
        #Filter by alias matches, exact pub_gene_id matches
        am_ids, exact_matches = find_alias_matches(symbol, unfiltered_tsv,
                                                   errors_fpath)
        am_df = unfiltered_tsv.loc[am_ids]
        em_df = exact_match_df(unfiltered_tsv, exact_matches)

        #Known species records first, QC logged, then the remaining outgroup species
        final_ksr_df = select_known_species_records(
            symbol,
            em_df,
            am_df,
            ks_taxids,
            raw_fa_fpath,
            manual_selections_fpath=manual_selections_fpath)
        final_ksr_df_QC(symbol, exact_matches, final_ksr_df, ks_taxids,
                        test_tid, seq_qc_fpath, raw_fa_fpath)
        outgroup_result = select_outgrup_records(em_df, am_df, ks_taxids,
                                                 final_ksr_df, raw_fa_fpath)
        final_input_df = outgroup_result['final_df']

        #Attach record length and sequence columns
        seq_srs, length_srs = SSfasta.length_srs(raw_fa_fpath,
                                                 final_input_df.index)
        final_input_df['length'] = length_srs
        final_input_df['seq'] = seq_srs

        results['final_df'] = final_input_df
        results['em_df'] = em_df
        results['am_df'] = am_df
    except SequenceDataError as sde:
        #Log errors, raise error for handling in calling function
        write_errors(errors_fpath, symbol, sde)
        raise sde
    except ValueError as e:
        #Show which symbol and input table triggered the failure before propagating it
        print("=====")
        print(symbol)
        display(unfiltered_tsv)
        raise e

    return results
Esempio n. 12
0
def select_outgrup_records(em_df,
                           am_df,
                           ks_taxids,
                           final_ksr_df,
                           seqs_fpath,
                           provide_dist_srs=False,
                           print_skips=False):
    """Select records for remaining OrthoDB outgroup species in analysis that are not in ks_taxids.

    Selection is based on maximum identity to accepted records in final_ksr_df (ie accepted human/mouse/13LGS); best
    (max identity) alias-matched records for a species will still be dropped if they do not meet an identity threshold,
    currently set to 1.5 * (average identity of sequences in final_ksr_df against each other).

    :param (DataFrame) em_df: DataFrame of records with exact pub_gene_id match to accepted gene symbols
    :param (DataFrame) am_df: DataFrame of records matching aliases for symbol
    :param (array-like) ks_taxids: list of taxonomy ids to use as accepted species records
    :param final_ksr_df: DataFrame of accepted records from well-annotated species (max one per species).
    :param seqs_fpath: Fasta file path containing at least all records in am_df. Can be safely set to unfiltered
    fasta input, records will be automatically filtered down appropriately using am_df.
    :param (boolean) provide_dist_srs: If true, calculates internal average distances of each record against rest of
    input records, maps to a Series indexed on record_id, and stores in returned final_dict.
    :param (boolean) print_skips: If true, prints a message for each species whose best record is rejected by
    the identity threshold.
    :return final_dict: Dictionary mapping 'final_df' to final_df (DataFrame containing all selected OrthoDB records
    with final_ksr_df records first) and optionally 'dist_srs' to a Series of average distances of each record
    against rest of input set
    """
    # Local import keeps this block self-contained; harmless if pandas is already imported at module level.
    import pandas as pd

    am_non_ksr_taxids = [
        tax_id for tax_id in am_df["organism_taxid"].unique()
        if tax_id not in ks_taxids
    ]

    # Distance calculations for final set of known species records - check internal identity values
    # Set identity threshold - other species sequences above this value will not be included
    am_dm_fpath = "tmp/am_dm_ka.fasta"
    am_id_dm, am_align_srs = SSfasta.construct_id_dm(am_df, seqs_fpath,
                                                     am_dm_fpath)
    am_record_idx = am_align_srs.index
    ksr_record_idx = final_ksr_df.index
    ksr_pos = [
        am_record_idx.get_loc(record_id) for record_id in ksr_record_idx
    ]
    n_ksr = len(ksr_pos)
    #ksr_sub_dm: n_ksr x n_ksr distance matrix consisting of values of ksr records against each other
    ksr_sub_dm = am_id_dm[:, ksr_pos]
    ksr_sub_dm = ksr_sub_dm[ksr_pos, :]
    # Ignore diagonal (0 values for record identity against itself); set identity threshold which is used to exclude
    #dissimilar records
    # NOTE(review): with a single known species record this divides by zero; the threshold is then presumably NaN
    # and every outgroup record is skipped -- confirm this is the intended outcome.
    non_diagonal_avg = ksr_sub_dm.sum(axis=0) / (n_ksr - 1)
    identity_threshold = np.mean(non_diagonal_avg) * 1.5

    #Known species records first; accepted outgroup rows are collected and concatenated once at the end
    #(DataFrame.append was removed in pandas 2.0)
    selected = [final_ksr_df]
    #Add in min dist records for other outgroup species (am_non_ksr_taxids) to final_df if they meet identity threshold
    for taxid in am_non_ksr_taxids:
        #Pick the species' records by organism_taxid rather than by substring match on the record id, which
        #would also pick up any record id that merely contains the taxid text
        taxid_record_ids = am_df.index[am_df["organism_taxid"] == taxid]
        tax_records = am_record_idx[am_record_idx.isin(taxid_record_ids)]
        md_row, md = min_dist_spec_record(am_id_dm, am_record_idx, tax_records,
                                          ksr_record_idx, am_df)
        if md <= identity_threshold:
            if isinstance(md_row, pd.Series):
                #A Series is one record; turn it into a one-row frame named by its record id
                md_row = md_row.to_frame().T.infer_objects()
            selected.append(md_row)
        elif print_skips:
            print(
                "Min dist record for tax_id {0} does not meet distance threshold {1}"
                .format(taxid, identity_threshold))
            print("Skipping records for this species.")
    final_df = pd.concat(selected)

    final_dict = {}
    final_dict['final_df'] = final_df
    if provide_dist_srs:
        #Reorder and filter distmat down to final_df records order, calculate non-diagonal avg distances
        dm_pos = [
            am_record_idx.get_loc(final_idx) for final_idx in final_df.index
        ]
        final_ordered_dm = am_id_dm[dm_pos, :]
        final_ordered_dm = final_ordered_dm[:, dm_pos]
        dist_srs = SSfasta.avg_dist_srs(final_df.index, final_ordered_dm)
        final_dict['dist_srs'] = dist_srs
    return final_dict
Esempio n. 13
0
def select_known_species_records(
        gene_symbol,
        em_df,
        am_df,
        ks_taxids,
        ks_refseqs_fpath,
        manual_selections_fpath='tmp/manual_record_selections.tsv'):
    """Return a dataframe of at most one record per species in ks_taxids of representative sequences for species in
    ks_taxids. ks_taxids will by default be set to include well-annotated species (human/mouse) and the test species
    from the config file (by default 13LGS).
    Representative sequences are interpreted as being the only available record from a species or the sequence
    most similar (max identity) to other ks_taxids rep. sequences if a species has multiple records. Sequences which are
    not present in am_df (ie no alias matches) will not be returned even if present for a species in the unfiltered input.
    Sequences will be searched for first in em_df but in am_df if no records for that species are in em_df. If em_df or
    am_df have exactly one record per species, those records will be assumed to be correct and no distance calculations
    or filtering will be done. Returned DataFrame can possibly have no records present for test_species, in which case
    the function calling select_known_species_records must check for test_species record presence.

    :param gene_symbol: String HGNC identifier for gene of input data
    :param (DataFrame) em_df: DataFrame of exact symbol matched records
    :param (DataFrame) am_df: DataFrame of alias matched records
    :param (array-like) ks_taxids: Taxonomy IDs for well-annotated species and test species (human/ mouse/ 13LGS)
    :param ks_refseqs_fpath: Fasta file path for alias-match filtered OrthoDB input
    :param manual_selections_fpath: tsv path handed to the manual selection step, which is used when no species
    has exactly one candidate record
    :raises SequenceDataError: (code 1) if neither em_df nor am_df holds a record for any species in ks_taxids
    :return: DataFrame of selected records, ordered as ks_taxids (species without records are absent)
    """
    def _stack_rows(parts):
        #Stack records into one DataFrame with a single pd.concat (DataFrame.append was removed in pandas
        #2.0). A Series is treated as one record row named by its record id; empty parts are ignored.
        frames = [
            part.to_frame().T.infer_objects()
            if isinstance(part, pd.Series) else part for part in parts
        ]
        frames = [frame for frame in frames if len(frame)]
        if not frames:
            return pd.DataFrame(columns=ksr_em_df.columns)
        return pd.concat(frames)

    #Filter em_df and am_df down to taxonomy IDs in ks_taxids
    ksr_am_df = am_df.loc[am_df["organism_taxid"].isin(ks_taxids), :]
    ksr_em_df = em_df.loc[em_df["organism_taxid"].isin(ks_taxids), :]
    am_taxid_uniques = ksr_am_df["organism_taxid"].unique()
    em_taxid_uniques = ksr_em_df["organism_taxid"].unique()

    #If exactly one record for each species, reorder in order of ks_taxids and return. Saves extra alignment steps.
    #Exact symbol matches are preferred over alias matches.
    for candidate_df, candidate_taxids in ((ksr_em_df, em_taxid_uniques),
                                           (ksr_am_df, am_taxid_uniques)):
        if len(candidate_df) == len(ks_taxids) and len(
                candidate_taxids) == len(ks_taxids):
            return _stack_rows([
                candidate_df.loc[candidate_df["organism_taxid"] == tax_id, :]
                for tax_id in ks_taxids
            ])

    #Set selection df to be the smallest available set for which at least one record present from ks_taxids species
    if not ksr_em_df.empty:
        selection_df = ksr_em_df
    elif not ksr_am_df.empty:
        selection_df = ksr_am_df
    else:
        raise SequenceDataError(
            1,
            "No GeneCards alias matched sequence records for human/mouse/test species"
        )

    #Populate single_avail_ksr with records if there is only one record in selection_df from that tax_id
    single_rows = []
    for tax_id in ks_taxids:
        tax_rows = selection_df.loc[selection_df['organism_taxid'] ==
                                    tax_id, :]
        if len(tax_rows) == 1:
            single_rows.append(tax_rows)
    single_avail_ksr = _stack_rows(single_rows)
    if single_avail_ksr.empty:
        #If no species have single record, take manual input (or read from cached selections if previously entered),
        #use selected record as seed input for determining best records from other species.
        selection_fapath = 'tmp/filtered_selection_intput.fasta'
        SSfasta.filter_fasta_infile(selection_df.index, ks_refseqs_fpath,
                                    selection_fapath)
        display_df = selection_df.copy().drop(
            columns=['pub_og_id', 'og_name', 'level_taxid'])
        display_df.loc[:, 'seq'] = SSfasta.fasta_to_srs(selection_fapath)
        selection_row = __parse_manual_selection_input(
            gene_symbol, selection_df, display_df, manual_selections_fpath)
        single_avail_ksr = _stack_rows([selection_row])
    sa_record_ids = single_avail_ksr.index
    sa_taxid_uniques = single_avail_ksr['organism_taxid'].unique()

    am_id_dm, am_align_srs = SSfasta.construct_id_dm(ksr_am_df,
                                                     ks_refseqs_fpath,
                                                     ordered=False)

    selected_rows = []
    for ks_id in ks_taxids:
        if ks_id in sa_taxid_uniques:
            #Species already represented by its single available (or manually selected) record
            selected_rows.append(
                single_avail_ksr.loc[single_avail_ksr['organism_taxid'] ==
                                     ks_id, :])
            continue
        #Use em_df or am_df depending on if ks_id is present
        if ks_id in em_taxid_uniques:
            spec_record_ids = ksr_em_df.loc[ksr_em_df['organism_taxid'] ==
                                            ks_id, :].index
        elif ks_id in am_taxid_uniques:
            spec_record_ids = ksr_am_df.loc[ksr_am_df['organism_taxid'] ==
                                            ks_id, :].index
        else:
            #If no records for taxid in either em or am dfs, skip ksr selection
            continue
        # Maximum identity = minimum id_dm value based on AlignIO implementation
        md_row, min_dist = min_dist_spec_record(am_id_dm,
                                                am_align_srs.index,
                                                spec_record_ids,
                                                sa_record_ids, ksr_am_df)
        selected_rows.append(md_row)
    return _stack_rows(selected_rows)