Example #1
0
    def test_ksr_record_select(self):
        """Check known-species record selection and the manual-selection cache.

        For every symbol in the test list, the ODB tsv table is loaded, alias
        and exact matches are computed, and
        ``ODBfilter.select_known_species_records`` is run with a manual
        selections file under ``test_tmp_dir``. Every symbol except CD151 must
        yield exactly one record per known-species tax ID; CD151 must not.
        For CALM1 the selection is repeated with stdout captured, and the
        output must contain the cached-selection message.
        """
        # ``import SSfilter.ODBfilter`` would only bind ``SSfilter`` in this
        # scope; bind the submodule name that the body actually uses.
        from SSfilter import ODBfilter
        import contextlib
        import io

        test_symbol_list = ['ATP5MC1', 'CALM1', 'ATPIF1', 'CD151']
        tax_subset = ['10090_0', '43179_0', '9606_0', '10116_0', '42254_0', '9601_0']
        errors_fpath = "cDNAscreen_041020/summary/errors.tsv"
        ks_tids = ['10090_0', '43179_0', '9606_0']
        # Toggles for printing intermediate tables while debugging.
        display_match_data = False
        display_ksr = True

        tmp_manual_selections = "{0}/manual_record_selections.tsv".format(test_tmp_dir)
        # Message expected on stdout when a cached manual selection is reused.
        cached_msg = ('To clear selections, either delete corresponding row in file at '
                      '{0}'.format(tmp_manual_selections))

        with pd.option_context('display.max_columns', None, 'display.max_colwidth', 500):
            for symbol in test_symbol_list:
                tsv_inpath = "{0}/ODB/{1}.tsv".format(test_data_dir, symbol)
                unfiltered_tsv = SSfasta.load_tsv_table(tsv_inpath, tax_subset=tax_subset)
                unfiltered_fasta = "{0}/ODB/{1}.fasta".format(test_data_dir, symbol)
                am_idx, exact_matches = ODBfilter.find_alias_matches(symbol, unfiltered_tsv, errors_fpath)
                am_df = unfiltered_tsv.loc[am_idx]
                em_df = ODBfilter.exact_match_df(unfiltered_tsv, exact_matches)
                if display_match_data:
                    print("alias_match")
                    display(am_df)
                    print("exact_match")
                    display(em_df)
                final_ksr_df = ODBfilter.select_known_species_records(
                    symbol, em_df, am_df, ks_tids, unfiltered_fasta,
                    manual_selections_fpath=tmp_manual_selections)
                if display_ksr:
                    print("Final known species records: ")
                    display(final_ksr_df)

                fail_msg = "unexpected known species record count for {0}".format(symbol)
                if symbol == 'CD151':
                    # Ensure non alias match / exact match sequences are not
                    # present in the final known species records.
                    self.assertNotEqual(len(final_ksr_df), len(ks_tids), fail_msg)
                else:
                    self.assertEqual(len(final_ksr_df), len(ks_tids), fail_msg)

                # Test manual selections cache
                if symbol == 'CALM1':
                    print("Repeating CALM1 record selection. Should not ask for input (check this yourself)")

                    out_buf = io.StringIO()
                    print("Checking for cached selection output...")
                    with contextlib.redirect_stdout(out_buf):
                        final_ksr_df = ODBfilter.select_known_species_records(
                            symbol, em_df, am_df, ks_tids, unfiltered_fasta,
                            manual_selections_fpath=tmp_manual_selections)
                    # Assert after stdout is restored; the captured text is
                    # already complete at this point.
                    self.assertIn(cached_msg, out_buf.getvalue())
                    print("Cached selection output found.")
Example #2
0
 def test_exact_match_df(self):
     """Check that exact_match_df keeps only rows matching the given gene IDs.

     The unfiltered ATP5MC1 table must contain the pub_gene_id values
     'ATP5G1;ATP5MC1', 'ATP5G1', 'ATP5MC1' and the unrelated 'LOC100519871'
     (with record '9823_0:003c30' present in the index). After filtering on
     ['ATP5G1', 'ATP5MC1'] the first three values must remain and
     'LOC100519871' must be gone.
     """
     from SSfilter.ODBfilter import exact_match_df
     test_input_path = "{0}/ODB/ATP5MC1.tsv".format(test_data_dir)
     tsv_df = SSfasta.load_tsv_table(test_input_path)

     # Preconditions on the unfiltered fixture data.
     unfiltered_uniques = tsv_df['pub_gene_id'].unique()
     for expected_id in ('ATP5G1;ATP5MC1', 'ATP5G1', 'ATP5MC1'):
         self.assertIn(expected_id, unfiltered_uniques)
     # Checked separately so a failure reports which membership test broke.
     self.assertIn('9823_0:003c30', tsv_df.index)
     self.assertIn('LOC100519871', unfiltered_uniques)

     test_em = ['ATP5G1', 'ATP5MC1']
     exact_matches = exact_match_df(tsv_df, test_em)
     pgid_uniques = exact_matches['pub_gene_id'].unique()
     for expected_id in ('ATP5G1;ATP5MC1', 'ATP5G1', 'ATP5MC1'):
         self.assertIn(expected_id, pgid_uniques)
     self.assertNotIn('LOC100519871', pgid_uniques)
Example #3
0
    def test_outgroup_selection(self):
        """Check that outgroup selection yields one record per tax ID.

        Runs alias/exact matching and known-species record selection for
        IRF2BP2, then ``ODBfilter.select_outgrup_records``; the resulting
        'final_df' must have as many rows as there are entries in tax_subset.
        """
        # ``import SSfilter.ODBfilter`` would only bind ``SSfilter`` in this
        # scope; bind the submodule name that the body actually uses.
        from SSfilter import ODBfilter

        tax_subset = ['10090_0', '43179_0', '9606_0', '10116_0', '42254_0', '9601_0']
        symbol = 'IRF2BP2'
        errors_fpath = "{0}/outgroup_errors.tsv".format(test_tmp_dir)
        ks_tids = ['10090_0', '43179_0', '9606_0']
        tsv_inpath = "{0}/ODB/{1}.tsv".format(test_data_dir, symbol)
        unfiltered_tsv = SSfasta.load_tsv_table(tsv_inpath, tax_subset=tax_subset)
        unfiltered_fasta = "{0}/ODB/{1}.fasta".format(test_data_dir, symbol)
        am_idx, exact_matches = ODBfilter.find_alias_matches(symbol, unfiltered_tsv, errors_fpath)
        am_df = unfiltered_tsv.loc[am_idx]
        em_df = ODBfilter.exact_match_df(unfiltered_tsv, exact_matches)
        final_ksr_df = ODBfilter.select_known_species_records(symbol, em_df, am_df, ks_tids, unfiltered_fasta)

        final_dict = ODBfilter.select_outgrup_records(em_df, am_df, ks_tids, final_ksr_df, unfiltered_fasta)
        final_df = final_dict['final_df']
        # Use the unittest assertion: a bare ``assert`` is stripped under
        # ``python -O`` and gives no diagnostic on failure.
        self.assertEqual(len(final_df), len(tax_subset))
Example #4
0
def process_ODB_input(symbol, config, tax_subset):
    """Return final ODB input record dataframe.

    Input files are read from ``<RunName>/input/ODB/<symbol>.tsv`` and
    ``<RunName>/input/ODB/<symbol>.fasta``. The known-species tax IDs used for
    record selection are currently hard-coded in this function.

    :param symbol: Gene symbol. Used to find appropriate ODB input files (fasta/ tsv)
    :param config: Contains run info. Reads config['RUN'] keys 'RunName',
    'QCFileName', 'ErrorsFileName' and config['ODB'] key 'ODBTestTaxID'.
    :param tax_subset: Subset of IDs from species list file, used to limit analyzed sequences to only taxids present in
    tax_subset
    :return (dictionary) results: Contains final_df, em_df, am_df. final_df: Final ODB input record dataframe with
    added 'length' and 'seq' columns taken from SSfasta.length_srs. am_df: rows of the loaded tsv table selected by the
    indices from find_alias_matches. em_df: result of exact_match_df.
    :raises SequenceDataError: written to the run's errors file via write_errors, then re-raised for handling in the
    calling function.
    :raises ValueError: the symbol and unfiltered table are printed/displayed for debugging, then re-raised.
    """
    run_config, odb_config = config['RUN'], config['ODB']
    run_name, test_tid = run_config['RunName'], odb_config['ODBTestTaxID']
    raw_tsv_fpath = "{0}/input/ODB/{1}.tsv".format(run_name, symbol)
    raw_fa_fpath = "{0}/input/ODB/{1}.fasta".format(run_name, symbol)
    seq_qc_fpath = "{0}/{1}".format(run_name, run_config['QCFileName'])
    errors_fpath = "{0}/{1}".format(run_name, run_config['ErrorsFileName'])
    manual_selections_fpath = "{0}/manual_record_selections.tsv".format(run_name)
    ks_taxids = ['10090_0', '43179_0', '9606_0']
    # Loaded outside the try block: errors raised here propagate unlogged,
    # as in the original control flow.
    unfiltered_tsv = SSfasta.load_tsv_table(raw_tsv_fpath,
                                            tax_subset=tax_subset)
    # Filter by alias matches, exact pub_gene_id matches
    try:
        am_ids, exact_matches = find_alias_matches(symbol, unfiltered_tsv,
                                                   errors_fpath)
        am_df = unfiltered_tsv.loc[am_ids]
        em_df = exact_match_df(unfiltered_tsv, exact_matches)
        final_ksr_df = select_known_species_records(
            symbol,
            em_df,
            am_df,
            ks_taxids,
            raw_fa_fpath,
            manual_selections_fpath=manual_selections_fpath)
        final_ksr_df_QC(symbol, exact_matches, final_ksr_df, ks_taxids,
                        test_tid, seq_qc_fpath, raw_fa_fpath)
        final_dict = select_outgrup_records(em_df, am_df, ks_taxids,
                                            final_ksr_df, raw_fa_fpath)
        final_input_df = final_dict['final_df']
        seq_srs, length_srs = SSfasta.length_srs(raw_fa_fpath,
                                                 final_input_df.index)
        final_input_df['length'] = length_srs
        final_input_df['seq'] = seq_srs
    except SequenceDataError as sde:
        # Log errors, then re-raise for handling in calling function.
        write_errors(errors_fpath, symbol, sde)
        raise
    except ValueError:
        # Dump debugging context before propagating the error.
        print("=====")
        print(symbol)
        display(unfiltered_tsv)
        raise

    return {'final_df': final_input_df, 'em_df': em_df, 'am_df': am_df}