def test_ENSP_consistency_of_DB():
    """
    Check that the ENSPs of protein_2_function are contained in taxid_2_protein.

    For every TaxID returned by ``query.get_taxids()`` the ENSPs of
    taxid_2_protein are expected to be a superset of the ENSPs of
    protein_2_function: a foreground protein with a functional association
    also has to be in the precomputed background.

    Tables involved:
        TaxID_2_Protein_table_STRING: ENSPs expected to be the superset of
            Protein_2_Function_table_STRING
        Protein_2_Function_table_STRING
        Function_2_ENSP_table_STRING (not queried by this test)
    """
    for taxid in query.get_taxids():
        ensp_taxid_2_protein = set(query.get_proteins_of_taxid(taxid))
        # Raw string for the regex part: "\." is meant for the database's
        # regex engine and is not a valid Python escape sequence.
        statement = (
            "SELECT protein_2_function.an FROM protein_2_function "
            r"WHERE protein_2_function.an ~ '^{}\.'"
        ).format(taxid)
        ensp_protein_2_function = {
            ele[0] for ele in query.get_results_of_statement(statement)
        }
        # One subset check replaces the three equivalent length comparisons
        # (len(A) >= len(B), len(A & B) == len(B), len(A | B) == len(A)).
        missing = ensp_protein_2_function - ensp_taxid_2_protein
        assert not missing, (
            "taxid {}: {} ENSPs of protein_2_function are missing from "
            "taxid_2_protein".format(taxid, len(missing)))
def random_foreground_background():
    """
    Return a random foreground/background pair for a randomly chosen TaxID.

    Used a TaxIDs fixture previously, but now it is random on TaxID level as
    well. Up to 10 TaxIDs are tried so that an organism with fewer than 200
    proteins does not make ``random.sample`` fail on the first draw.

    Returns:
        tuple: ``(foreground, background, taxid)`` where ``background`` is
        the result of ``query.get_proteins_of_taxid(taxid)`` and
        ``foreground`` is a random sample of 200 of its elements.

    Raises:
        ValueError: if none of the tried TaxIDs has at least 200 proteins.
    """
    sample_size = 200
    attempts = 10
    taxids = query.get_taxids()
    for _ in range(attempts):
        taxid = random.choice(taxids)
        background = query.get_proteins_of_taxid(taxid)  # read_from_flat_files=True
        if len(background) < sample_size:
            # Background too small to sample from; try another TaxID.
            continue
        foreground = random.sample(background, sample_size)
        return foreground, background, taxid
    raise ValueError(
        "no TaxID with at least {} proteins found in {} attempts".format(
            sample_size, attempts))
def test_functional_association_consistency_of_DB(pqo_STRING):
    """
    Compare precalculated background counts with counts derived from protein_2_function.

    All functional associations of a given taxid and ENSP from
    protein_2_function need to be present in function_2_ensp. Since the ENSPs
    of the background don't matter for the p-value calculation, but only the
    lookup of association to number of ENSPs (counts), the precalculated
    counts are compared to foreground counts of the proteome. The foreground
    is generated from the protein_2_function table (rather than looked up in
    taxid_2_protein) to also check for consistency between the ENSPs of
    protein_2_function_table and taxid_2_protein_table.

    Args:
        pqo_STRING: object providing
            ``taxid_2_etype_2_association_2_count_dict_background`` and
            ``get_association_dict_split_by_category``.
    """
    taxid_2_etype_2_association_2_count_dict_background = (
        pqo_STRING.taxid_2_etype_2_association_2_count_dict_background)
    for taxid in query.get_taxids():
        # grep ENSPs from protein_2_function table (instead of
        # taxid_2_protein_table) --> use as foreground.
        # Raw string for the regex part: "\." is meant for the database's
        # regex engine and is not a valid Python escape sequence.
        statement = (
            "SELECT protein_2_function.an FROM protein_2_function "
            r"WHERE protein_2_function.an ~ '^{}\.'"
        ).format(taxid)
        ensp_protein_2_function = {
            ele[0] for ele in query.get_results_of_statement(statement)
        }
        # etype_2_association_dict(key=entity_type(String), val=Dict(key=AN(String), val=SetOfFunctions(String)))
        etype_2_association_dict = pqo_STRING.get_association_dict_split_by_category(
            ensp_protein_2_function)
        for etype in variables.entity_types_with_data_in_functions_table:
            association_2_count_dict_background = (
                taxid_2_etype_2_association_2_count_dict_background[taxid][etype])
            # Only the term -> ANs mapping is needed; the other two return
            # values of count_terms_v3 are not used by this test.
            _, association_2_ANs_dict_foreground, _ = ratio.count_terms_v3(
                ans_set=ensp_protein_2_function,
                assoc_dict=etype_2_association_dict[etype])
            for goterm, ans_set in association_2_ANs_dict_foreground.items():
                assert association_2_count_dict_background[goterm] == len(ans_set), (
                    "taxid {}, etype {}, term {}: background count differs "
                    "from foreground count {}".format(
                        taxid, etype, goterm, len(ans_set)))
def random_abundance_correction_foreground_background():
    """
    Return a random foreground/background pair plus random intensities.

    A TaxID is chosen at random; up to 10 TaxIDs are tried so that an
    organism with fewer than 200 proteins does not make ``random.sample``
    fail on the first draw.

    Returns:
        tuple: ``(foreground, background, intensity, taxid)`` where
        ``background`` is the result of ``query.get_proteins_of_taxid(taxid)``,
        ``foreground`` is a random sample of 200 of its elements and
        ``intensity`` is a list with one stringified draw from
        ``np.random.normal`` per background element.

    Raises:
        ValueError: if none of the tried TaxIDs has at least 200 proteins.
    """
    sample_size = 200
    attempts = 10
    taxids = query.get_taxids()
    for _ in range(attempts):
        taxid = random.choice(taxids)
        background = query.get_proteins_of_taxid(taxid)  # read_from_flat_files=True
        if len(background) < sample_size:
            # Background too small to sample from; try another TaxID.
            continue
        foreground = random.sample(background, sample_size)
        intensity = [
            str(ele) for ele in np.random.normal(size=len(background))
        ]
        return foreground, background, intensity, taxid
    raise ValueError(
        "no TaxID with at least {} proteins found in {} attempts".format(
            sample_size, attempts))