def get_raw_references(datapath, phenotype_set):
    """Read a phenotab file into per-disease representations.

    Phenotypes are kept as they appear in the file (the original
    ontology); no translation is performed here.

    Args:
        datapath        path to phenotab file (tab-separated, header row)
        phenotype_set   set of acceptable phenotypes

    Returns:
        two objects
        - dict mapping reference ids ("Source:Disease_number") to
          Representation objects holding a title and phenotype values
        - set of phenotypes seen in the file but not in phenotype_set
    """

    references = {}
    rejected = set()
    with open_file(datapath, "rt") as handle:
        for record in csv.DictReader(handle, delimiter="\t", quotechar="\""):
            # rows without a valid reference id are ignored entirely
            if not valid_reference_id(record["Reference"]):
                continue
            phenotype = record["Phenotype"]
            if phenotype not in phenotype_set:
                rejected.add(phenotype)
                continue
            frequency = tofreq[record["Frequency"]]
            ref_id = record["Source"] + ":" + record["Disease_number"]
            # first sighting of a disease creates its representation
            if ref_id not in references:
                references[ref_id] = Representation(name=ref_id)
                references[ref_id].title = record["Disease_title"]
            references[ref_id].set(phenotype, frequency)
    return references, rejected
def test_impute_fromseeds_highfirst(self):
    """impute from explicit seeds; the shared ancestor accumulates evidence."""

    rep = Representation(dict())
    # data for two children; DOID:4 is higher in the tree, so should gain
    seeded = [("DOID:0014667", 0.4), ("DOID:0080015", 0.3)]
    for term, value in seeded:
        rep.set(term, value)
    rep.impute(self.obo, self.obodef, seeds=[term for term, _ in seeded])

    # the seeded terms keep the values that were set manually
    for term, value in seeded:
        self.assertAlmostEqual(rep.get(term), value, msg="should remain")
    expected_ancestor = 1 - ((1 - 0.4) * (1 - 0.3) * (1 - 0.2))
    self.assertAlmostEqual(
        rep.get("DOID:4"), expected_ancestor,
        msg="ancestor gains from two children (and prior)")
    self.assertAlmostEqual(rep.get("DOID:655"), 0.2,
                           msg="remain; new DOID:4")
def setUpClass(cls):
    """Build the test db through the pipeline, then load ontology and refsets."""

    config = CompleteTestConfig()
    config.null_prior = 0.2
    cls.dbfile = config.db
    cls.pipeline = Phenoscoring(config)
    cls.pipeline.build()
    cls.obo = MinimalObo(check_file(config.obo, config.db, "obo"), True)

    # dummy per-term values: a flat 0.2 and an all-zero variant
    cls.obodefaults = {term: 0.2 for term in cls.obo.ids()}
    cls.obozeros = {term: 0 for term in cls.obo.ids()}

    cls.ref_priors = get_ref_priors(config.db)
    refsets = get_refsets(config.db, ref_priors=cls.ref_priors)
    cls.rs, cls.rs2 = refsets
    cls.rs.learn_obo(cls.obo)
    cls.rs2.learn_obo(cls.obo)

    # small hand-made representations for testing individual configurations
    cls.y3model = Representation(name="Y3").set("Y:003", 0.8)
    for label in ("refA", "refB"):
        setattr(cls, label, Representation(name=label).set("Y:002", 1))
        getattr(cls, label).defaults(cls.obozeros)
def setUpClass(cls):
    """Create a reference set with a null, one negative and two mixed references."""

    # the references are built on Y:002 and Y:001, which are siblings
    # phenotype priors, all nonzero
    cls.priors = {
        "Y:004": 0.66,
        "Y:005": 0.25,
        "Y:006": 0.25,
        "Y:003": 0.66,
        "Y:001": 0.33,
        "Y:002": 0.33,
        "Y:007": 0.25,
        "Y:008": 0.25,
    }

    cls.refnull = Representation(name="null")
    # refA: a single negative phenotype
    cls.refA = Representation(name="refA")
    cls.refA.set("Y:002", 0.1).impute(Yobo, cls.priors)
    # refB: one negative and one positive phenotype
    cls.refB = Representation(name="refB")
    cls.refB.set("Y:001", 0.01).set("Y:006", 0.8).impute(Yobo, cls.priors)
    # refB2: like refB, but the positive phenotype is weaker
    cls.refB2 = Representation(name="refB2")
    cls.refB2.set("Y:001", 0.1).set("Y:006", 0.5).impute(Yobo, cls.priors)

    reference_priors = dict(null=0.4, refA=0.3, refB=0.3, refB2=0.3)
    cls.rs = ReferenceSet(reference_priors, ids=Yobo.ids(),
                          row_priors=cls.priors)
    cls.rs.add(cls.refnull).add(cls.refA).add(cls.refB).add(cls.refB2)
    cls.rs.learn_obo(Yobo)
def setUpClass(cls):
    """Create a reference set whose references carry strong phenotypes.

    refA and refB each get one phenotype set to 1 and are then imputed
    against the phenotype priors. Any phenotype whose value still equals
    its prior afterwards is reset to half the prior.
    """

    # phenotype priors, all nonzero
    cls.priors = dict()
    cls.priors["Y:004"] = 0.66
    cls.priors["Y:005"] = cls.priors["Y:006"] = 0.25
    cls.priors["Y:003"] = 0.66
    cls.priors["Y:001"] = cls.priors["Y:002"] = 0.33
    cls.priors["Y:007"] = cls.priors["Y:008"] = 0.25

    # references with some strong phenotypes
    # (each reference is constructed exactly once)
    cls.refnull = Representation(name="null")
    cls.refA = Representation(name="refA")
    cls.refA.set("Y:002", 1).impute(Yobo, cls.priors)
    cls.refB = Representation(name="refB")
    cls.refB.set("Y:001", 1).impute(Yobo, cls.priors)

    # reset missing phenotypes to smaller-than-prior
    # (exact equality is intended: it detects values that are still the prior)
    for k, v in cls.priors.items():
        for ref in (cls.refA, cls.refB):
            if ref.get(k) == v:
                ref.set(k, v / 2)

    cls.rs = ReferenceSet(dict(null=0.3, refA=0.3, refB=0.3),
                          ids=Yobo.ids(), row_priors=cls.priors)
    cls.rs.add(cls.refnull).add(cls.refA).add(cls.refB)
    cls.rs.learn_obo(Yobo)
def test_bg_nonverbose(self):
    """a model value equal to the prior should leave the score at its prior."""

    background = self.priors["Y:004"]
    neutral = Representation(name="TP")
    neutral.set("Y:004", background)
    result = self.rs.inference_chain(neutral, "refA", verbose=False)
    result.evaluate()
    self.assertEqual(result.posterior, result.prior)
def get_complete_null(dbpath):
    """load the complete phenotype representation of the "null" reference"""

    null_rep = Representation(name="null")
    # only rows whose id is "null" are requested from the table
    null_rows = DBGenerator(ReferenceCompletePhenotypeTable(dbpath),
                            where=dict(id="null"))
    for record in null_rows.next():
        null_rep.set(record["phenotype"], record["value"])
    return null_rep
def test_FP_same_parent(self):
    """false positives below the same parent should score identically"""

    # both models are FP using children of a refA phenotype
    models = [
        Representation(name="close").set("Y:007", 0.9),
        Representation(name="far").set("Y:008", 0.9),
    ]
    chains = [self.rs.inference_chain(model, "refA", verbose=True)
              for model in models]
    for chain in chains:
        chain.evaluate()
    near, distant = chains
    self.assertEqual(near.posterior, distant.posterior)
def get_concise_refdict(dbpath):
    """collect concise reference phenotypes from the db into a dict.

    The dict maps reference ids to Representation objects and always
    contains an entry for "null", even if the table has no rows for it.
    """

    concise = {"null": Representation(name="null")}
    rows = DBGenerator(ReferenceConcisePhenotypeTable(dbpath))
    for record in rows.next():
        key = record["id"]
        rep = concise.get(key)
        if rep is None:
            # first row for this reference
            rep = concise[key] = Representation(name=key)
        rep.set(record["phenotype"], record["value"])
    return concise
def test_prep_row_priors(self):
    """prep() should produce a prior for every feature."""

    # reference universe made of two annotated references
    refA = Representation(data=dict(a=1), name="refA")
    refA.defaults(zerovals)
    refB = Representation(data=dict(a=1, b=0.8), name="refB")
    refB.defaults(zerovals)
    ref_priors = dict(refA=0.5, refB=0.5)
    rs = ReferenceSet(ref_priors, ids=zerovals.keys())
    rs.add(refA).add(refB)

    # compute feature priors
    rs.prep()

    # every feature should be present among the rows
    self.assertEqual(set(rs.row_names), set(zerovals.keys()))

    # features declared in representations should get reasonable priors
    index = {feature: rs.rows[feature] for feature in ("a", "b", "d")}
    expectations = [
        ("a", 1, "refA and refB both have a"),
        ("b", 0.4, "only refB has b, so 0.8/2"),
        ("d", 0.2, "value is 1/num features"),
    ]
    for feature, prior, note in expectations:
        self.assertEqual(rs.row_priors[index[feature]], prior, note)
def test_positive_parent_multi(self):
    """find a positive ancestor when a term has more than one parent."""

    # ontology in which Y7 is connected to both Y2 and Y1
    multi_obo = MinimalObo(join(testdir, "Ymulti.obo"))
    defaults = dict.fromkeys(multi_obo.ids(), 0.0001)
    for term in ("Y:003", "Y:005"):
        defaults[term] = 0.0002

    # two references that differ slightly from each other
    rs = ReferenceSet(dict(refA=0.5, refB=0.5),
                      ids=multi_obo.ids(), row_priors=defaults)
    refA = Representation(name="refA")
    refA.set("Y:002", 0.5).set("Y:005", 1).impute(multi_obo, defaults)
    refB = Representation(name="refB")
    refB.set("Y:001", 0.5).impute(multi_obo, defaults)
    rs.add(refA).add(refB)
    rs.learn_obo(multi_obo)

    cases = [
        ("refA", "Y:002", "Y2 is a positive ancestor"),
        ("refB", "Y:001", "Y1 is a positive immediate parent"),
    ]
    for ref_name, ancestor, note in cases:
        found = rs._positive_ancestor(rs.columns[ref_name],
                                      rs.rows["Y:007"])
        self.assertEqual(found, rs.rows[ancestor], note)
def test_FN_decrease_scaled_with_ref(self):
    """a false negative lowers the score, more so against a stronger ref"""

    chains = []
    for model_name, ref_name in (("FN1", "refB"), ("FN2", "refB2")):
        model = Representation(name=model_name).set("Y:005", 0.01)
        chain = self.rs.inference_chain(model, ref_name, verbose=True)
        chain.evaluate_inference()
        chains.append(chain)
    versus_strong, versus_weak = chains

    # both cases should lead to decrease in score
    for chain in chains:
        self.assertLess(chain.posterior, chain.prior)
    # the drop should be steeper against refB, whose positive is stronger
    self.assertLess(versus_strong.posterior, versus_weak.posterior)
def test_FN_decrease_scales_with_model(self):
    """a false negative lowers the score, more so for a lower model value"""

    mild = Representation(name="FN").set("Y:005", 0.01)
    extreme = Representation(name="FN").set("Y:005", 0.001)
    chains = []
    for model in (mild, extreme):
        chain = self.rs.inference_chain(model, "refB", verbose=True)
        chain.evaluate_inference()
        chains.append(chain)
    chain_mild, chain_extreme = chains

    # both models should decrease score
    for chain in chains:
        self.assertLess(chain.posterior, chain.prior)
    # second model should decrease more
    self.assertLess(chain_extreme.posterior, chain_mild.posterior)
def test_FN_can_decrease_score(self):
    """a false negative against refB should pull the score below the prior"""

    model = Representation(name="FN").set("Y:005", 0.01)
    chain = self.rs.inference_chain(model, "refB", verbose=True)
    chain.evaluate_inference()
    posterior, prior = chain.posterior, chain.prior
    self.assertLess(posterior, prior)
def test_AN_leaves_score_unchanged(self):
    """a negative model value scored against refA should not move the score"""

    model = Representation(name="FN").set("Y:005", 0.01)
    chain = self.rs.inference_chain(model, "refA", verbose=True)
    chain.evaluate_inference()
    posterior, prior = chain.posterior, chain.prior
    self.assertAlmostEqual(posterior, prior)
def test_empty_representation(self):
    """a representation built without arguments has no data and no name."""

    blank = Representation()
    num_items = len(blank.data)
    self.assertEqual(num_items, 0, "representation should be empty")
    self.assertEqual(blank.name, None,
                     "representation should not have a name")
def test_add_without_name_raises(self):
    """a reference set should refuse a representation that has no name."""

    unnamed = Representation().set("DOID:0014667", 0.4)
    ref_priors = dict(refA=0.5, refB=0.5)
    refset = ReferenceSet(ref_priors, ids=obo.ids())
    # callable form of assertRaises: invokes refset.add(unnamed)
    self.assertRaises(Exception, refset.add, unnamed)
def test_FP2_with_null(self):
    """model with Y:002 (moderately high prior) should not gain vs null."""

    model = Representation(name="model").set("Y:002", 0.9)
    chain = self.rs.inference_chain(model, "null")
    chain.evaluate()
    posterior, prior = chain.posterior, chain.prior
    self.assertLessEqual(posterior, prior)
def test_FP3_with_null(self):
    """model with Y:003 (fairly high prior) should not gain vs null."""

    model = Representation(name="model").set("Y:003", 0.9)
    chain = self.rs.inference_chain(model, "null", verbose=True)
    chain.evaluate()
    posterior, prior = chain.posterior, chain.prior
    self.assertLessEqual(posterior, prior)
def test_positive_parent(self):
    """look up the nearest term with a positive value in a reference."""

    rs = ReferenceSet(dict(refA=0.5), ids=Yobo.ids(), row_priors=Ydefaults)
    refA = Representation(name="refA")
    refA.set("Y:002", 0.5).set("Y:005", 1).impute(Yobo, Ydefaults)
    rs.add(refA)
    rs.learn_obo(Yobo)

    column = rs.columns["refA"]
    # (query term, expected ancestor, explanation)
    cases = [
        ("Y:002", "Y:002", "Y2 is itself is positive"),
        ("Y:007", "Y:002", "Y2 is immediate parent of Y7"),
        ("Y:006", "Y:005", "Y5 is immediate parent of Y6"),
    ]
    for query, ancestor, note in cases:
        found = rs._positive_ancestor(column, rs.rows[query])
        self.assertEqual(found, rs.rows[ancestor], note)
def get_model_representations(dbpath, obo, log=None, log_prefix="", model_names=None):
    """load model phenotype data from the db into representations

    Args:
        dbpath        db location, passed on to the table/query helpers
        obo           ontology object, used to canonicalize phenotype ids
                      and to replace terms that are known but not valid
        log           optional function; called with a message whenever
                      a phenotype is skipped
        log_prefix    string placed at the start of each log message
        model_names   names of models to load; when None the names come
                      from get_model_names(dbpath)

    Returns:
        dict mapping model names to representations
    """

    if model_names is None:
        model_names = get_model_names(dbpath)
    relevant = set(model_names)

    # every requested model gets a representation, even without data rows
    result = {name: Representation(name=name) for name in model_names}

    phen_priors = get_phenotype_priors(dbpath)
    rows = DBGenerator(ModelPhenotypeTable(dbpath))
    for row in rows.next():
        model = row["id"]
        phenotype = obo.canonical(row["phenotype"])
        # rows that belong to models that were not requested are ignored
        if model not in relevant:
            continue
        # a term that is known but not valid is swapped for its replacement
        if obo.has(phenotype) and not obo.valid(phenotype):
            phenotype = obo.replaced_by(phenotype)
        # NOTE(review): the nesting of this check could not be recovered
        # from the collapsed source; it is applied at loop level - confirm
        if phenotype is None:
            if log is not None:
                skipped = "Skipping phenotype " + row["phenotype"]
                skipped += " in model " + model
                log(log_prefix + " - " + skipped)
            continue
        result[model] = add_data_to_model(result[model], phenotype,
                                          row["value"],
                                          row["TPR"], row["FPR"],
                                          phen_priors)
    return result
def __init__(self, config, references=None, models=None, log=None, run_msg="Packet "):
    """A runnable class for processing a set of references and models

    :param config: object of class PhenoscoringConfig; its stamp
        attribute is stored as the timestamp for calculations
    :param references: iterable with reference names; None (the default)
        is treated as an empty collection
    :param models: iterable with model names; None (the default)
        is treated as an empty collection
    :param log: a logging function
    :param run_msg: a template for a status message
    """

    self.config = config
    self.stamp = config.stamp
    self.log = log
    self.run_msg = run_msg
    # priors and reference sets are not computed at construction time
    self.phen_priors = None
    self.ref_priors = None
    self.general_refset = None
    self.specific_refset = None

    # setup calculation with references and stub models
    # (the None defaults must not be iterated - that raises TypeError)
    self.references = set() if references is None else set(references)
    self.models = dict()
    if models is not None:
        for model_name in models:
            self.models[model_name] = Representation(name=model_name)
def test_general_representation_get(self):
    """values come from manual input first, then from the defaults."""

    rep = Representation(dict(xyz=0.2))
    for key, value in (("bob", 0.4), ("xyz", 0.3)):
        rep.set(key, value)
    # defaults are applied after the manual values here
    rep.defaults(self.defaults)

    expectations = [
        ("bob", 0.4, "value should come from manual input"),
        ("abc", 0.1, "value should come from defaults dict"),
        ("xyz", 0.3, "value should come from manual override"),
    ]
    for key, value, note in expectations:
        self.assertEqual(rep.get(key), value, note)
def test_specific_term_FP_increases(self):
    """a specific model should favor the disease with the closer phenotype."""

    specific = Representation(name="specific").set("Y:008", 0.8)
    scores = self.rs.inference(specific)
    score_d3, score_d1 = scores["DISEASE:3"], scores["DISEASE:1"]
    self.assertGreater(score_d3, score_d1,
                       "D3 has Y2, which is close to Y1")
def test_refset2_inference(self):
    """the specific refset should score lower than the general one."""

    custom = Representation(name="custom").set("Y:001", 0.8)
    general_scores = self.rs.inference(custom)
    specific_scores = self.rs2.inference(custom)
    # Y:001 is shared by DISEASE:1 and DISEASE:2, so specific inf down
    disease = "DISEASE:1"
    self.assertLess(specific_scores[disease], general_scores[disease])
def test_model_rootonly(self):
    """a vague model with only Y:004 should not separate two diseases."""

    vague = Representation(name="vague").set("Y:004", 0.8)
    scores = self.rs.inference(vague)
    score_d1, score_d2 = scores["DISEASE:1"], scores["DISEASE:2"]
    self.assertAlmostEqual(score_d1, score_d2,
                           "most diseases about the same")
def test_FP7_with_null(self):
    """model with Y:007 (low prior) should strictly lose against null."""

    model = Representation(name="model").set("Y:007", 0.9)
    chain = self.rs.inference_chain(model, "null", verbose=True)
    chain.evaluate()
    posterior, prior = chain.posterior, chain.prior
    self.assertLess(posterior, prior, "must be strictly lower")
def test_impute_fromseeds_lowfirst(self):
    """impute from explicit seeds, starting with the lower-valued ancestor."""

    rep = Representation(dict())
    ## the values are inconsistent: DOID:4 is higher in the tree, so it
    ## cannot have a lower value than DOID:0014667
    rep.set("DOID:0014667", 0.4).set("DOID:4", 0.1)
    seed_order = ["DOID:4", "DOID:0014667"]
    rep.impute(self.obo, self.obodef, seeds=seed_order)

    for child in ("DOID:0080015", "DOID:655"):
        self.assertEqual(rep.get(child), 0.1, "child of DOID:4")
def test_general_representation_get2(self):
    """manual values still win when the defaults are applied first."""

    # same checks as for manual-then-defaults, but in the opposite order
    rep = Representation(dict(abc=0.1, xyz=0.2))
    rep.defaults(self.defaults)
    for key, value in (("bob", 0.4), ("xyz", 0.3)):
        rep.set(key, value)

    expectations = [
        ("bob", 0.4, "value should come from manual input"),
        ("abc", 0.1, "value should come from defaults dict"),
        ("xyz", 0.3, "value should come from manual override"),
    ]
    for key, value, note in expectations:
        self.assertEqual(rep.get(key), value, note)
def test_TN_strong_model(self):
    """a true negative raises the score, more so for a lower model value"""

    # two models that match a negative ref phenotype, one more strongly
    weak_tn = Representation(name="model").set("Y:002", 0.01)
    strong_tn = Representation(name="model").set("Y:002", 0.001)

    # inference chains that give TNs
    chains = []
    for model in (weak_tn, strong_tn):
        chain = self.rs.inference_chain(model, "refA", verbose=True)
        chain.evaluate_inference()
        chains.append(chain)
    chain_weak, chain_strong = chains

    # both comparisons should lead to higher scores
    for chain in chains:
        self.assertGreater(chain.posterior, chain.prior)
    # the second model carries the more extreme negative value,
    # so its comparison should end higher
    self.assertGreater(chain_strong.posterior, chain_weak.posterior)