Exemple #1
0
    def test_average(self):
        """Summarize phenotypes using an average (consistent values).

        Five data over three phenotypes are collapsed by average() into one
        datum per phenotype; value, tpr, and fpr of each collapsed datum are
        compared with hand-computed means.
        """

        # first add several pieces of evidence into an entity object
        m = Entity("abc", "genes")
        d1 = PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.05))
        d2 = PhenotypeDatum("MP:007", Experiment(0, 0.4, 0.05))
        d3 = PhenotypeDatum("MP:002", Experiment(1, 0.6, 0.05))
        d4 = PhenotypeDatum("MP:007", Experiment(0, 0.4, 0.15))
        d5 = PhenotypeDatum("MP:009", Experiment(1, 0.6, 0.05))
        m.add(d1).add(d2).add(d3)
        m.add(d4).add(d5)
        self.assertEqual(len(m.data), 5)

        # check that the average contains all phenotypes, once each
        m.average()
        self.assertEqual(len(m.data), 3)
        expected_tpr = {"MP:002": 0.7, "MP:007": 0.4, "MP:009": 0.6}
        expected_fpr = {"MP:002": 0.05, "MP:007": 0.1, "MP:009": 0.05}
        expected_val = {"MP:002": 1, "MP:007": 0, "MP:009": 1}
        self.assertEqual({d.phenotype for d in m.data}, set(expected_val))
        for datum in m.data:
            iphen = datum.phenotype
            iexp = datum.experiment
            self.assertEqual(iexp.value, expected_val[iphen])
            # tpr/fpr are computed floating-point means, so exact equality
            # would depend on rounding; compare approximately instead
            self.assertAlmostEqual(iexp.tpr, expected_tpr[iphen])
            self.assertAlmostEqual(iexp.fpr, expected_fpr[iphen])
Exemple #2
0
    def test_consensus_imputed(self):
        """Summarize multiple rows of phenotypes using a consensus, with imputed values.

        Two data for the same phenotype carry values strictly between 0 and 1;
        consensus() is expected to collapse them into a single datum whose
        value, tpr, and fpr match the expectations listed below.
        """

        # first add several pieces of evidence into an entity object
        m = Entity("abc", "genes")
        d1 = PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.05))
        # here add with value between 0 and 1
        d2 = PhenotypeDatum("MP:007", Experiment(0.6, 0.4, 0.05))
        d3 = PhenotypeDatum("MP:007", Experiment(0.4, 0.6, 0.05))
        m.add(d1).add(d2).add(d3)
        self.assertEqual(len(m.data), 3)

        # check that the consensus matches the inputs
        m.consensus()
        self.assertEqual(len(m.data), 2)
        expected_tpr = {"MP:002": 0.8, "MP:007": 0.5}
        expected_fpr = {"MP:002": 0.05, "MP:007": 0.05}
        expected_val = {"MP:002": 1, "MP:007": 0.5}
        # each phenotype must be present exactly once after the collapse
        self.assertEqual({d.phenotype for d in m.data}, set(expected_val))
        for datum in m.data:
            iphen = datum.phenotype
            iexp = datum.experiment
            # all three quantities are computed floats here (the value is
            # fractional too), so compare approximately rather than exactly
            self.assertAlmostEqual(iexp.value, expected_val[iphen])
            self.assertAlmostEqual(iexp.tpr, expected_tpr[iphen])
            self.assertAlmostEqual(iexp.fpr, expected_fpr[iphen])
 def test_compare_by_phenotype(self):
     """Data with identical fields are equal; a different phenotype breaks equality.

     The experiment (e1) and the timestamp ("2018") are held fixed, so the
     phenotype identifier is the only field that varies.
     """
     same_a = PhenotypeDatum("MP:1", e1, "2018")
     same_b = PhenotypeDatum("MP:1", e1, "2018")
     other = PhenotypeDatum("MP:2", e1, "2018")
     self.assertEqual(same_a, same_b)
     self.assertNotEqual(same_a, other)
Exemple #4
0
    def test_equivalent_same_phenotypes(self):
        """Entities holding the same phenotypes are equivalent, in either direction."""

        first = Entity("A", "X")
        second = Entity("A", "X")
        # the same two phenotypes are added to each entity, in opposite order
        for phen in ("MP:002", "MP:005"):
            first.add(PhenotypeDatum(phen, Experiment(1, 0.8, 0.05)))
        for phen in ("MP:005", "MP:002"):
            second.add(PhenotypeDatum(phen, Experiment(1, 0.8, 0.05)))
        self.assertTrue(first.equivalent(second))
        self.assertTrue(second.equivalent(first))
Exemple #5
0
    def test_trim_easy_keep(self):
        """Trimming leaves a node in place when it is passed in the keep set."""

        entity = Entity("abc", "genes")
        for phen in ("DOID:4", "DOID:11044"):
            entity.add(PhenotypeDatum(phen, Experiment(1, 0.8, 0.05)))
        self.assertEqual(len(entity.data), 2)

        # DOID:4 is named explicitly, so both data must survive the trim
        keep = {"DOID:4"}
        entity.trim_ancestors(self.obo, keep)
        self.assertEqual(len(entity.data), 2)
Exemple #6
0
    def test_trim_nothing(self):
        """Trimming leaves the data untouched when no datum needs removing."""

        entity = Entity("abc", "genes")
        for phen in ("DOID:3650", "DOID:11044"):
            entity.add(PhenotypeDatum(phen, Experiment(1, 0.8, 0.05)))
        self.assertEqual(len(entity.data), 2)

        # the number of data must be the same before and after the trim
        entity.trim_ancestors(self.obo)
        self.assertEqual(len(entity.data), 2)
Exemple #7
0
    def test_equivalent_different_phenotypes(self):
        """Entities with the same number of phenotypes but different terms are not equivalent."""

        first = Entity("A", "X")
        second = Entity("A", "X")
        # both entities get two phenotypes; MP:005 is shared, the other differs
        for phen in ("MP:001", "MP:005"):
            first.add(PhenotypeDatum(phen, Experiment(1, 0.8, 0.05)))
        for phen in ("MP:005", "MP:002"):
            second.add(PhenotypeDatum(phen, Experiment(1, 0.8, 0.05)))
        self.assertFalse(first.equivalent(second))
        self.assertFalse(second.equivalent(first))
Exemple #8
0
    def test_trim_easy(self):
        """Trimming eliminates the root node and keeps the more specific term."""

        entity = Entity("abc", "genes")
        for phen in ("DOID:4", "DOID:11044"):
            entity.add(PhenotypeDatum(phen, Experiment(1, 0.8, 0.05)))
        self.assertEqual(len(entity.data), 2)

        # only DOID:11044 is expected to remain after the trim
        entity.trim_ancestors(self.obo)
        self.assertEqual(len(entity.data), 1)
        survivor = entity.data[0]
        self.assertEqual(survivor.phenotype, "DOID:11044")
Exemple #9
0
    def test_trim_medium(self):
        """Trimming works when there are several leaves: only two terms remain."""

        entity = Entity("abc", "genes")
        for phen in ("DOID:4", "DOID:11044", "DOID:0080015", "DOID:655"):
            entity.add(PhenotypeDatum(phen, Experiment(1, 0.8, 0.05)))
        self.assertEqual(len(entity.data), 4)

        # four data go in, and exactly these two phenotypes should remain
        entity.trim_ancestors(self.obo)
        self.assertEqual(len(entity.data), 2)
        remaining = {datum.phenotype for datum in entity.data}
        self.assertEqual(remaining, {"DOID:11044", "DOID:655"})
Exemple #10
0
    def test_add_phenotype_data(self):
        """Added phenotype data are stored in order and show up in their string form."""

        entity = Entity("abc", "genes", marker_id="X:001", marker_symbol="x001")
        self.assertEqual(len(entity.data), 0, "initial model has no pheontypes")

        entity.add(PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.0555)))
        entity.add(PhenotypeDatum("MP:007", Experiment(1, 0.456, 0.0234)))
        self.assertEqual(len(entity.data), 2, "just added two phenotypes")

        # the string form of each datum should expose fragments of the
        # phenotype id and of the numeric values it was built from
        first_str, second_str = str(entity.data[0]), str(entity.data[1])
        self.assertIn("002", first_str)
        self.assertIn("555", first_str)
        self.assertIn("234", second_str)
Exemple #11
0
    def test_equivalent_phenotypes(self):
        """An entity with a phenotype is not equivalent to one without any."""

        with_data = Entity("A", "X")
        without_data = Entity("A", "X")
        with_data.add(PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.05)))
        # the relation must fail in both directions
        self.assertFalse(with_data.equivalent(without_data))
        self.assertFalse(without_data.equivalent(with_data))
    def test_str(self):
        """The repr of a datum mentions its phenotype and its experiment."""

        summary = repr(PhenotypeDatum("MP:001", e1))
        # all three fragments must appear in the repr text
        for fragment in ("MP:001", "Experiment", "0.8"):
            self.assertIn(fragment, summary)
    def test_default_timestamp(self):
        """A datum created without an explicit timestamp gets one close to the current time."""

        now = datetime.now()
        datum = PhenotypeDatum("MP:1", Experiment(1, 0.7, 0.1))
        stamp = datetime.strptime(datum.timestamp, timestamp_format)
        # compare the absolute gap: with a signed difference, a timestamp
        # arbitrarily far in the future would yield a negative number and
        # pass the "less than 60" check
        diff = abs((now - stamp).total_seconds())
        self.assertLess(diff, 60, "stamps should be within a few seconds")
    def test_init(self):
        """A freshly built datum exposes its phenotype and the experiment fields."""

        experiment = Experiment(1, 0.7, 0.1)
        datum = PhenotypeDatum("MP:1", experiment)

        self.assertEqual(datum.phenotype, "MP:1")
        # value, tpr, and fpr are readable directly on the datum
        self.assertEqual(datum.value, 1)
        self.assertEqual(datum.tpr, 0.7)
        self.assertEqual(datum.fpr, 0.1)
        # no timestamp was given, but one must be set anyway
        self.assertIsNotNone(datum.timestamp)
Exemple #15
0
    def test_average_2(self):
        """Summarize phenotypes using an average (discordant values)."""

        # three pieces of evidence for one phenotype; the middle one disagrees
        entity = Entity("abc", "genes")
        for value, tpr in ((1, 0.8), (0, 0.4), (1, 0.6)):
            entity.add(PhenotypeDatum("MP:002", Experiment(value, tpr, 0.05)))
        self.assertEqual(len(entity.data), 3)

        # averaging collapses the evidence into a single datum
        entity.average()
        self.assertEqual(len(entity.data), 1)
        collapsed = entity.data[0]
        self.assertEqual(collapsed.phenotype, "MP:002")
        averaged = collapsed.experiment
        self.assertGreater(averaged.value, 0)
        # the discordant datum enters the expected mean as 0.0, not as its
        # own tpr of 0.4
        expected_tpr = (0.8 + 0.0 + 0.6) / 3
        self.assertAlmostEqual(averaged.tpr, expected_tpr)
        self.assertAlmostEqual(averaged.fpr, 0.05)
Exemple #16
0
    def control1(self, refname, entity):
        """Build a control model from sibling phenotypes of an entity's data.

        Arguments:
            refname    name of the reference; the new model is named
                       refname + "_siblings" and described as its control
            entity     object whose .data items provide phenotypes and
                       experiments

        Returns:
            the result of trim_ancestors() on the new model
        """

        siblings = tech_model(refname + "_siblings", "siblings")
        siblings.set_description("control_for", refname)
        stamp = self.timestamp
        for original in entity.data:
            # swap each phenotype for a sibling, but reuse the experiment
            sibling_phen = self._pick_sibling(original.phenotype)
            sibling_datum = PhenotypeDatum(sibling_phen, original.experiment, stamp)
            siblings.add(sibling_datum)
        return siblings.trim_ancestors(self.obo)
Exemple #17
0
    def control0(self, refname, refrep):
        """Create a model with the same phenotypes as a representation.

        Arguments:
            refname    name of the reference; the new model is named
                       refname + "_match" and described as its control
            refrep     representation object; only its keys() (phenotype
                       ids) are used, the stored values are ignored

        Returns:
            the result of trim_ancestors() on the new model
        """

        model = tech_model(refname + "_match", "match")
        model.set_description("control_for", refname)
        timestamp = self.timestamp
        for phen in refrep.keys():
            # every phenotype is recorded as a positive hit with the
            # tpr/fpr configured on this object
            phen_exp = Experiment(1, self.tpr, self.fpr)
            model.add(PhenotypeDatum(phen, phen_exp, timestamp))
        return model.trim_ancestors(self.obo)
Exemple #18
0
    def test_consensus_2(self):
        """Summarize multiple rows of phenotypes using a consensus with some discordance.

        Three data for one phenotype are added, two with value 1 and one with
        value 0; consensus() is expected to collapse them into a single
        positive datum with a reduced tpr.
        """

        # first add several pieces of evidence into an entity object
        m = Entity("abc", "genes")
        d1 = PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.05))
        d2 = PhenotypeDatum("MP:002", Experiment(0, 0.4, 0.05))
        d3 = PhenotypeDatum("MP:002", Experiment(1, 0.6, 0.05))
        m.add(d1).add(d2).add(d3)
        self.assertEqual(len(m.data), 3)

        # check that the consensus matches the inputs
        m.consensus()
        self.assertEqual(len(m.data), 1)
        self.assertEqual(m.data[0].phenotype, "MP:002")
        iexp = m.data[0].experiment
        self.assertEqual(iexp.value, 1)
        # the tpr will be lower than (0.6+0.8)/2
        # it should be (0.7*2/3); this is a computed float, so compare
        # approximately rather than relying on identical rounding
        self.assertAlmostEqual(iexp.tpr, 0.7 * (2 / 3))
        self.assertAlmostEqual(iexp.fpr, 0.05)
Exemple #19
0
def get_gxd(gxd_path, emp_map, tprfpr):
    """Read a file with marker-EMAPA associations and build expression entities.

    Arguments:
        gxd_path   path to a tab-separated file; the columns read here are
                   "feature.primaryIdentifier", "structure.identifier",
                   and "strength"
        emp_map    dict mapping EMAPA ids to iterables of phenotype ids
        tprfpr     2-tuple with (tpr, fpr)

    Returns:
        dict mapping marker ids to Entity objects, each one already
        collapsed with consensus()
    """

    tpr, fpr = tprfpr[0], tprfpr[1]

    # collect one entity per marker from the raw file
    entities = dict()
    with open_file(gxd_path, "rt") as handle:
        for record in csv.DictReader(handle, delimiter="\t", quotechar="'"):
            marker = record["feature.primaryIdentifier"]
            structure = record["structure.identifier"]
            strength = record["strength"]

            # every marker seen gets an entity, even when none of its rows
            # pass the filters below
            if marker not in entities:
                entity = Entity("GXD_" + marker, "expression", marker_id=marker)
                entities[marker] = entity
                entity.set_description("expression", 1)
                entity.set_description("source", "GXD")

            # skip rows with an unmapped structure or an unknown strength
            if structure not in emp_map or strength not in gxd_strength:
                continue

            # the tpr is interpolated between fpr and tpr by the strength
            # factor; "Absent" rows are recorded as negative phenotypes
            scaled_tpr = fpr + (tpr - fpr) * gxd_strength[strength]
            row_exp = Experiment(1, scaled_tpr, fpr)
            if strength == "Absent":
                row_exp.value = 0
            target = entities[marker]
            for mp in emp_map[structure]:
                target.add(PhenotypeDatum(mp, row_exp))

    # collapse repeated phenotypes into a consensus value
    for entity in entities.values():
        entity.consensus()

    return entities
Exemple #20
0
def prep_IMPC(datapath, tprfpr, pthreshold, simplify="average", obo=None):
    """Parse IMPC statistical results and assemble a set of models.

    Args:
        datapath:    path to the raw comma-separated data file; when None,
                     an empty dict is returned
        tprfpr:      sequence whose first two elements are (tpr, fpr)
        pthreshold:  float, passed to get_value() to decide the value
                     (positive or negative) of each phenotype
        simplify:    string, method for collapsing repeated data in a model;
                     'consensus' and 'average' call the model method of the
                     same name, any other value leaves the models as they are
        obo:         ontology object, passed to get_parameter_phenotype_map()

    Returns:
        dict mapping model ids of the form
        "IMPC_<marker or allele>_<zygosity>_<suffix>" to model objects
    """

    models = dict()
    if datapath is None:
        return models

    stamp = now_timestamp()
    base_tpr, base_fpr = tprfpr[0], tprfpr[1]
    male_codes = {"M", "B", "U"}
    female_codes = {"F", "B", "U"}

    def create_models(core_id, category, zygosity, row):
        """Make sure the six variants (sex F/M/U, with or without 'A') exist."""

        for suffix in ("F", "FA", "M", "MA", "U", "UA"):
            model_id = "IMPC_" + core_id + "_" + zygosity + "_" + suffix
            if model_id in models:
                continue
            # register the model first, then attach its descriptions
            model = models[model_id] = impc_model(model_id, category,
                                                  row, zygosity)
            model.set_description("sex", sex_code(suffix))
            model.set_description("neg_phenotypes", negative_code(suffix))

    def add_to_model(datum, core_id, zygosity, suffix):
        """Add a datum into an existing model definition.

        Arguments:
            datum      phenotype and experiment result
            core_id, zygosity, suffix
                       characterization of the model
        """
        models["IMPC_" + core_id + "_" + zygosity + "_" + suffix].add(datum)

    def add_set_to_models(datum, row, val, sex):
        """Record a datum in the marker-level and allele-level models.

        Arguments:
            datum     phenotype and experiment result
            row       dict
            val       value of phenotype (0/1)
            sex       one-letter code
        """
        zygosity = row["zygosity"][:3]
        if zygosity == "hem":
            zygosity = "hom"
        marker = row["marker_accession_id"]
        allele = row["allele_accession_id"]
        # create the model definitions if they do not exist yet
        create_models(marker, "marker", zygosity, row)
        create_models(allele, "allele", zygosity, row)
        # the "<sex>A" models receive every datum; the plain "<sex>" models
        # only receive data with val == 1
        suffixes = (sex, sex + "A") if val == 1 else (sex + "A",)
        for suffix in suffixes:
            add_to_model(datum, marker, zygosity, suffix)
            add_to_model(datum, allele, zygosity, suffix)

    # map from parameter to mp terms - used for rows without an MP id
    parameter_phenotype_map = get_parameter_phenotype_map(datapath, obo)

    with open_file(datapath, "rt") as handle:
        for row in csv.DictReader(handle, delimiter=",", quotechar="\""):
            # skip over bad data rows
            if (row["status"] not in ("Success", "Successful")
                    or row["allele_symbol"] == ""):
                continue

            # get a phenotype MP id, possibly redefined by the
            # (phenotype, parameter) pair or filled in from the parameter
            # (this handles morphology MP:0002169 annotations)
            phenotype = row["mp_term_id"].strip()
            parameter = row["parameter_name"].strip()
            redef_key = phenotype + " " + parameter
            if redef_key in redef:
                phenotype = redef[redef_key]
            if phenotype == "" and parameter in parameter_phenotype_map:
                phenotype = parameter_phenotype_map[parameter]
            if phenotype in ("", "MP:0002169"):
                continue

            sex = sex_code(row["phenotype_sex"])

            # identify whether this is a positive or a negative phenotype
            value = get_value(row, pthreshold)

            # add data at marker level, allele level, by sex; the "U"
            # models always receive the datum
            datum = PhenotypeDatum(phenotype,
                                   Experiment(value, base_tpr, base_fpr),
                                   stamp)
            add_set_to_models(datum, row, value, "U")
            for code, accepted in (("M", male_codes), ("F", female_codes)):
                if sex in accepted:
                    add_set_to_models(datum, row, value, code)

    # some models may have redundant rows (e.g. a phenotype recorded twice)
    # so collapse them here with the requested method
    if simplify == "consensus":
        for model in models.values():
            model.consensus()
    elif simplify == "average":
        for model in models.values():
            model.average()
    return models
 def test_order_by_timestamp(self):
     """With phenotype and experiment fixed, the earlier timestamp sorts first."""
     older = PhenotypeDatum("MP:1", e1, "2016")
     newer = PhenotypeDatum("MP:1", e1, "2018")
     self.assertLess(older, newer)
     self.assertGreater(newer, older)
 def test_order_by_value(self):
     """With phenotype and timestamp fixed, the experiment decides the order.

     The datum built on e2 is expected to sort below the one built on e1.
     """
     lower = PhenotypeDatum("MP:1", e2, "2018")
     higher = PhenotypeDatum("MP:1", e1, "2018")
     self.assertLess(lower, higher)
     self.assertGreater(higher, lower)
 def test_compare_by_timestamp(self):
     """Data that differ only in their timestamp are not equal."""
     stamped_2018 = PhenotypeDatum("MP:1", e1, "2018")
     stamped_2017 = PhenotypeDatum("MP:1", e1, "2017")
     # phenotype and experiment match, so the timestamp alone must break equality
     are_equal = stamped_2018 == stamped_2017
     self.assertFalse(are_equal)