import logging
import math
from collections import defaultdict
from typing import List

# Ontology and AssociationSet are assumed to come from ontobio here
from ontobio.assocmodel import AssociationSet
from ontobio.ontol import Ontology

logger = logging.getLogger(__name__)


def set_ic_annot_freq(ontology: Ontology, annotations: AssociationSet):
    """Set information content (IC) values based on annotation frequency.

    Relies on the helpers set_all_depths_in_subgraph and
    _set_tot_annots_in_subgraph, defined elsewhere in this module.
    """
    logger.info(
        "Setting information content values based on annotation frequency")
    for node_id in ontology.nodes():
        node_prop = ontology.node(node_id)
        if "rel_annot_genes" in node_prop:
            del node_prop["rel_annot_genes"]
        if "tot_annot_genes" in node_prop:
            del node_prop["tot_annot_genes"]
        if "IC" in node_prop:
            del node_prop["IC"]
    for root_id in ontology.get_roots():
        if "depth" not in ontology.node(root_id) and (
                "type" not in ontology.node(root_id)
                or ontology.node_type(root_id) == "CLASS"):
            set_all_depths_in_subgraph(ontology=ontology, root_id=root_id)
    node_gene_map = defaultdict(set)
    for subj, obj in annotations.associations_by_subj_obj.keys():
        node_gene_map[obj].add(subj)
    for node_id in ontology.nodes():
        node_pr = ontology.node(node_id)
        node_pr["rel_annot_genes"] = node_gene_map[node_id]
    for root_id in ontology.get_roots():
        _set_tot_annots_in_subgraph(ontology, root_id)
    for node_prop in ontology.nodes().values():
        if "tot_annot_genes" not in node_prop:
            node_prop["tot_annot_genes"] = set()
    # total number of distinct annotated genes across the ontology
    tot_annots = len(
        {gene for set_genes in node_gene_map.values() for gene in set_genes})
    # smallest non-zero annotation count; default to 1 so that min() over an
    # empty sequence does not raise ValueError
    min_annots = min(
        (len(node["tot_annot_genes"]) for node in ontology.nodes().values()
         if "tot_annot_genes" in node and len(node["tot_annot_genes"]) > 0),
        default=1)
    for node_prop in ontology.nodes().values():
        # IC = -log(p), where p is the fraction of annotated genes covered by
        # the node; unannotated nodes get a smoothed minimum-frequency IC
        node_prop["IC"] = (
            -math.log(len(node_prop["tot_annot_genes"]) / tot_annots)
            if len(node_prop["tot_annot_genes"]) > 0
            else -math.log(min_annots / (tot_annots + 1)))
    logger.info("Finished setting information content values")
def set_all_depths(ontology: Ontology,
                   relations: List[str] = None,
                   comparison_func=max):
    """Set the "depth" property on all nodes, walking down from each CLASS
    root; nodes not reachable from any root are given depth 0."""
    for root_id in ontology.get_roots():
        if "type" not in ontology.node(root_id) or ontology.node_type(
                root_id) == "CLASS":
            set_all_depths_in_subgraph(ontology=ontology,
                                       root_id=root_id,
                                       relations=relations,
                                       comparison_func=comparison_func)
    for node_id, node_content in ontology.nodes().items():
        if "depth" not in node_content:
            node_content["depth"] = 0
import logging
import re
from collections import defaultdict

import networkx as nx
import numpy as np
import pandas as pd
from networkx import strongly_connected_components
from scipy.special import logit

from ontobio.ontol import Ontology, Synonym

# default_wsmap (the default word-normalization map) and LABEL_OR_EXACT (the
# combined label/exact-synonym scope constant) are assumed to be defined
# elsewhere in this module.


class LexicalMapEngine():
    """
    generates lexical matches between pairs of ontology classes
    """

    SCORE = 'score'
    LEXSCORE = 'lexscore'
    SIMSCORES = 'simscores'
    CONDITIONAL_PR = 'cpr'

    def __init__(self, wsmap=default_wsmap(), config=None):
        """
        Arguments
        ---------
        wsmap: dict
            maps words to normalized synonyms.
        config: dict
            A configuration conforming to LexicalMapConfigSchema
        """
        # maps label or syn value to Synonym object
        self.lmap = {}
        # maps node id to synonym objects
        self.smap = {}
        self.wsmap = wsmap
        self.npattern = re.compile(r'[\W_]+')
        self.exclude_obsolete = True
        self.ontology_pairs = None
        self.id_to_ontology_map = defaultdict(list)
        self.merged_ontology = Ontology()
        self.config = config if config is not None else {}
        self.stats = {}

    def index_ontologies(self, onts):
        logging.info('Indexing: {}'.format(onts))
        for ont in onts:
            self.index_ontology(ont)

    def index_ontology(self, ont):
        """
        Adds an ontology to the index

        This iterates through all labels and synonyms in the ontology, creating an index
        """
        self.merged_ontology.merge([ont])
        syns = ont.all_synonyms(include_label=True)

        include_id = self._is_meaningful_ids()
        logging.info("Include IDs as synonyms: {}".format(include_id))
        if include_id:
            for n in ont.nodes():
                v = n
                # Get fragment
                if v.startswith('http'):
                    v = re.sub('.*/', '', v)
                    v = re.sub('.*#', '', v)
                syns.append(Synonym(n, val=v, pred='label'))

        logging.info("Indexing {} syns in {}".format(len(syns), ont))
        logging.info("Distinct lexical values: {}".format(len(
            self.lmap.keys())))
        for syn in syns:
            self.index_synonym(syn, ont)
        for nid in ont.nodes():
            self.id_to_ontology_map[nid].append(ont)

    def label(self, nid):
        return self.merged_ontology.label(nid)

    def index_synonym(self, syn, ont):
        """
        Index a synonym

        Typically not called from outside this object; called by `index_ontology`
        """
        if not syn.val:
            if syn.pred == 'label':
                if not self._is_meaningful_ids():
                    if not ont.is_obsolete(syn.class_id):
                        pass
                        #logging.error('Use meaningful ids if label not present: {}'.format(syn))
            else:
                logging.warning("Incomplete syn: {}".format(syn))
            return
        if self.exclude_obsolete and ont.is_obsolete(syn.class_id):
            return

        syn.ontology = ont
        prefix, _ = ont.prefix_fragment(syn.class_id)

        v = syn.val

        caps_match = re.match('[A-Z]+', v)
        if caps_match:
            # if at least a third of the length is leading caps, assume abbreviation
            if caps_match.span()[1] >= len(v) / 3:
                syn.is_abbreviation(True)

        # chebi 'synonyms' are often not real synonyms
        # https://github.com/ebi-chebi/ChEBI/issues/3294
        if not re.match('.*[a-zA-Z]', v):
            if prefix != 'CHEBI':
                logging.warning('Ignoring suspicious synonym: {}'.format(syn))
            return

        v = self._standardize_label(v)

        # TODO: do this once ahead of time
        wsmap = dict(self.wsmap)
        for ss in self._get_config_val(prefix, 'synsets', []):
            # TODO: weights
            wsmap[ss['synonym']] = ss['word']
        nv = self._normalize_label(v, wsmap)

        self._index_synonym_val(syn, v)
        nweight = self._get_config_val(prefix, 'normalized_form_confidence',
                                       0.8)
        if nweight > 0 and not syn.is_abbreviation():
            if nv != v:
                nsyn = Synonym(syn.class_id,
                               val=syn.val,
                               pred=syn.pred,
                               lextype=syn.lextype,
                               ontology=ont,
                               confidence=syn.confidence * nweight)
                self._index_synonym_val(nsyn, nv)

    def _index_synonym_val(self, syn, v):
        lmap = self.lmap
        smap = self.smap
        cid = syn.class_id
        if v not in lmap:
            lmap[v] = []
        lmap[v].append(syn)
        if cid not in smap:
            smap[cid] = []
        smap[cid].append(syn)

    def _standardize_label(self, v):
        # Add spaces separating camelcased strings
        v = re.sub('([a-z])([A-Z])', r'\1 \2', v)

        # always use lowercase when comparing
        # we may want to make this configurable in future
        v = v.lower()
        return v

    def _normalize_label(self, s, wsmap):
        """
        normalized form of a synonym
        """
        toks = []
        for tok in list(set(self.npattern.sub(' ', s).split(' '))):
            if tok in wsmap:
                tok = wsmap[tok]
            if tok != "":
                toks.append(tok)
        toks.sort()
        return " ".join(toks)

    def _get_config_val(self, prefix, k, default=None):
        v = None
        for oc in self.config.get('ontology_configurations', []):
            if prefix == oc.get('prefix', ''):
                v = oc.get(k, None)
        if v is None:
            v = self.config.get(k, None)
        if v is None:
            v = default
        return v
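
    # Hedged sketch of the config shape read above (keys inferred from usage
    # in this class; values are illustrative):
    # {'meaningful_ids': False,
    #  'normalized_form_confidence': 0.8,
    #  'ontology_configurations': [
    #      {'prefix': 'CHEBI',
    #       'normalized_form_confidence': 0.5,
    #       'synsets': [{'word': 'hemoglobin', 'synonym': 'haemoglobin'}]}]}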

    def _is_meaningful_ids(self):
        return self.config.get('meaningful_ids', False)

    def find_equiv_sets(self):
        return self.lmap

    def get_xref_graph(self):
        """

        Generate mappings based on lexical properties and return as nx graph.

        Algorithm
        ~~~~~~~~~

        - A dictionary mapping each lexical value to its :class:`Synonym` objects
          is built by :meth:`index_synonym`. Note that Synonyms include the
          primary label.

        - Each key in the dictionary is examined to determine whether it holds
          Synonyms from two different ontology classes.

        This avoids N^2 pairwise comparisons: the time taken is linear in the
        number of distinct lexical values.

        After the initial mapping is made, additional scoring is performed on
        each mapping.

        Edge properties
        ~~~~~~~~~~~~~~~
        The return object is a nx graph, connecting pairs of ontology classes.

        Edges are annotated with metadata about how the match was found:

        syns: pair
            pair of `Synonym` objects, corresponding to the synonyms for the two nodes
        score: int
            score indicating strength of mapping, between 0 and 100

        Returns
        -------
        Graph
            nx graph (bidirectional)
        """

        # initial graph; all matches
        g = nx.MultiDiGraph()

        # lmap collects all syns by token
        items = self.lmap.items()
        logging.info("collecting initial xref graph, items={}".format(
            len(items)))
        i = 0
        sum_nsyns = 0
        n_skipped = 0
        has_self_comparison = False
        if self.ontology_pairs:
            for (o1id, o2id) in self.ontology_pairs:
                if o1id == o2id:
                    has_self_comparison = True

        for (v, syns) in items:
            sum_nsyns += len(syns)
            i += 1
            if i % 1000 == 1:
                logging.info(
                    '{}/{}  lexical items avgSyns={}, skipped={}'.format(
                        i, len(items), sum_nsyns / len(items), n_skipped))
            if len(syns) < 2:
                n_skipped += 1
                continue
            if len(syns) > 10:
                logging.info('Syns for {} = {}'.format(v, len(syns)))
            for s1 in syns:
                s1oid = s1.ontology.id
                s1cid = s1.class_id
                for s2 in syns:
                    # optimization: although this check is redundant with
                    # _is_comparable, it avoids inefficient additional calls
                    if s1oid == s2.ontology.id and not has_self_comparison:
                        continue
                    if s1cid != s2.class_id:
                        if self._is_comparable(s1, s2):
                            g.add_edge(s1.class_id, s2.class_id, syns=(s1, s2))

        logging.info("getting best supporting synonym pair for each match")
        # graph of best matches
        xg = nx.Graph()
        for i in g.nodes():
            for j in g.neighbors(i):
                best = 0
                bestm = None
                for m in g.get_edge_data(i, j).values():
                    (s1, s2) = m['syns']
                    score = self._combine_syns(s1, s2)
                    if score > best:
                        best = score
                        bestm = m
                syns = bestm['syns']
                xg.add_edge(i,
                            j,
                            score=best,
                            lexscore=best,
                            syns=syns,
                            idpair=(i, j))

        self.score_xrefs_by_semsim(xg)
        self.assign_best_matches(xg)
        if self.merged_ontology.xref_graph is not None:
            self.compare_to_xrefs(xg, self.merged_ontology.xref_graph)
        else:
            logging.error("No xref graph for merged ontology")
        logging.info("finished xref graph")
        return xg
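
    # Usage sketch (a sketch, not part of this class; 'hp'/'mp' are
    # illustrative ontology handles loaded with ontobio's OntologyFactory):
    #   from ontobio.ontol_factory import OntologyFactory
    #   factory = OntologyFactory()
    #   lexmap = LexicalMapEngine()
    #   lexmap.index_ontologies([factory.create('hp'), factory.create('mp')])
    #   xg = lexmap.get_xref_graph()
    #   df = lexmap.as_dataframe(xg)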

    # true if syns s1 and s2 should be compared.
    #  - if ontology_pairs is set, then only consider (s1,s2) if their respective source ontologies are in the list of pairs
    #  - otherwise compare all classes, but only in one direction
    def _is_comparable(self, s1, s2):
        if s1.class_id == s2.class_id:
            return False
        if self.ontology_pairs is not None:
            #logging.debug('TEST: {}{} in {}'.format(s1.ontology.id, s2.ontology.id, self.ontology_pairs))
            return (s1.ontology.id, s2.ontology.id) in self.ontology_pairs
        else:
            return s1.class_id < s2.class_id

    def _blanket(self, nid):
        # union of ancestors and descendants across all ontologies containing nid
        nodes = set()
        for ont in self.id_to_ontology_map[nid]:
            nodes.update(ont.ancestors(nid))
            nodes.update(ont.descendants(nid))
        return list(nodes)

    def score_xrefs_by_semsim(self, xg, ont=None):
        """
        Given an xref graph (see ref:`get_xref_graph`), this will adjust scores based on
        the semantic similarity of matches.
        """
        logging.info(
            "scoring xrefs by semantic similarity for {} nodes in {}".format(
                len(xg.nodes()), ont))
        for (i, j, d) in xg.edges(data=True):
            pfx1 = self._id_to_ontology(i)
            pfx2 = self._id_to_ontology(j)
            ancs1 = self._blanket(i)
            ancs2 = self._blanket(j)
            s1, _, _ = self._sim(xg, ancs1, ancs2, pfx1, pfx2)
            s2, _, _ = self._sim(xg, ancs2, ancs1, pfx2, pfx1)
            s = 1 - ((1 - s1) * (1 - s2))
            logging.debug("Score {} x {} = {} x {} = {} // {}".format(
                i, j, s1, s2, s, d))
            xg[i][j][self.SIMSCORES] = (s1, s2)
            xg[i][j][self.SCORE] *= s

    def _sim(self, xg, ancs1, ancs2, pfx1, pfx2):
        """
        Compare two lineages
        """
        xancs1 = set()
        for a in ancs1:
            if a in xg:
                # TODO: restrict this to neighbors in single ontology
                for n in xg.neighbors(a):
                    pfx = self._id_to_ontology(n)
                    if pfx == pfx2:
                        xancs1.add(n)
        logging.debug('SIM={}/{} ## {} // {}'.format(
            len(xancs1.intersection(ancs2)), len(xancs1),
            xancs1.intersection(ancs2), xancs1))
        n_shared = len(xancs1.intersection(ancs2))
        n_total = len(xancs1)
        return (1 + n_shared) / (1 + n_total), n_shared, n_total
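
    # Worked example (hypothetical counts): if 3 of the 4 ancestors of i with
    # xrefs into j's ontology fall inside j's blanket, this direction scores
    # (1 + 3) / (1 + 4) = 0.8; score_xrefs_by_semsim then combines the two
    # directions as a noisy-OR: s = 1 - (1 - 0.8) * (1 - 0.5) = 0.9.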

    # given an ontology class id,
    # return map keyed by ontology id, value is a list of (score, ext_class_id) pairs
    def _neighborscores_by_ontology(self, xg, nid):
        xrefmap = defaultdict(list)
        for x in xg.neighbors(nid):
            score = xg[nid][x][self.SCORE]
            for ont in self.id_to_ontology_map[x]:
                xrefmap[ont.id].append((score, x))
        return xrefmap

    # normalize direction
    def _dirn(self, edge, i, j):
        if edge['idpair'] == (i, j):
            return 'fwd'
        elif edge['idpair'] == (j, i):
            return 'rev'
        else:
            return None

    def _id_to_ontology(self, id):
        return self.merged_ontology.prefix(id)
        #onts = self.id_to_ontology_map[id]
        #if len(onts) > 1:
        #    logging.warning(">1 ontology for {}".format(id))

    def compare_to_xrefs(self, xg1, xg2):
        """
        Compares a base xref graph with another one, tagging each edge in xg1
        as novel and/or consistent with respect to the edges in xg2
        """
        ont = self.merged_ontology
        for (i, j, d) in xg1.edges(data=True):
            ont_left = self._id_to_ontology(i)
            ont_right = self._id_to_ontology(j)
            unique_lr = True
            num_xrefs_left = 0
            same_left = False
            if i in xg2:
                for j2 in xg2.neighbors(i):
                    ont_right2 = self._id_to_ontology(j2)
                    if ont_right2 == ont_right:
                        unique_lr = False
                        num_xrefs_left += 1
                        if j2 == j:
                            same_left = True
            unique_rl = True
            num_xrefs_right = 0
            same_right = False
            if j in xg2:
                for i2 in xg2.neighbors(j):
                    ont_left2 = self._id_to_ontology(i2)
                    if ont_left2 == ont_left:
                        unique_rl = False
                        num_xrefs_right += 1
                        if i2 == i:
                            same_right = True

            (x, y) = d['idpair']
            xg1[x][y]['left_novel'] = num_xrefs_left == 0
            xg1[x][y]['right_novel'] = num_xrefs_right == 0
            xg1[x][y]['left_consistent'] = same_left
            xg1[x][y]['right_consistent'] = same_right

    def assign_best_matches(self, xg):
        """
        For each node in the xref graph, tag best match edges
        """
        logging.info("assigning best matches for {} nodes".format(
            len(xg.nodes())))
        for i in xg.nodes():
            xrefmap = self._neighborscores_by_ontology(xg, i)
            for (ontid, score_node_pairs) in xrefmap.items():
                score_node_pairs.sort(reverse=True)
                (best_score, best_node) = score_node_pairs[0]
                logging.info("BEST for {}: {} in {} from {}".format(
                    i, best_node, ontid, score_node_pairs))
                edge = xg[i][best_node]
                dirn = self._dirn(edge, i, best_node)
                best_kwd = 'best_' + dirn
                # unambiguous best: sole candidate, or strictly higher score
                # than the runner-up
                if (len(score_node_pairs) == 1
                        or score_node_pairs[0] > score_node_pairs[1]):
                    edge[best_kwd] = 2
                else:
                    edge[best_kwd] = 1
                for (score, j) in score_node_pairs:
                    edge_ij = xg[i][j]
                    dirn_ij = self._dirn(edge_ij, i, j)
                    edge_ij['cpr_' + dirn_ij] = score / sum(
                        [s for s, _ in score_node_pairs])
        for (i, j, edge) in xg.edges(data=True):
            # reciprocal score is set if (A) i is best for j, and (B) j is best for i
            rs = 0
            if 'best_fwd' in edge and 'best_rev' in edge:
                rs = edge['best_fwd'] * edge['best_rev']
            edge['reciprocal_score'] = rs
            edge['cpr'] = edge['cpr_fwd'] * edge['cpr_rev']
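
    # Worked example (hypothetical scores): if node i has candidate matches
    # scored (80, j) and (20, k) within one ontology, the conditional
    # probability cpr for (i, j) in that direction is 80 / (80 + 20) = 0.8;
    # the final 'cpr' above multiplies the forward and reverse values.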

    def _best_match_syn(self, sx, sys, scope_map):
        """
        The best match is determined by the highest magnitude weight
        """
        SUBSTRING_WEIGHT = 0.2
        WBEST = None
        sbest = None
        sxv = self._standardize_label(sx.val)
        sxp = self._id_to_ontology(sx.class_id)
        for sy in sys:
            syv = self._standardize_label(sy.val)
            syp = self._id_to_ontology(sy.class_id)
            W = None
            if sxv == syv:
                confidence = sx.confidence * sy.confidence
                if sx.is_abbreviation() or sy.is_abbreviation():
                    confidence *= self._get_config_val(
                        sxp, 'abbreviation_confidence', 0.5)
                    confidence *= self._get_config_val(
                        syp, 'abbreviation_confidence', 0.5)
                W = scope_map[sx.scope()][sy.scope()] + logit(confidence / 2)
            elif sxv in syv:
                W = np.array((-SUBSTRING_WEIGHT, SUBSTRING_WEIGHT, 0, 0))
            elif syv in sxv:
                W = np.array((SUBSTRING_WEIGHT, -SUBSTRING_WEIGHT, 0, 0))
            if W is not None:
                # The best match is determined by the highest magnitude weight
                if WBEST is None or max(abs(W)) > max(abs(WBEST)):
                    WBEST = W
                    sbest = sy
        return WBEST, sbest

    def weighted_axioms(self, x, y, xg):
        """
        return a tuple (sub,sup,equiv,other) indicating estimated prior probabilities for an interpretation of a mapping
        between x and y.

        See kboom paper
        """
        # TODO: allow additional weighting
        # weights are log odds w=log(p/(1-p))
        # (Sub,Sup,Eq,Other)
        scope_pairs = [('label', 'label', 0.0, 0.0, 3.0, -0.8),
                       ('label', 'exact', 0.0, 0.0, 2.5, -0.5),
                       ('label', 'broad', -1.0, 1.0, 0.0, 0.0),
                       ('label', 'narrow', 1.0, -1.0, 0.0, 0.0),
                       ('label', 'related', 0.0, 0.0, 0.0, 0.0),
                       ('exact', 'exact', 0.0, 0.0, 2.5, -0.5),
                       ('exact', 'broad', -1.0, 1.0, 0.0, 0.0),
                       ('exact', 'narrow', 1.0, -1.0, 0.0, 0.0),
                       ('exact', 'related', 0.0, 0.0, 0.0, 0.0),
                       ('related', 'broad', -0.5, 0.5, 0.0, 0.0),
                       ('related', 'narrow', 0.5, -0.5, 0.0, 0.0),
                       ('related', 'related', 0.0, 0.0, 0.0, 0.0),
                       ('broad', 'broad', 0.0, 0.0, 0.0, 1.0),
                       ('broad', 'narrow', -0.5, 0.5, 0.0, 0.2),
                       ('narrow', 'narrow', 0.0, 0.0, 0.0, 0.0)]
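        # Since the weights are log odds, w = log(p / (1 - p)), the 3.0
        # equivalence weight for a label-label match corresponds to a prior of
        # p = 1 / (1 + e**-3) ≈ 0.95 before other evidence is folded in.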
        # populate symmetric lookup matrix
        scope_map = defaultdict(dict)
        for (l, r, w1, w2, w3, w4) in scope_pairs:
            l = l.upper()
            r = r.upper()
            scope_map[l][r] = np.array((w1, w2, w3, w4))
            scope_map[r][l] = np.array((w2, w1, w3, w4))

        # TODO: get prior based on ontology pair
        # cumulative sum of weights
        WS = None
        pfx1 = self._id_to_ontology(x)
        pfx2 = self._id_to_ontology(y)
        for mw in self.config.get('match_weights', []):
            mpfx1 = mw.get('prefix1', '')
            mpfx2 = mw.get('prefix2', '')
            X = np.array(mw['weights'])
            if mpfx1 == pfx1 and mpfx2 == pfx2:
                WS = X
            elif mpfx2 == pfx1 and mpfx1 == pfx2:
                WS = self._flipweights(X)
            elif mpfx1 == pfx1 and mpfx2 == '' and WS is None:
                WS = X
            elif mpfx2 == pfx1 and mpfx1 == '' and WS is None:
                WS = self._flipweights(X)

        if WS is None:
            WS = np.array((0.0, 0.0, 0.0, 0.0))
        # defaults
        WS += np.array(
            self.config.get('default_weights', [0.0, 0.0, 1.5, -0.1]))
        logging.info('WS defaults={}'.format(WS))

        for xw in self.config.get('xref_weights', []):
            left = xw.get('left', '')
            right = xw.get('right', '')
            X = np.array(xw['weights'])
            if x == left and y == right:
                WS += X
                logging.info('MATCH: {} for {}-{}'.format(X, x, y))
            elif y == left and x == right:
                WS += self._flipweights(X)
                logging.info('IMATCH: {}'.format(X))

        smap = self.smap
        # TODO: symmetrical
        WT = np.array((0.0, 0.0, 0.0, 0.0))
        WBESTMAX = np.array((0.0, 0.0, 0.0, 0.0))
        n = 0
        for sx in smap[x]:
            WBEST, _ = self._best_match_syn(sx, smap[y], scope_map)
            if WBEST is not None:
                WT += WBEST
                n += 1
                if max(abs(WBEST)) > max(abs(WBESTMAX)):
                    WBESTMAX = WBEST
        for sy in smap[y]:
            WBEST, _ = self._best_match_syn(sy, smap[x], scope_map)
            if WBEST is not None:
                WT += WBEST
                n += 1
        # average best match
        if n > 0:
            logging.info('Adding BESTMAX={}'.format(WBESTMAX))
            WS += WBESTMAX

        # TODO: xref, many to many
        WS += self._graph_weights(x, y, xg)
        # TODO: include additional defined weights, eg ORDO
        logging.info('Adding WS, gw={}'.format(WS))

        # semantic similarity: low simscores shift weight towards 'other'
        (ss1, ss2) = xg[x][y][self.SIMSCORES]
        WS[3] += ((1 - ss1) + (1 - ss2)) / 2

        # reciprocal best hits are higher confidence of equiv
        rs = xg[x][y]['reciprocal_score']
        if rs == 4:
            WS[2] += 0.5
        if rs == 0:
            WS[2] -= 0.2

        # sigmoid, equivalent to scipy.special.expit(WS)
        P = 1 / (1 + np.exp(-WS))
        logging.info('Final WS={}, init P={}'.format(WS, P))
        # probs should sum to 1.0
        P = P / np.sum(P)
        return P
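
    # Usage sketch (hypothetical ids): once an xref graph has been built,
    #   P = lexmap.weighted_axioms('X:1', 'Y:2', xg)
    # returns the four priors (sub, sup, equiv, other), normalized to sum to 1.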

    def _graph_weights(self, x, y, xg):
        ont = self.merged_ontology
        xancs = ont.ancestors(x)
        yancs = ont.ancestors(y)
        pfx = self._id_to_ontology(x)
        pfy = self._id_to_ontology(y)
        # neighbors of y (other than x) drawn from x's ontology, and vice versa
        xns = [
            n for n in xg.neighbors(y)
            if n != x and pfx == self._id_to_ontology(n)
        ]
        yns = [
            n for n in xg.neighbors(x)
            if n != y and pfy == self._id_to_ontology(n)
        ]
        pweight = 1.0
        W = np.array((0.0, 0.0, 0.0, 0.0))
        card = '11'
        if len(xns) > 0:
            card = 'm1'
            for x2 in xns:
                if x2 in xancs:
                    W[0] += pweight
                if x in ont.ancestors(x2):
                    W[1] += pweight
        if len(yns) > 0:
            if card == '11':
                card = '1m'
            else:
                card = 'mm'
            for y2 in yns:
                if y2 in yancs:
                    W[1] += pweight
                if y in ont.ancestors(y2):
                    W[0] += pweight

        logging.debug('CARD: {}/{} <-> {}/{} = {} // X={} Y={} // W={}'.format(
            x, pfx, y, pfy, card, xns, yns, W))
        invcard = card
        if card == '1m':
            invcard = 'm1'
        elif card == 'm1':
            invcard = '1m'

        CW = None
        DEFAULT_CW = None
        for cw in self.config.get('cardinality_weights', []):
            if 'prefix1' not in cw and 'prefix2' not in cw:
                if card == cw['cardinality']:
                    DEFAULT_CW = np.array(cw['weights'])
                if invcard == cw['cardinality']:
                    DEFAULT_CW = self._flipweights(np.array(cw['weights']))
            if 'prefix1' in cw and 'prefix2' in cw:
                if (pfx == cw['prefix1'] and pfy == cw['prefix2']
                        and card == cw['cardinality']):
                    CW = np.array(cw['weights'])

                if (pfx == cw['prefix2'] and pfy == cw['prefix1']
                        and invcard == cw['cardinality']):
                    CW = self._flipweights(np.array(cw['weights']))

        if CW is None:
            if DEFAULT_CW is not None:
                CW = DEFAULT_CW
            else:
                if card == '11':
                    CW = np.array((0.0, 0.0, 1.0, 0.0))
                elif card == '1m':
                    CW = np.array((0.6, 0.4, 0.0, 0.0))
                elif card == 'm1':
                    CW = np.array((0.4, 0.6, 0.0, 0.0))
                elif card == 'mm':
                    CW = np.array((0.2, 0.2, 0.0, 0.5))
        return W + CW

    def _flipweights(self, W):
        return np.array((W[1], W[0], W[2], W[3]))

    def grouped_mappings(self, id):
        """
        return all mappings for a node, grouped by ID prefix
        """
        g = self.get_xref_graph()
        m = {}
        for n in g.neighbors(id):
            # maxsplit=1 so ids containing extra colons do not raise ValueError
            [prefix, local] = n.split(':', 1)
            if prefix not in m:
                m[prefix] = []
            m[prefix].append(n)
        return m

    def unmapped_nodes(self, xg, rs_threshold=0):
        unmapped_set = set()
        for nid in self.merged_ontology.nodes():
            if nid in xg:
                for (j, edge) in xg[nid].items():
                    rs = edge.get('reciprocal_score', 0)
                    if rs < rs_threshold:
                        unmapped_set.add(nid)
            else:
                unmapped_set.add(nid)
        return unmapped_set

    def unmapped_dataframe(self, xg, **args):
        unodes = self.unmapped_nodes(xg, **args)
        ont = self.merged_ontology
        eg = ont.equiv_graph()
        items = []
        for n in unodes:
            mapped_equivs = ''
            if n in eg:
                equivs = set(eg.neighbors(n))
                mapped_equivs = list(equivs - unodes)
            items.append(
                dict(id=n, label=ont.label(n), mapped_equivs=mapped_equivs))
        df = pd.DataFrame(items, columns=['id', 'label', 'mapped_equivs'])
        df = df.sort_values(["id"])
        return df

    # scores a pairwise combination of synonyms. This will be a mix of
    #  * individual confidence in the synonyms themselves
    #  * confidence of equivalence based on scopes
    # TODO: unify this with probabilistic calculation
    def _combine_syns(self, s1, s2):
        cpred = self._combine_preds(s1.pred, s2.pred)
        s = self._pred_score(cpred)
        s *= s1.confidence * s2.confidence
        if s1.is_abbreviation() or s2.is_abbreviation():
            s *= self._get_config_val(self._id_to_ontology(s1.class_id),
                                      'abbreviation_confidence', 0.5)
            s *= self._get_config_val(self._id_to_ontology(s2.class_id),
                                      'abbreviation_confidence', 0.5)
        logging.debug("COMBINED: {} + {} = {}/{}".format(s1, s2, cpred, s))
        return round(s)

    def _rollup(self, p):
        if p == 'label':
            return LABEL_OR_EXACT
        if p == 'hasExactSynonym':
            return LABEL_OR_EXACT
        return p

    def _combine_preds(self, p1, p2):
        if p1 == p2:
            return p1
        if self._rollup(p1) == self._rollup(p2):
            return self._rollup(p1)
        return p1 + p2

    ## TODO: allow this to be weighted by ontology
    def _pred_score(self, p):
        if p == 'label':
            return 100
        if p == LABEL_OR_EXACT:
            return 90
        if p == 'hasExactSynonym':
            return 90
        return 50

    def _in_clique(self, x, cliques):
        for s in cliques:
            if x in s:
                return s
        return set()

    def as_dataframe(self, xg):
        cliques = self.cliques(xg)
        ont = self.merged_ontology
        items = []
        for (x, y, d) in xg.edges(data=True):
            # xg is a non-directional Graph object.
            # to get a deterministic ordering we use the idpair key
            (x, y) = d['idpair']
            (s1, s2) = d['syns']
            (ss1, ss2) = d['simscores']
            clique = self._in_clique(x, cliques)
            #ancs = nx.ancestors(g,x)
            left_label = ont.label(x)
            right_label = ont.label(y)
            if ont.is_obsolete(x) and not left_label.startswith('obsolete'):
                left_label = "obsolete " + left_label
            if ont.is_obsolete(y) and not right_label.startswith('obsolete'):
                right_label = "obsolete " + right_label

            P = self.weighted_axioms(x, y, xg)
            item = {
                'left': x,
                'left_label': left_label,
                'right': y,
                'right_label': right_label,
                'score': d['score'],
                'left_match_type': s1.pred,
                'right_match_type': s2.pred,
                'left_match_val': s1.val,
                'right_match_val': s2.val,
                'left_simscore': ss1,
                'right_simscore': ss2,
                'reciprocal_score': d.get('reciprocal_score', 0),
                'conditional_pr_equiv': d.get('cpr'),
                'pr_subClassOf': P[0],
                'pr_superClassOf': P[1],
                'pr_equivalentTo': P[2],
                'pr_other': P[3],
                'left_novel': d.get('left_novel'),
                'right_novel': d.get('right_novel'),
                'left_consistent': d.get('left_consistent'),
                'right_consistent': d.get('right_consistent'),
                'equiv_clique_size': len(clique)
            }

            items.append(item)

        ix = [
            'left', 'left_label', 'right', 'right_label', 'left_match_type',
            'right_match_type', 'left_match_val', 'right_match_val', 'score',
            'left_simscore', 'right_simscore', 'reciprocal_score',
            'conditional_pr_equiv', 'pr_subClassOf', 'pr_superClassOf',
            'pr_equivalentTo', 'pr_other', 'left_novel', 'right_novel',
            'left_consistent', 'right_consistent', 'equiv_clique_size'
        ]
        df = pd.DataFrame(items, columns=ix)
        df = df.sort_values(["left", "score", "right"])
        return df

    def cliques(self, xg):
        """
        Return all equivalence-set cliques, treating each edge in the xref
        graph as an equivalence axiom and each edge in the ontology as
        subClassOf

        Arguments
        ---------
        xg : Graph
            an xref graph

        Returns
        -------
        list of sets
        """
        g = nx.DiGraph()
        for (x, y) in self.merged_ontology.get_graph().edges():
            g.add_edge(x, y)
        for (x, y) in xg.edges():
            g.add_edge(x, y)
            g.add_edge(y, x)
        return list(strongly_connected_components(g))
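
    # Minimal sketch (hypothetical ids): making each xref edge bidirectional
    # turns every equivalence set into one strongly connected component, e.g.
    #   g = nx.DiGraph([('A', 'B'), ('B', 'A'), ('B', 'C'), ('C', 'B')])
    #   list(strongly_connected_components(g))  # -> [{'A', 'B', 'C'}]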