Esempio n. 1
0
    def feature_extraction(self, graph: Graph,
                           stype_score: Dict[int, Optional[float]]):
        """Compute a small feature list for every class node of ``graph``.

        For each class node two features are produced:

        * ``prob_data_nodes`` -- the sum of ``stype_score`` over the data
          nodes reached through the node's outgoing links; a score of
          ``None`` contributes 0.
        * ``minimum_merged_cost`` -- the smallest ``get_merged_cost`` between
          the node and any node yielded by ``iter_nodes_by_label`` for the
          node's own label.

        Returns a dict mapping class-node id to a list of
        ``(feature_name, value)`` pairs.
        """
        features_by_node = {}
        for class_node in graph.iter_class_nodes():
            # Accumulate the scores of the directly attached data nodes.
            score_total = 0
            for out_link in class_node.iter_outgoing_links():
                target = out_link.get_target_node()
                if target.is_data_node():
                    score_total = score_total + (stype_score[target.id] or 0)

            # Cost of merging this node with each node carrying the same label.
            merge_costs = (
                get_merged_cost(class_node, candidate, self.multival_predicate)
                for candidate in graph.iter_nodes_by_label(class_node.label))
            cheapest_merge = min(merge_costs)

            features_by_node[class_node.id] = [
                ('prob_data_nodes', score_total),
                ('minimum_merged_cost', cheapest_merge),
            ]
        return features_by_node
Esempio n. 2
0
    def compute_prob_scope2(
            self,
            sm_id: str,
            g: Graph,
            attr_id: int,
            link2label: Optional[Dict[int, bool]] = None) -> Optional[float]:
        """Score an attribute's mapping against the pairwise comparison constraint.

        The attribute's semantic type is ``(class label, link label)`` taken
        from the first incoming link of the data node ``attr_id``. The method
        looks for another class node with the same label that owns an
        attribute mapped through the same link label, then checks whether the
        cached comparison between the two attributes agrees with the relative
        order of their indices in ``self.prob_count_scope2``.

        Args:
            sm_id: key into ``self.cached_compared_cols`` selecting the
                comparison table to use.
            g: graph containing the attribute.
            attr_id: id of the data node to score.
            link2label: optional map from link id to a boolean; parts of the
                graph whose link is marked false are ignored.

        Returns:
            ``self.valid_threshold`` when the comparison agrees with the
            constraint, ``1 - self.valid_threshold`` when it disagrees or the
            comparison result is ``None``, and ``None`` when the constraint
            cannot be applied (unknown semantic type, root class node, no
            comparable attribute, ...).

        Raises:
            AssertionError: if the constraint for the semantic type does not
                have exactly two entries (more are not handled yet).
        """
        dnode = g.get_node_by_id(attr_id)
        dlink = dnode.get_first_incoming_link()
        class_node = dlink.get_source_node()
        stype = (class_node.label, dlink.label)
        if stype not in self.prob_count_scope2:
            return None
        stype_constraint = self.prob_count_scope2[stype]

        slink = class_node.get_first_incoming_link()
        if slink is None:
            # root nodes have no parent type to look up in the constraint
            return None

        dnode_parent_type = (slink.get_source_node().label, slink.label)
        if dnode_parent_type not in stype_constraint or (
                link2label is not None and not link2label[slink.id]):
            return None
        dnode_stype_idx = stype_constraint[dnode_parent_type]

        # Other class nodes in the graph that the attribute could be mapped to
        # (same class label). The constraint is expressed as a binary function,
        # so only class nodes that own another attribute mapped with the same
        # semantic type are kept below.
        snodes = [
            node for node in g.iter_nodes_by_label(stype[0])
            if node.id != dlink.source_id
        ]
        if not snodes:
            # no other source node, i.e. only one possible mapping
            return None

        tbl_comparison = self.cached_compared_cols[sm_id]
        another_dnodes = []
        another_dnodes_stype_idx = []
        for snode in snodes:
            # first attribute of this class node mapped by the same link label
            another_dnode = next(
                (link.get_target_node()
                 for link in snode.iter_outgoing_links()
                 if link.label == dlink.label), None)
            if another_dnode is None or (
                    dnode.label, another_dnode.label) not in tbl_comparison:
                continue

            parent_link = snode.get_first_incoming_link()
            if parent_link is None:
                # A root class node has no parent type, so it cannot take part
                # in the constraint; skip it instead of dereferencing None.
                continue

            parent_type = (parent_link.get_source_node().label,
                           parent_link.label)
            if parent_type not in stype_constraint:
                # parent type is not covered by the constraint: ignore
                continue
            if link2label is not None and link2label[parent_link.id] is not True:
                # the link is not marked true: ignore
                continue

            another_dnodes.append(another_dnode)
            another_dnodes_stype_idx.append(stype_constraint[parent_type])

        # do compare between attr and another_attrs
        if len(another_dnodes) + 1 > len(stype_constraint):
            self.logger.warning(
                "There is a model that have more attributes than the inferred constraint.. trace: %s -- %s",
                sm_id, stype)
            return None

        # Nothing to compare against, or another attribute already occupies
        # the same slot of the constraint.
        if not another_dnodes or dnode_stype_idx in another_dnodes_stype_idx:
            return None

        assert len(stype_constraint) == 2, "Doesn't handle > 2 attributes now..."

        # now we can compare with the (single) other attribute
        another_dnode = another_dnodes[0]
        another_dnode_stype_idx = another_dnodes_stype_idx[0]
        result = tbl_comparison[(dnode.label, another_dnode.label)]
        if result is None:
            # the constraint says the two should be comparable but they are
            # not, so the mapping gets a low probability
            return 1 - self.valid_threshold

        # A truthy result is read as "attr > another_attr"; the mapping is
        # plausible when the constraint indices are ordered the same way.
        order_matches = bool(result) == (
            dnode_stype_idx > another_dnode_stype_idx)
        if order_matches:
            return self.valid_threshold
        return 1 - self.valid_threshold