Exemple #1
0
class BaselineStruct(BaseArgumentMixin):
    """Baseline scoring links and propositions with two separate classifiers.

    One ``SAGAClassifier`` is trained on link instances and another on
    proposition instances.  Their decision scores are regrouped per document
    and can then be decoded either by independent rounding or by constrained
    inference.

    ``_inference`` and ``_round`` are not defined here; presumably they are
    supplied by ``BaseArgumentMixin`` -- confirm against that class.
    """

    def __init__(self, alpha_link, alpha_prop, l1_ratio, exact_test=False):
        """Store the hyper-parameters.

        Parameters
        ----------
        alpha_link : overall regularization strength of the link classifier.
        alpha_prop : overall regularization strength of the proposition
            classifier.
        l1_ratio : share of each strength passed to the classifier as
            ``beta``; the remaining ``1 - l1_ratio`` share is passed as
            ``alpha``.
        exact_test : forwarded as ``exact`` to ``_inference`` when decoding
            with constraints.
        """
        self.alpha_link = alpha_link
        self.alpha_prop = alpha_prop
        self.l1_ratio = l1_ratio
        # Always off for this model; presumably read by the mixin -- verify.
        self.compat_features = False
        self.exact_test = exact_test

    def initialize_labels(self, y_props_flat, y_links_flat):
        """Fit one label encoder per task and record the number of classes."""
        prop_encoder = LabelEncoder().fit(y_props_flat)
        self.prop_encoder_ = prop_encoder
        link_encoder = LabelEncoder().fit(y_links_flat)
        self.link_encoder_ = link_encoder

        self.n_prop_states = len(prop_encoder.classes_)
        self.n_link_states = len(link_encoder.classes_)

    def fit(self, X_link, y_link, X_prop, y_prop):
        """Train the link and proposition classifiers and return ``self``.

        The link classifier is fitted with ``'balanced'`` sample weights; the
        proposition classifier is fitted without sample weights.
        """
        self.initialize_labels(y_prop, y_link)
        link_targets = self.link_encoder_.transform(y_link)
        prop_targets = self.prop_encoder_.transform(y_prop)

        # Both classifiers share these settings; only alpha/beta differ.
        solver_settings = dict(loss='smooth_hinge',
                               penalty='l1',
                               tol=1e-4,
                               max_iter=500,
                               random_state=0,
                               verbose=0)
        self.link_clf_ = SAGAClassifier(**solver_settings)
        self.prop_clf_ = clone(self.link_clf_)

        link_params = {
            'alpha': self.alpha_link * (1 - self.l1_ratio),
            'beta': self.alpha_link * self.l1_ratio,
        }
        link_weights = compute_sample_weight('balanced', link_targets)
        self.link_clf_.set_params(**link_params)
        self.link_clf_.fit(X_link, link_targets, sample_weight=link_weights)

        prop_params = {
            'alpha': self.alpha_prop * (1 - self.l1_ratio),
            'beta': self.alpha_prop * self.l1_ratio,
        }
        self.prop_clf_.set_params(**prop_params)
        self.prop_clf_.fit(X_prop, prop_targets)
        return self

    def decision_function(self, X_link, X_prop, docs):
        """Score all instances and regroup the scores document by document.

        The rows of ``X_link`` / ``X_prop`` are cut into consecutive runs of
        ``len(doc.features)`` / ``len(doc.prop_features)`` rows, one run per
        entry of ``docs``.

        Returns
        -------
        list with one ``DocLabel(prop_scores, link_scores)`` per document.
        """

        def per_document(rows, stops):
            # Run i spans [stops[i - 1], stops[i]), with the first at 0.
            begins = np.append(0, stops)
            return [rows[lo:hi] for lo, hi in zip(begins, stops)]

        link_stops = np.cumsum([len(doc.features) for doc in docs])
        link_scores = self.link_clf_.decision_function(X_link)

        # Spread the link score over one column per link class: only the
        # column encoding ``True`` gets the score, every other stays zero.
        n_rows = len(link_scores)
        n_cols = len(self.link_encoder_.classes_)
        link_table = np.zeros((n_rows, n_cols))
        (on_column,) = self.link_encoder_.transform([True])
        link_table[:, on_column] = link_scores.ravel()
        link_runs = per_document(link_table, link_stops)

        prop_stops = np.cumsum([len(doc.prop_features) for doc in docs])
        prop_scores = self.prop_clf_.decision_function(X_prop)
        prop_runs = per_document(prop_scores, prop_stops)

        Y_pred = [
            DocLabel(prop_run, link_run)
            for link_run, prop_run in zip(link_runs, prop_runs)
        ]

        assert len(Y_pred) == len(docs)

        return Y_pred

    def fast_decode(self, Y_marg, docs, constraints):
        """Turn per-document scores into predictions.

        With falsy ``constraints`` every document is decoded by
        ``self._round``.  Otherwise ``self._inference`` is run per document
        on the scores combined with an all-zero compatibility array and
        three empty lists.
        """
        if not constraints:
            return [
                self._round(scores.nodes, scores.links, inverse_transform=True)
                for scores in Y_marg
            ]

        # A single zero array is shared by every document.
        compat_shape = (self.n_prop_states,
                        self.n_prop_states,
                        self.n_link_states)
        no_compat = np.zeros(compat_shape)

        decoded_docs = []
        for doc, scores in zip(docs, Y_marg):
            potentials = (scores.nodes, scores.links, no_compat, [], [], [])
            decoded, _ = self._inference(doc,
                                         potentials,
                                         relaxed=False,
                                         exact=self.exact_test,
                                         constraints=constraints)
            decoded_docs.append(decoded)
        return decoded_docs

    def predict(self, X_link, X_prop, docs, constraints=""):
        """Score the instances, then decode them with ``fast_decode``."""
        return self.fast_decode(
            self.decision_function(X_link, X_prop, docs), docs, constraints)
class BaselineStruct(BaseArgumentMixin):
    """Independent link and proposition classifiers with per-document output.

    Two ``SAGAClassifier`` models are trained, one per task, and their
    decision scores are handed back grouped by document.
    """

    def __init__(self, alpha_link, alpha_prop, l1_ratio):
        """Record the regularization settings.

        ``alpha_link`` and ``alpha_prop`` are the overall strengths for the
        two classifiers.  ``l1_ratio`` decides how each strength is divided
        between the classifier's ``beta`` (the ``l1_ratio`` share) and
        ``alpha`` (the remainder).
        """
        self.alpha_link = alpha_link
        self.alpha_prop = alpha_prop
        self.l1_ratio = l1_ratio
        # Fixed to False here; presumably consumed by the mixin -- verify.
        self.compat_features = False

    def initialize_labels(self, y_props_flat, y_links_flat):
        """Fit the two label encoders and store how many classes each saw."""
        self.prop_encoder_ = LabelEncoder().fit(y_props_flat)
        self.link_encoder_ = LabelEncoder().fit(y_links_flat)

        prop_classes = self.prop_encoder_.classes_
        link_classes = self.link_encoder_.classes_
        self.n_prop_states = len(prop_classes)
        self.n_link_states = len(link_classes)

    def fit(self, X_link, y_link, X_prop, y_prop):
        """Fit both classifiers on encoded labels and return ``self``.

        Only the link classifier receives ``'balanced'`` sample weights.
        """
        self.initialize_labels(y_prop, y_link)
        encoded_links = self.link_encoder_.transform(y_link)
        encoded_props = self.prop_encoder_.transform(y_prop)

        base_clf = SAGAClassifier(loss='smooth_hinge',
                                  penalty='l1',
                                  tol=1e-4,
                                  max_iter=500,
                                  random_state=0,
                                  verbose=0)
        self.link_clf_ = base_clf
        # The proposition model starts as an unfitted copy of the link model.
        self.prop_clf_ = clone(base_clf)

        rest_ratio = 1 - self.l1_ratio

        link_alpha = self.alpha_link * rest_ratio
        link_beta = self.alpha_link * self.l1_ratio
        balanced = compute_sample_weight('balanced', encoded_links)
        self.link_clf_.set_params(alpha=link_alpha, beta=link_beta)
        self.link_clf_.fit(X_link, encoded_links, sample_weight=balanced)

        prop_alpha = self.alpha_prop * rest_ratio
        prop_beta = self.alpha_prop * self.l1_ratio
        self.prop_clf_.set_params(alpha=prop_alpha, beta=prop_beta)
        self.prop_clf_.fit(X_prop, encoded_props)
        return self

    def decision_function(self, X_link, X_prop, docs):
        """Return one ``DocLabel(prop_scores, link_scores)`` per document.

        Document ``i`` owns the next ``len(docs[i].features)`` rows of the
        link scores and the next ``len(docs[i].prop_features)`` rows of the
        proposition scores.
        """
        link_stops = np.cumsum([len(doc.features) for doc in docs])
        raw_link = self.link_clf_.decision_function(X_link)

        # One column per link class; the raw score goes in the column that
        # encodes ``True`` and the other columns are left at zero.
        link_matrix = np.zeros(
            (len(raw_link), len(self.link_encoder_.classes_)))
        [true_column] = self.link_encoder_.transform([True])
        link_matrix[:, true_column] = raw_link.ravel()

        link_starts = np.append(0, link_stops)
        Y_link = [
            link_matrix[first:last]
            for first, last in zip(link_starts, link_stops)
        ]

        prop_stops = np.cumsum([len(doc.prop_features) for doc in docs])
        prop_matrix = self.prop_clf_.decision_function(X_prop)
        prop_starts = np.append(0, prop_stops)
        Y_prop = [
            prop_matrix[first:last]
            for first, last in zip(prop_starts, prop_stops)
        ]

        Y_pred = [
            DocLabel(prop_part, link_part)
            for link_part, prop_part in zip(Y_link, Y_prop)
        ]

        assert len(Y_pred) == len(docs)

        return Y_pred