Code Example #1
 def test_save_and_load(self):
     L = np.array([[0, -1, 0], [0, 1, 0]])
     label_model = LabelModel(cardinality=2, verbose=False)
     label_model.fit(L, n_epochs=1)
     dir_path = tempfile.mkdtemp()
     save_path = os.path.join(dir_path, "label_model.pkl")  # join with os.path.join; plain "+" drops the path separator
     label_model.save(save_path)
     label_model.load(save_path)
     shutil.rmtree(dir_path)
Code Example #2
    def test_save_and_load(self):
        L = np.array([[0, -1, 0], [0, 1, 1]])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=1)
        original_preds = label_model.predict(L)

        dir_path = tempfile.mkdtemp()
        save_path = os.path.join(dir_path, "label_model.pkl")  # "+" alone would write outside dir_path
        label_model.save(save_path)

        label_model_new = LabelModel(cardinality=2, verbose=False)
        label_model_new.load(save_path)
        loaded_preds = label_model_new.predict(L)
        shutil.rmtree(dir_path)

        np.testing.assert_array_equal(loaded_preds, original_preds)
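The two tests above exercise the save()/load() pair; outside the unittest harness the same round trip looks roughly like this (a minimal sketch, assuming snorkel 0.9's LabelModel, whose load() mutates an existing instance rather than returning a new one):

import os
import shutil
import tempfile

import numpy as np
from snorkel.labeling.model import LabelModel

L = np.array([[0, -1, 0], [0, 1, 1]])  # 2 data points x 3 LFs; -1 means abstain
label_model = LabelModel(cardinality=2, verbose=False)
label_model.fit(L, n_epochs=1)

dir_path = tempfile.mkdtemp()
save_path = os.path.join(dir_path, "label_model.pkl")
label_model.save(save_path)

restored = LabelModel(cardinality=2, verbose=False)
restored.load(save_path)  # load() updates the instance in place
assert np.array_equal(restored.predict(L), label_model.predict(L))
shutil.rmtree(dir_path)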
Code Example #3
test_fired_idx = [i for i, item in enumerate(test_m) if sum(item) != 0]    # rows where at least one rule fired
test_unfired_idx = [i for i, item in enumerate(test_m) if sum(item) == 0]  # rows no rule covers
targets_test = test_L[test_fired_idx]

# Majority voting using Snorkel's majority-vote model
maj_preds_test = majority_model.predict(L=test_lsnork[test_fired_idx])
maj_precision_test, maj_recall_test, maj_f1_score_test, maj_support_test = precision_recall_fscore_support(targets_test, maj_preds_test)
maj_accuracy_test = compute_accuracy(maj_support_test, maj_recall_test)

print("precision on *** RULE COVERD TEST SET ***   of MAJORITY VOTING: {}".format(maj_precision_test))
print("recall on *** RULE COVERED TEST SET ***  of MAJORITY VOTING: {}".format(maj_recall_test))
print("f1_score on *** RULE COVERED TEST SET *** of MAJORITY VOTING: {}".format(maj_f1_score_test))
print("support on *** RULE COVERED TEST SET ***  of MAJORITY VOTING: {}".format(maj_support_test))
print("accuracy on *** RULE COVERED TEST SET ***   of MAJORITY VOTING: {}".format(maj_accuracy_test))


# Now train Snorkel's LabelModel
print("Training Snorkel's LabelModel")
label_model = LabelModel(cardinality=num_classes, verbose=True)
label_model.fit(L_train=U_lsnork, n_epochs=1000, lr=0.001, log_freq=100, seed=123)
label_model.save(os.path.join(path_dir,"saved_label_model"))



snork_preds_test = label_model.predict(L=test_lsnork[test_fired_idx])
snork_precision_test, snork_recall_test, snork_f1_score_test, snork_support_test = precision_recall_fscore_support(targets_test, snork_preds_test)
snork_accuracy_test = compute_accuracy(snork_support_test, snork_recall_test)
print("precision on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_precision_test))
print("recall on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_recall_test))
print("f1_score on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_f1_score_test))
print("support on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_support_test))
print("accuracy on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_accuracy_test))
Code Example #4
# Tail of a commented-out column listing; the opening of the original
# triple-quoted block is cut off in this excerpt:
#   'd_typ_inpo',  # injury or poisoning
#   'd_typ_elii',  # element, ion or isotope
#   ...
#   'bm25_relevant', 'bm25_score']

df_train = df

Y_train = df_train.bm25_relevant.values

lfs = [
    lf.has_type_diap_medd_or_bhvr, lf.is_doctor_reply, lf.has_votes,
    lf.enity_overlap_jacc, lf.same_author, lf.number_relations_total,
    lf.entity_types
]

applier = PandasLFApplier(lfs)

L_train = applier.apply(df=df_train)

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train,
                n_epochs=20000,
                lr=0.0001,
                log_freq=10,
                seed=2345)
label_model.save("trained_model_ehf.lbm")

print("Finished,")
Code Example #5
File: modeler.py  Project: rit-git/snorkel-notebooks
class Modeler:
    def __init__(self,
                 df_train,
                 df_dev,
                 df_valid,
                 df_test,
                 df_heldout,
                 lfs=None,  # avoid a shared mutable default; normalized below
                 label_model=None):
        df_train["seen"] = 0
        self.df_train = df_train.reset_index()
        self.df_dev = df_dev
        self.df_valid = df_valid
        self.df_test = df_test
        self.df_heldout = df_heldout
        #self.Y_train = df_train.label.values
        self.Y_dev = df_dev.label.values
        self.Y_valid = df_valid.label.values
        self.Y_test = df_test.label.values
        self.Y_heldout = df_heldout.label.values

        self.lfs = lfs if lfs is not None else {}

        self.L_train = None
        self.L_dev = None
        self.L_valid = None
        self.L_heldout = None
        cardinality = len(df_valid.label.unique())

        # for DEMOing purposes
        self.first_text_indices = [
            1262,  #"check out" "youtube"
            1892,  # I love
            1117,  # url concept
            1706,  # emoji concept
            952,  # "nice"
            971,  # positive concept
            958,  # actually use emoji concept
        ]

        self.count = 0

        if label_model is None:
            self.label_model = LabelModel(cardinality=cardinality,
                                          verbose=True)
        else:
            self.label_model = label_model

        self.vectorizer = CountVectorizer(ngram_range=(1, 2))
        self.vectorizer.fit(df_train.text.tolist())

    def get_lfs(self):
        return list(self.lfs.values())

    def add_lfs(self, new_lfs: dict):
        self.lfs.update(new_lfs)

    def remove_lfs(self, old_lf_ids: list):
        for lf_id in old_lf_ids:
            del self.lfs[lf_id]
        return len(self.lfs)

    def apply_lfs(self):
        applier = PandasLFApplier(lfs=self.get_lfs())
        self.L_train = applier.apply(df=self.df_train)
        self.L_dev = applier.apply(df=self.df_dev)
        self.L_heldout = applier.apply(df=self.df_heldout)
        #self.L_valid = applier.apply(df=self.df_valid)

    def find_duplicate_signature(self):
        label_matrix = np.vstack([self.L_train, self.L_dev])
        seen_signatures = {}
        dupes = {}
        lfs = self.get_lfs()
        signatures = [
            hash(label_matrix[:, i].tobytes())  # tobytes(): tostring() is deprecated in NumPy
            for i in range(len(lfs))
        ]
        for i, s in enumerate(signatures):
            lf = lfs[i]
            if s in seen_signatures:
                dupes[lf.name] = seen_signatures[s]
            else:
                seen_signatures[s] = lf.name
        return dupes

    def lf_examples(self, lf_id, n=5):
        lf = self.lfs[lf_id]
        applier = PandasLFApplier(lfs=[lf])
        L_train = applier.apply(df=self.df_train)
        labeled_examples = self.df_train[L_train != -1]
        samples = labeled_examples.sample(min(n, len(labeled_examples)),
                                          random_state=13)
        return [{"text": t} for t in samples["text"].values]

    def lf_mistakes(self, lf_id, n=5):
        lf = self.lfs[lf_id]
        applier = PandasLFApplier(lfs=[lf])
        L_dev = applier.apply(df=self.df_dev).squeeze()
        labeled_examples = self.df_dev[(L_dev != -1)
                                       & (L_dev != self.df_dev["label"])]
        samples = labeled_examples.sample(min(n, len(labeled_examples)),
                                          random_state=13)
        return [{"text": t} for t in samples["text"].values]

    def fit_label_model(self):
        assert self.L_train is not None

        self.label_model.fit(L_train=self.L_train,
                             n_epochs=1000,
                             lr=0.001,
                             log_freq=100,
                             seed=123)

    def analyze_lfs(self):
        if len(self.lfs) > 0:
            df = LFAnalysis(L=self.L_train, lfs=self.get_lfs()).lf_summary()
            dev_df = LFAnalysis(L=self.L_dev,
                                lfs=self.get_lfs()).lf_summary(Y=self.Y_dev)
            df = df.merge(dev_df,
                          how="outer",
                          suffixes=(" Training", " Dev."),
                          left_index=True,
                          right_index=True)
            df["Weight"] = self.label_model.get_weights()
            df["Duplicate"] = None
            for dupe, OG in self.find_duplicate_signature().items():
                print("Duplicate labeling signature detected")
                print(dupe, OG)
                df.at[dupe, "Duplicate"] = OG

            return df
        return None

    def get_label_model_stats(self):
        result = self.label_model.score(L=self.L_dev,
                                        Y=self.Y_dev,
                                        metrics=["f1", "precision", "recall"])

        probs_train = self.label_model.predict_proba(L=self.L_train)
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=self.df_train, y=probs_train, L=self.L_train)
        result["training_label_coverage"] = len(probs_train_filtered) / len(
            probs_train)
        result["class_0_ratio"] = (probs_train_filtered[:, 0] >
                                   0.5).sum() / len(probs_train_filtered)
        if len(probs_train_filtered) == 0:
            result["class_0_ratio"] = 0

        return result

    def get_heldout_stats(self):
        if self.L_heldout is not None:
            return self.label_model.score(
                L=self.L_heldout,
                Y=self.Y_heldout,
                metrics=["f1", "precision", "recall"])
        return {}

    def train(self):
        probs_train = self.label_model.predict_proba(L=self.L_train)

        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=self.df_train, y=probs_train, L=self.L_train)

        if len(df_train_filtered) == 0:
            print("Labeling functions cover none of the training examples!",
                  file=sys.stderr)
            return {"micro_f1": 0}

        #from tensorflow.keras.utils import to_categorical
        #df_train_filtered, probs_train_filtered = self.df_dev, to_categorical(self.df_dev["label"].values)

        vectorizer = self.vectorizer
        X_train = vectorizer.transform(df_train_filtered.text.tolist())

        X_dev = vectorizer.transform(self.df_dev.text.tolist())
        X_valid = vectorizer.transform(self.df_valid.text.tolist())
        X_test = vectorizer.transform(self.df_test.text.tolist())

        self.keras_model = get_keras_logreg(input_dim=X_train.shape[1])

        self.keras_model.fit(
            x=X_train,
            y=probs_train_filtered,
            validation_data=(X_valid, preds_to_probs(self.Y_valid, 2)),
            callbacks=[get_keras_early_stopping()],
            epochs=20,
            verbose=0,
        )

        preds_test = self.keras_model.predict(x=X_test).argmax(axis=1)

        #return preds_test
        return self.get_stats(self.Y_test, preds_test)

    def get_heldout_lr_stats(self):
        X_heldout = self.vectorizer.transform(self.df_heldout.text.tolist())
        preds_test = self.keras_model.predict(x=X_heldout).argmax(axis=1)
        return self.get_stats(self.Y_heldout, preds_test)

    def get_stats(self, Y_test, preds_test):
        label_classes = np.unique(Y_test)  # classes of the labels actually being scored, not always self.Y_test
        accuracy = metrics.accuracy_score(Y_test, preds_test)
        precision_0, precision_1 = metrics.precision_score(
            Y_test, preds_test, labels=label_classes, average=None)
        recall_0, recall_1 = metrics.recall_score(Y_test,
                                                  preds_test,
                                                  labels=label_classes,
                                                  average=None)
        test_f1 = metrics.f1_score(Y_test, preds_test, labels=label_classes,
                                   average="micro")  # match the "micro_f1" key returned below

        #recall_0, recall_1 = metrics.precision_recall_fscore_support(self.Y_test, preds_test, labels=label_classes)["recall"]
        return {
            "micro_f1": test_f1,
            "recall_0": recall_0,
            "precision_0": precision_0,
            "accuracy": accuracy,
            "recall_1": recall_1,
            "precision_1": precision_1
        }

    def entropy(self, prob_dist):
        # Skip zero-probability entries: log(0) is undefined.
        return -sum(x * log(x) for x in prob_dist if x > 0)

    def save(self, dir_name):
        self.label_model.save(os.path.join(dir_name, 'label_model.pkl'))
        with open(os.path.join(dir_name, 'model_lfs.pkl'), "wb+") as file:
            pickle.dump(self.lfs, file)

    def load(self, dir_name):
        with open(os.path.join(dir_name, 'model_lfs.pkl'), "rb") as file:
            lfs = pickle.load(file)
        # LabelModel.load is an instance method (see Code Examples #1 and #2),
        # so build a model first and load the saved state into it.
        label_model = LabelModel()
        label_model.load(os.path.join(dir_name, 'label_model.pkl'))
        self.lfs = lfs
        self.label_model = label_model
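For orientation, a hypothetical driver for this class showing the intended call order. The variable names below are placeholders, not part of modeler.py: each df_* is assumed to be a pandas DataFrame with "text" and "label" columns, and my_lfs a dict mapping ids to snorkel LabelingFunctions.

modeler = Modeler(df_train, df_dev, df_valid, df_test, df_heldout, lfs=my_lfs)
modeler.apply_lfs()                     # build L_train / L_dev / L_heldout
modeler.fit_label_model()               # fit snorkel's LabelModel on L_train
print(modeler.analyze_lfs())            # per-LF summary plus learned weights
print(modeler.get_label_model_stats())  # dev metrics + training label coverage
print(modeler.train())                  # train the downstream keras classifier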
Code Example #6
cui2vec = Cui2Vec().cui2vec

X_gold_sent, X_gold_shortest_path, X_gold_src, X_gold_tgt, X_gold_src_txt, X_gold_tgt_txt, y_gold = data_handler.get_test_data()

X_val_sent, X_val_shortest_path, X_val_src, X_val_tgt, X_val_src_txt, X_val_tgt_txt, y_val = data_handler.get_validation_data()

applier = PandasLFApplier(label_functions.lfs)

df_train = pd.DataFrame(list(zip(*data_handler.get_training_data())), columns=['shortest_path', 'sent', 'src', 'tgt', 'src_txt', 'tgt_txt'])

L_train = applier.apply(df_train)

label_model = LabelModel(cardinality=len(rel_names.rels_txt_to_int), verbose=True)
label_model.fit(L_train, n_epochs=1000, lr=0.01, log_freq=100, seed=123)

label_model.save('./models/LabelModel.model')

train_probs = label_model.predict_proba(L_train)
train_preds = probs_to_preds(train_probs, tie_break_policy='abstain')

df_train = df_train.join(pd.DataFrame({'preds': train_preds, 'probs': list(map(max, train_probs))}))

# -1 to otherwiseRelated
df_train.loc[df_train.preds == -1, 'preds'] = rel_names.rels_txt_to_int['otherwiseRelated']

# Downsample otherwiseRelated to roughly the mean per-class count
drop_num = len(df_train[df_train.preds == rel_names.rels_txt_to_int['otherwiseRelated']]) - int(df_train['preds'].value_counts().mean())
df_train = df_train.drop(df_train[df_train.preds == rel_names.rels_txt_to_int['otherwiseRelated']].sample(drop_num).index)

# Tally predictions per class (the excerpt is cut off mid-loop; counting is
# the natural completion given the empty cnts dict).
cnts = {}
for x in df_train['preds']:
    cnts[x] = cnts.get(x, 0) + 1
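The standard library gives the same tally in one line:

from collections import Counter

cnts = Counter(df_train['preds'])  # identical counts to the loop above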