Example #1
    def getTrainedModel1(self):

        # We build a matrix of LF votes for each comment ticket
        LF_matrix = self.make_Ls_matrix(self.LF_set['comments'], self.LFs)

        # Get true labels for LF set
        Y_LF_set = np.array(self.LF_set['resolution'])

        display(
            lf_summary(sparse.csr_matrix(LF_matrix),
                       Y=Y_LF_set,
                       lf_names=list(self.LF_names.values())))

        print("label coverage: " + label_coverage(LF_matrix))

        mv = MajorityLabelVoter()
        Y_train_majority_votes = mv.predict(LF_matrix)
        print("classification report:\n" +
              classification_report(Y_LF_set, Y_train_majority_votes))

        Ls_train = self.make_Ls_matrix(self.train, self.LFs)

        # You can tune the learning rate and class balance.
        model = LabelModel(k=2, seed=123)
        # train_model fits the label model in place (it does not return a trainer)
        model.train_model(Ls_train,
                          n_epochs=2000,
                          print_every=1000,
                          lr=0.0001,
                          class_balance=np.array([0.2, 0.8]))

        # Training labels come straight from the label model's predictions
        Y_train = model.predict(Ls_train)

        print('Trained Label Model Metrics:')
        # Evaluate on the hand-labeled LF set rather than on the model's own predictions
        scores = model.score((LF_matrix, Y_LF_set),
                             metric=['accuracy', 'precision', 'recall', 'f1'])
        print(scores)

        return model, Y_train
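
The `make_Ls_matrix` helper used above is not shown in the example. A minimal sketch of such a helper, assuming each labeling function is a plain callable that maps one comment to an integer label, with 0 meaning abstain (the Snorkel MeTaL convention):

import numpy as np

def make_Ls_matrix(texts, LFs):
    # Build an (n_examples x n_LFs) matrix of labeling-function votes;
    # entry (i, j) is the vote of LF j on example i, with 0 = abstain.
    votes = np.zeros((len(texts), len(LFs)), dtype=int)
    for i, text in enumerate(texts):
        for j, lf in enumerate(LFs):
            votes[i, j] = lf(text)
    return votes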
Example #2
    def getTrainedModel2(self):
        # Apply the LFs to the unlabeled training data
        applier = PandasLFApplier(self.LFs)
        # PandasLFApplier expects the full DataFrame, not a single column
        L_train = applier.apply(df=self.train)

        # Train the label model and compute the training labels
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
        self.train['resolution'] = label_model.predict(
            L=L_train, tie_break_policy="abstain")
        df_train = self.train[self.train.resolution != self.ABSTAIN]

        train_text = df_train.comments.tolist()
        # Keep the fitted vectorizer so the test comments share the same vocabulary
        vectorizer = CountVectorizer(ngram_range=(1, 2))
        X_train = vectorizer.fit_transform(train_text)

        clf = LogisticRegression(solver="lbfgs")
        clf.fit(X=X_train, y=df_train.resolution.values)

        X_test = vectorizer.transform(self.test['comments'].tolist())
        prob = clf.predict_proba(X_test)

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        end_model = EndModel([1000, 10, 2], seed=123, device=device)

        # train_model expects (X, Y) pairs for the training and validation splits
        # (this assumes the test split carries gold 'resolution' labels)
        end_model.train_model(
            (self.train['comments'], self.train['resolution']),
            valid_data=(self.test['comments'], self.test['resolution']),
            lr=0.01,
            l2=0.01,
            batch_size=256,
            n_epochs=5,
            checkpoint_metric='accuracy',
            checkpoint_metric_mode='max')

        return prob
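
Example #2 assumes that `self.LFs` already holds Snorkel v0.9-style labeling functions. As a rough, self-contained sketch of what such functions and their application look like (the `comments` field, the keyword rules, and the label constants here are illustrative assumptions):

import pandas as pd
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis

ABSTAIN, UNRESOLVED, RESOLVED = -1, 0, 1

@labeling_function()
def lf_mentions_fixed(x):
    # Vote RESOLVED when a ticket comment claims the issue was fixed.
    return RESOLVED if "fixed" in x.comments.lower() else ABSTAIN

@labeling_function()
def lf_mentions_wontfix(x):
    # Vote UNRESOLVED when the comment says the issue will not be addressed.
    return UNRESOLVED if "won't fix" in x.comments.lower() else ABSTAIN

lfs = [lf_mentions_fixed, lf_mentions_wontfix]
df_train = pd.DataFrame({"comments": ["Fixed in release 2.1",
                                      "Won't fix, works as designed",
                                      "Needs more information"]})

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)                 # one row per comment, one column per LF
print(LFAnalysis(L=L_train, lfs=lfs).lf_summary())   # coverage, overlaps, conflicts per LF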
Example #3
# (fragment starts mid-call: the tail of a PR-curve plotting call for the dev set)
               candidate_dfs['dev'].curated_dsh,
               model_type='curve',
               figsize=(12, 7),
               plot_title="Disease Associates Gene Dev PRC",
               metric='PR',
               font_size=16)

# In[21]:

label_model = LabelModel(k=2, seed=100)
label_model.train_model(validation_data[1][0],
                        n_epochs=1000,
                        verbose=False,
                        lr=0.01,
                        l2=2.067)
dev_predictions = convert_labels(label_model.predict(validation_data[1][1]),
                                 'categorical', 'onezero')
dev_marginals = label_model.predict_proba(validation_data[1][1])[:, 0]

# In[22]:

plt.rcParams.update({'font.size': 16})
plt.figure(figsize=(10, 6))
plot_predictions_histogram(dev_predictions,
                           candidate_dfs['dev'].curated_dsh.astype(int).values,
                           title="Prediction Histogram for Dev Set")

# In[23]:

confusion_matrix(
    convert_labels(candidate_dfs['dev'].curated_dsh.values, 'onezero',
                   'categorical'),
    label_model.predict(validation_data[1][1]))

score = label_model.score((Ls[1], Ys[1]))

print('Trained Label Model Metrics:')
scores = label_model.score((Ls[1], Ys[1]),
                           metric=['accuracy', 'precision', 'recall', 'f1'])

mv = MajorityLabelVoter(seed=123)
print('Majority Label Voter Metrics:')
scores = mv.score((Ls[1], Ys[1]),
                  metric=['accuracy', 'precision', 'recall', 'f1'])

Y_train_ps = label_model.predict_proba(Ls[0])

Y_dev_p = label_model.predict(Ls[1])
"""
mv2 = MajorityClassVoter()
mv2.train_model(np.asarray(new_balance))
"""

#=np.asarray(new_balance))

#Y_baseline = mv2.predict(Ls[2])
pickling_on2 = open(
    "data_encompassing/ar/ar_baseline_{}{}".format(flag0, flag), "wb")
pickle.dump(Y_baseline, pickling_on2)
print(Y_baseline)

# baseline majority:
"""