Example #1
def labelmodel_predict(L_train, y_true, L_test, return_probs=False, **kwargs):
    kwargs.setdefault('n_epochs', 500)
    kwargs.setdefault('log_freq', 100)

    from snorkel.labeling.model import LabelModel
    n = len(set(y_true[~y_true.isna()].values))
    log.info('y_true values: %s', set(y_true[~y_true.isna()].values))
    label_model = LabelModel(cardinality=n, verbose=True)

    L_train_val = set(L_train.values.flatten())
    y_true_val = set(y_true.values.flatten())
    log.info('Values in L_train but not y_true: %s', L_train_val - y_true_val)
    log.info('Values in y_true but not L_train: %s', y_true_val - L_train_val)

    L_train, Y_dev = to_numbered(L_train, y_true)

    log.info('L_train values: %s, %s', set(L_train.flatten()), type(L_train))
    log.info('Y_dev values: %s, %s', set(Y_dev.flatten()), type(Y_dev))
    log.info('kwargs: %s', kwargs)

    label_model.fit(L_train=L_train, Y_dev=Y_dev[Y_dev != -1], **kwargs)

    y_pred = label_model.predict(to_numbered(L_test, y_true)[0],
                                 return_probs=return_probs)

    if return_probs:
        y_pred, y_score = y_pred
    y_pred = from_numbered(L_test, y_true, y_pred)
    return (y_pred, y_score) if return_probs else y_pred
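Example #1 relies on two helpers, to_numbered and from_numbered, that are not shown here. A minimal sketch of what they might look like, assuming they map arbitrary label values onto the contiguous 0..k-1 range that LabelModel expects (with -1 reserved for abstains) and back again; the actual implementation in the source may differ:

import numpy as np

def to_numbered(L, y_true):
    # Assumed helper: map raw label values to 0..k-1; unseen/NaN values become -1 (abstain).
    classes = sorted(set(y_true[~y_true.isna()].values))
    mapping = {c: i for i, c in enumerate(classes)}
    L_num = np.vectorize(lambda v: mapping.get(v, -1))(L.values)
    y_num = np.array([mapping.get(v, -1) for v in y_true.values])
    return L_num, y_num

def from_numbered(L, y_true, y_pred):
    # Assumed helper: invert the mapping for the model's integer predictions.
    classes = sorted(set(y_true[~y_true.isna()].values))
    return np.array([classes[i] if i >= 0 else None for i in y_pred])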
Example #2
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
Example #3
    def test_sparse_and_regular_make_same_probs(self) -> None:
        """Test the LabelModel's estimate of P and Y on a simple synthetic dataset."""
        np.random.seed(123)
        P, Y, L = generate_simple_label_matrix(
            self.known_dimensions.num_examples,
            self.known_dimensions.num_functions,
            self.known_dimensions.num_classes,
        )
        example_event_lists: List[ExampleEventListOccurence] = []

        for example_num, example in enumerate(L):
            event_list = []
            for func_id, cls_id in enumerate(example):
                if cls_id > -1:
                    event_id = func_id * self.known_dimensions.num_classes + cls_id
                    event_list.append(event_id)
            example_event_lists.append(ExampleEventListOccurence(event_list))

        sparse_model = SparseExampleEventListLabelModel()
        sparse_model.fit_from_sparse_example_event_list(
            example_event_list=example_event_lists,
            known_dimensions=self.known_dimensions,
            n_epochs=200,
            lr=0.01,
            seed=123,
        )
        label_model = LabelModel(cardinality=self.known_dimensions.num_classes)
        label_model.fit(L, n_epochs=200, lr=0.01, seed=123)
        P_lm = label_model.get_conditional_probs()
        P_slm = sparse_model.get_conditional_probs()
        np.testing.assert_array_almost_equal(
            P_slm,
            P_lm,
        )
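Note how Example #3 flattens each (labeling function, class) pair into a single event id with event_id = func_id * num_classes + cls_id. A quick worked check of that encoding, assuming two classes:

num_classes = 2
# function 0 voting class 0 -> event 0; function 0 voting class 1 -> event 1
# function 3 voting class 1 -> event 3 * 2 + 1 = 7
assert 0 * num_classes + 1 == 1
assert 3 * num_classes + 1 == 7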
Example #4
    def train(self, dataset):
        # Apply labeler functions to training set
        lfs_applier = PandasLFApplier(lfs=self.lfs)
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            lfs_train = lfs_applier.apply(df=dataset)

        # Build probabilistic label model
        label_model = LabelModel(cardinality=3, verbose=True)
        label_model.fit(L_train=lfs_train, n_epochs=500, log_freq=100, seed=42)
        label_probs = label_model.predict_proba(lfs_train)

        # Filter unlabeled data points
        df_filtered, probs_filtered = filter_unlabeled_dataframe(X=dataset,
                                                                 y=label_probs,
                                                                 L=lfs_train)

        # Featurize data using scikit
        self.vectorizer = CountVectorizer(ngram_range=(1, 5))
        dataset_train = self.vectorizer.fit_transform(
            df_filtered.sentence.tolist())

        # Replace probabilistic labels with most likely label
        preds_filtered = probs_to_preds(probs=probs_filtered)

        # Train scikit model
        self.model = LogisticRegression(C=1e3,
                                        solver="liblinear",
                                        multi_class='auto')
        self.model.fit(X=dataset_train, y=preds_filtered)
Example #5
def get_L_final_filter(L_train, method='model'):
    L_final = []

    if len(L_train[0]) < 3:
        method = 'absolute'
    else:
        method = 'model'

    ## TEMPORARY MEASURE: force the absolute method, overriding the choice above
    method = 'absolute'

    if method == 'absolute':
        ## Absolute Method: Any 'irrelevant' keywords matched will be flagged as irrelevant
        for array in L_train:
            if 0 in array:
                L_final.append(0)
            else:
                L_final.append(1)
    else:
        ## Label Model
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
        L_final = label_model.predict(L=L_train, return_probs=False)

    return L_final
Example #6
    def test_optimizer(self):
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=1, optimizer="sgd")
        label_model.fit(L, n_epochs=1, optimizer="adam")
        label_model.fit(L, n_epochs=1, optimizer="adamax")
        with self.assertRaisesRegex(ValueError, "Unrecognized optimizer option"):
            label_model.fit(L, n_epochs=1, optimizer="bad_opt")
Example #7
    def test_set_mu_eps(self):
        mu_eps = 0.0123

        # Construct a label matrix such that P(\lambda_1 = 0 | Y) = 0.0, so it will hit
        # the mu_eps floor
        L = np.array([[1, 1, 1], [1, 1, 1]])
        label_model = LabelModel(verbose=False)
        label_model.fit(L, mu_eps=mu_eps)
        self.assertAlmostEqual(label_model.get_conditional_probs()[0, 1, 0], mu_eps)
Example #8
    def test_score(self):
        L = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
        Y = np.array([1, 0, 1])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=100)
        results = label_model.score(L, Y, metrics=["accuracy", "coverage"])
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([1, -1, 1]))

        results_expected = dict(accuracy=1.0, coverage=2 / 3)
        self.assertEqual(results, results_expected)

        L = np.array([[1, 0, 1], [1, 0, 1]])
        label_model = self._set_up_model(L)
        label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(
            0.01, 0.99))

        results = label_model.score(L, Y=np.array([0, 1]))
        results_expected = dict(accuracy=0.5)
        self.assertEqual(results, results_expected)

        results = label_model.score(L=L,
                                    Y=np.array([1, 0]),
                                    metrics=["accuracy", "f1"])
        results_expected = dict(accuracy=0.5, f1=2 / 3)
        self.assertEqual(results, results_expected)
Example #9
def main():
    lfs = [lf_contains_link, lf_contains_co, lf_contains_sub]
    baseApp = LFApplier(lfs)
    labels = baseApp.apply(src)
    print(labels)
    print(LFAnalysis(labels, lfs).lf_summary())
    buckets = get_label_buckets(labels[:, 0], labels[:, 1])
    print(buckets)

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(labels, n_epochs=500, log_freq=50, seed=123)
    pred_labels = label_model.predict(L=labels, tie_break_policy="abstain")
    print(pred_labels)
Example #10
def generative_model(L_train, n_epochs=500, print_every=100):
    model = LabelModel(cardinality=2)

    logger.info("Training generative model...")
    model.fit(L_train=L_train,
              n_epochs=n_epochs,
              seed=1234,
              log_freq=print_every)
    logger.info("Done.")

    marginals = model.predict_proba(L_train)

    return marginals
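generative_model returns the full posterior matrix from predict_proba, with one row per data point and one column per class. A caller that wants hard binary labels could, for instance, threshold the positive-class column (a usage sketch, not part of the original source):

marginals = generative_model(L_train)
y_pred = (marginals[:, 1] > 0.5).astype(int)  # 1 where P(y=1 | LF votes) > 0.5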
Example #11
def calculate_metrics(
    label_model: LabelModel,
    dataset_name: str,
    true_labels: np.ndarray,
    save_to: AbsolutePath,
) -> Dict[str, float]:
    """
    >>> from collections import namedtuple; import tempfile
    >>> def mocked_predictions(l,return_probs,tie_break_policy): return np.array([1, 0, 1]), np.array([[0.1, 0.9], [0.8, 0.2], [0.25, 0.75]])
    >>> def mocked_scores(L,Y,tie_break_policy,metrics):
    ...     return {"f1": 1.0} if metrics == ['f1'] else {"roc_auc": 0.78}
    >>> lm = namedtuple('LM', ['predict', 'score'])(mocked_predictions, mocked_scores)
    >>> with tempfile.TemporaryDirectory() as tmpdirname:
    ...     np.ndarray([]).dump(f"{tmpdirname}/heuristic_matrix_test_set.pkl")
    ...     calculate_metrics(lm, "test_set", np.array([1, 1, 0]), Path(tmpdirname))
    {'label_model_accuracy_test_set': 0.333, 'label_model_auc_test_set': 0.78, 'label_model_f1_test_set': 1.0, 'label_model_mse_test_set': 0.404}
    >>> with tempfile.TemporaryDirectory() as tmpdirname:
    ...     np.ndarray([]).dump(f"{tmpdirname}/heuristic_matrix_test_set.pkl")
    ...     calculate_metrics(lm, "test_set", np.array([0, 1, 0]), Path(tmpdirname))
    {'label_model_accuracy_test_set': 0.0, 'label_model_auc_test_set': 0.78, 'label_model_f1_test_set': 1.0, 'label_model_mse_test_set': 0.671}
    """
    lines = np.load(str(save_to / f"heuristic_matrix_{dataset_name}.pkl"),
                    allow_pickle=True)

    tie_break_policy = "random"

    Y_pred, Y_prob = label_model.predict(lines,
                                         return_probs=True,
                                         tie_break_policy=tie_break_policy)

    try:
        auc = label_model.score(L=lines,
                                Y=true_labels,
                                tie_break_policy="random",
                                metrics=["roc_auc"])["roc_auc"]
        auc = round(auc, 3)
    except ValueError:
        auc = "n/a"
    f1 = label_model.score(L=lines,
                           Y=true_labels,
                           tie_break_policy="random",
                           metrics=["f1"])["f1"]
    accuracy = sum(Y_pred == true_labels) / float(len(Y_pred))
    mse = np.mean((Y_prob[:, 1] - true_labels)**2)

    return {
        f"label_model_accuracy_{dataset_name}": round(accuracy, 3),
        f"label_model_auc_{dataset_name}": auc,
        f"label_model_f1_{dataset_name}": round(f1, 3),
        f"label_model_mse_{dataset_name}": round(mse, 3),
    }
Example #12
    def test_model_loss(self):
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel(cardinality=2, verbose=False)

        label_model.fit(L, n_epochs=1)
        init_loss = label_model._loss_mu().item()

        label_model.fit(L, n_epochs=10)
        next_loss = label_model._loss_mu().item()

        self.assertLessEqual(next_loss, init_loss)

        with self.assertRaisesRegex(Exception, "Loss is NaN."):
            label_model.fit(L, n_epochs=10, lr=1e8)
Example #13
    def test_optimizer_init(self):
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel()

        label_model.fit(L, optimizer="sgd", n_epochs=1)
        self.assertIsInstance(label_model.optimizer, optim.SGD)

        label_model.fit(L, optimizer="adam", n_epochs=1)
        self.assertIsInstance(label_model.optimizer, optim.Adam)

        label_model.fit(L, optimizer="adamax", n_epochs=1)
        self.assertIsInstance(label_model.optimizer, optim.Adamax)

        with self.assertRaisesRegex(ValueError, "Unrecognized optimizer"):
            label_model.fit(L, optimizer="bad_optimizer", n_epochs=1)
Example #14
def test_save_with_conda_yaml(tmp_path: Path, setup_common_components: Dict):
    """Test if a model can be saved with a conda yaml file."""
    kwargs = setup_common_components
    labeler = Labeler(None, [PartTemp])
    # Mock the get_keys()
    labeler.get_keys = MagicMock(return_value=[LabelKey(name="key1")])
    lfs = [[LF_storage_row]]
    label_models = [LabelModel()]

    # Create a conda yaml file
    with open(tmp_path.joinpath("my_conda.yaml"), "w") as f:
        yaml.dump(_get_default_conda_env(), f)

    # Save a model with a conda yaml file.
    save_model(
        HardwareFonduerModel(),
        os.path.join(tmp_path, artifact_path),
        **kwargs,
        conda_env=tmp_path.joinpath("my_conda.yaml"),
        code_paths=[
            "tests"
        ],  # pass a directory name to preserve the directory hierarchy
        model_type="label",
        labeler=labeler,
        lfs=lfs,
        label_models=label_models,
    )
    # Your conda yaml file is saved as "conda.yaml".
    assert os.path.exists(os.path.join(tmp_path, artifact_path, "conda.yaml"))
Example #15
def get_snorkel_labels(frame_to_train, pkl_name):
    print(
        "==============================Labeling is now started======================================="
    )
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=frame_to_train)
    (date_parser_coverage, currency_coverage, zipcode_coverage, state_coverage,
     quantity_coverage, phonenumber_coverage, SSN_coverage, first_name_coverage,
     last_name_coverage, percent_coverage) = (L_train != ABSTAIN).mean(axis=0)
    frame_to_train.rename(columns={
        "word_id": "word_tokens",
        "text": "ocr",
        "label_number": "preds"
    }, inplace=True)
    print(
        "==============================Labeling is now complete======================================="
    )
    print(
        "==============================Summary Stats=================================================="
    )
    print(f"date_parser_coverage: {date_parser_coverage * 100:.1f}%")
    print(f"currency_coverage: {currency_coverage * 100:.1f}%")
    print(f"zipcode_coverage: {zipcode_coverage * 100:.1f}%")
    print(f"state_coverage: {state_coverage * 100:.1f}%")
    print(f"quntity_coverage: {quntity_coverage * 100:.1f}%")
    print(f"phonenumber_coverage: {phonenumber_coverage * 100:.1f}%")
    print(f"SSN_coverage: {SSN_coverage * 100:.1f}%")
    print(f"first_name_coverage: {first_name_coverage * 100:.1f}%")
    print(f"last_name_coverage: {last_name_coverage * 100:.1f}%")
    #print(f"alpha_number_coverage: {alpha_number_coverage * 100:.1f}%")
    #     lol= f"{pkl_name}.pkl"
    #     print("File name I got:", lol)
    #     print(f"percent_coverage: {percent_coverge * 100:.1f}%")
    #     with open(lol, 'rb') as f:
    #         label_model = pickle.load(f)
    label_model = LabelModel(cardinality=15, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
    frame_to_train["label_number"] = label_model.predict(
        L=L_train, tie_break_policy="abstain")
    frame_to_train.label_number.fillna(0, inplace=True)
    frame_to_train['pred_names'] = frame_to_train.label_number.map(inv_et_dct)
    return frame_to_train
Example #16
    def test_scheduler_init(self):
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel()

        label_model.fit(L, lr_scheduler="constant", n_epochs=1)
        self.assertIsNone(label_model.lr_scheduler)

        label_model.fit(L, lr_scheduler="linear", n_epochs=1)
        self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.LambdaLR)

        label_model.fit(L, lr_scheduler="exponential", n_epochs=1)
        self.assertIsInstance(
            label_model.lr_scheduler, optim.lr_scheduler.ExponentialLR
        )

        label_model.fit(L, lr_scheduler="step", n_epochs=1)
        self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.StepLR)
Example #17
def test():
    patient = 'maskedLiverIso.nii'
    filenames = ['antiga.nii', 'jerman.nii']
    labeling_functions = [li_thresholding, otsu_thresholding]
    path = 'F:/Deep Learning/Data/snorkel/*'

    dataset = []

    size = 0
    for dir_path in sorted(glob(str(Path(path))), key=sorting):
        ip: Path = Path(dir_path) / patient

        arr, im = io_load_image(str(ip))
        shape = arr.shape
        arr = arr.flatten()

        labels = []
        for fn in filenames:
            for func in labeling_functions:
                i = Path(dir_path) / fn
                array, image = io_load_image(str(i))
                array = func(array.flatten())
                size += array.shape[-1]
                labels.append(array)

        dataset.append([im, arr, labels, ip.parts[-2], shape])
    array_reshape = (size // len(filenames) // len(labeling_functions),
                     len(filenames) * len(labeling_functions))
    print(array_reshape)

    lab: np.ndarray = np.zeros(size, dtype='float16').reshape(array_reshape)
    print(size, lab.shape)
    s = 0
    for data in dataset:
        _, _, label, _, _ = data
        T: np.ndarray = np.array(label).T
        si = T.shape[0]
        lab[s:s + si, :] = T
        s += si

    LM: LabelModel = LabelModel(cardinality=2, verbose=True, device='cuda')
    LM.fit(lab,
           seed=12345,
           log_freq=1,
           n_epochs=100,
           class_balance=[0.985, 0.015])

    s = 0
    for data in dataset:
        im, arr, label, fn, shape = data
        print(fn)
        T: np.ndarray = np.array(label).T
        p: np.ndarray = LM.predict(T)
        p = p.reshape(shape)
        p = getLargestCC(p)
        p[p > 0] = 255
        p = np.array(p, dtype='uint8')
        io_save_image('temp/' + fn + '.nii', p, im)
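The labeling functions li_thresholding and otsu_thresholding used in Example #17 are not shown. A minimal sketch of the Otsu variant, assuming scikit-image is available and that each LF emits a binary 0/1 vote per voxel of the flattened volume:

import numpy as np
from skimage.filters import threshold_otsu

def otsu_thresholding(arr: np.ndarray) -> np.ndarray:
    # Assumed implementation: vote 1 (vessel) where the intensity exceeds
    # the Otsu threshold, 0 (background) everywhere else.
    return (arr > threshold_otsu(arr)).astype('int8')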
Example #18
    def test_augmented_L_construction(self):
        # 3 data points, 5 LFs, 2 classes
        n = 3
        m = 5
        k = 2
        L = np.array([[0, 0, 0, 1, 0], [0, 1, 1, 0, -1], [0, 0, 0, 0, -1]])
        L_shift = L + 1
        lm = LabelModel(cardinality=k, verbose=False)
        lm._set_constants(L_shift)
        lm._create_tree()
        L_aug = lm._get_augmented_label_matrix(L_shift, higher_order=True)

        # Should have 10 columns:
        # - 5 * 2 = 10 for the sources
        self.assertEqual(L_aug.shape, (3, 10))

        # 13 total nonzero entries
        self.assertEqual(L_aug.sum(), 13)

        # Next, check the singleton entries
        for i in range(n):
            for j in range(m):
                if L_shift[i, j] > 0:
                    self.assertEqual(L_aug[i, j * k + L_shift[i, j] - 1], 1)

        # Finally, check the clique entries
        # Singleton clique 1
        self.assertEqual(len(lm.c_tree.node[1]["members"]), 1)
        j = lm.c_tree.node[1]["start_index"]
        self.assertEqual(L_aug[0, j], 1)

        # Singleton clique 2
        self.assertEqual(len(lm.c_tree.node[2]["members"]), 1)
        j = lm.c_tree.node[2]["start_index"]
        self.assertEqual(L_aug[0, j + 1], 0)
Example #19
    def test_sparse_and_regular_make_same_objective(self):
        np.random.seed(123)
        P, Y, L = generate_simple_label_matrix(
            self.known_dimensions.num_examples,
            self.known_dimensions.num_functions,
            self.known_dimensions.num_classes,
        )
        sparse_event_occurence: List[EventCooccurence] = []
        label_model = LabelModel(cardinality=self.known_dimensions.num_classes)
        label_model._set_constants(L)
        L_shift = L + 1
        label_model_lind = label_model._create_L_ind(L_shift)
        co_oc_matrix = label_model_lind.T @ label_model_lind
        for a_id, cols in enumerate(co_oc_matrix):
            for b_id, freq in enumerate(cols):
                sparse_event_occurence.append(
                    EventCooccurence(a_id, b_id, frequency=freq))

        sparse_model = SparseEventPairLabelModel()
        sparse_model._set_constants(known_dimensions=self.known_dimensions)

        sparse_model_objective = sparse_model._prepare_objective_from_sparse_event_cooccurence(
            known_dimensions=self.known_dimensions,
            sparse_event_occurence=sparse_event_occurence,
        )
        self.assertEqual(label_model.n, sparse_model.n)
        self.assertEqual(label_model.m, sparse_model.m)
        self.assertEqual(label_model.cardinality, sparse_model.cardinality)
        label_model._generate_O(L_shift)
        label_model_O = label_model.O.detach().numpy()
        np.testing.assert_almost_equal(label_model_O, sparse_model_objective)
Example #20
    def test_loss(self):
        L = np.array([[0, -1, 0], [0, 1, -1]])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=1)
        label_model.mu = nn.Parameter(label_model.mu_init.clone() + 0.05)

        # l2_loss = l2 * ||mu - mu_init||_F^2; mu has (m*k) x k = 6 x 2 = 12
        # entries, each off by 0.05, so the loss is 12 * 0.05^2 = 0.03
        self.assertAlmostEqual(label_model._loss_l2(l2=1.0).item(), 0.03)
        self.assertAlmostEqual(label_model._loss_l2(l2=np.ones(6)).item(), 0.03)

        # mu_loss = ||O - \mu P \mu^T||_2 + ||sum(\mu P, 1) - diag(O)||_2
        self.assertAlmostEqual(label_model._loss_mu().item(), 0.675, 3)
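A quick numeric check of the l2 term in Example #20: mu has m * k = 6 rows and k = 2 columns, and the manual offset leaves every entry 0.05 away from mu_init, so the squared Frobenius norm works out as:

print(6 * 2 * 0.05 ** 2)  # 12 entries * 0.0025 = 0.03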
Example #21
    def train_model(self,
                    df_train: pd.DataFrame,
                    application_area_lfs: list,
                    analysis_path: str = "output",
                    label_output_path: str = "labels.jsonl",
                    save_model_path: str = None):
        """Using our labeling functions, we can train a probabilistic model which is able to generate weak labels for our data points

        :param df_train: The training data for the model
        :type df_train: pd.DataFrame
        :param application_area_lfs: A list of labeling functions to use in training the Label Model
        :type application_area_lfs: list
        :param analysis_path: Folder path where the model output should be stored, defaults to `PROJECT_ROOT/output`
        :type analysis_path: str, optional
        :param label_output_path: Path to file where probabilistic labels generated by the model should be stored, defaults to "labels.jsonl"
        :type label_output_path: str, optional
        :param save_model_path: Path where the Label Model should be saved. If no path is provided, the model is not saved
        :type save_model_path: str, optional
        """
        file_name_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        applier = PandasLFApplier(lfs=application_area_lfs)
        L_train = applier.apply(df=df_train)

        model = LabelModel(cardinality=2, verbose=True)
        model.fit(L_train=L_train, n_epochs=800, log_freq=100)
        if save_model_path is not None:
            model.save(save_model_path)

        int_labels, prob_labels = model.predict(L=L_train,
                                                return_probs=True,
                                                tie_break_policy="abstain")
        probs_df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=prob_labels, L=L_train)

        int_df_train_filtered, int_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=int_labels, L=L_train)
        # write out both labels. In the probability outputs, p_rel is the second probability listed
        assert list(probs_df_train_filtered["paperid"]) == list(
            int_df_train_filtered["paperid"])
        with open(f"{label_output_path}", mode="w") as out:
            for idx, paper_id in enumerate(probs_df_train_filtered["paperid"]):
                out.write(
                    json.dumps({
                        "id": paper_id,
                        # cast to int and float to get rid of nonserializable numpy types
                        "is_rel": int(int_train_filtered[idx]),
                        "p_rel": float(probs_train_filtered[idx][1])
                    }) + "\n")

        # output LF analysis to csv file sorted by coverage
        lf_analysis = LFAnalysis(L=L_train,
                                 lfs=application_area_lfs).lf_summary()
        with open(
                f"{self.PROJECT_ROOT}/output/{analysis_path}_{file_name_timestamp}.csv",
                "w") as outfile:
            lf_analysis = lf_analysis.sort_values("Coverage")
            lf_analysis.to_csv(outfile, encoding="utf-8", index=True)
Example #22
    def test_mv_default(self):
        # fewer than 2 LFs have overlaps
        label_model = LabelModel(cardinality=2, verbose=False)
        L = np.array([[-1, -1, 1], [-1, 1, -1], [0, -1, -1]])
        label_model.fit(L, n_epochs=100)
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([1, 1, 0]))

        # fewer than 2 LFs have conflicts
        L = np.array([[-1, -1, 1], [-1, 1, 1], [1, 1, 1]])
        label_model.fit(L, n_epochs=100)
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([1, 1, 1]))
Example #23
    def test_class_balance(self):
        label_model = LabelModel(cardinality=2, verbose=False)
        # Test class balance
        Y_dev = np.array([0, 0, 1, 1, 0, 0, 0, 0, 1, 1])
        label_model._set_class_balance(class_balance=None, Y_dev=Y_dev)
        np.testing.assert_array_almost_equal(label_model.p, np.array([0.6, 0.4]))

        class_balance = np.array([0.0, 1.0])
        with self.assertRaisesRegex(ValueError, "Class balance prior is 0"):
            label_model._set_class_balance(class_balance=class_balance, Y_dev=Y_dev)

        class_balance = np.array([0.0])
        with self.assertRaisesRegex(ValueError, "class_balance has 1 entries."):
            label_model._set_class_balance(class_balance=class_balance, Y_dev=Y_dev)

        Y_dev_one_class = np.array([0, 0, 0])
        with self.assertRaisesRegex(
            ValueError, "Does not match LabelModel cardinality"
        ):
            label_model._set_class_balance(class_balance=None, Y_dev=Y_dev_one_class)
Example #24
    def test_get_weight(self):
        # set up L matrix
        true_accs = [0.95, 0.6, 0.7, 0.55, 0.8]
        coverage = [1.0, 0.8, 1.0, 1.0, 1.0]
        L = -1 * np.ones((1000, len(true_accs)))
        Y = np.zeros(1000)

        for i in range(1000):
            Y[i] = 1 if np.random.rand() <= 0.5 else 0
            for j in range(5):
                if np.random.rand() <= coverage[j]:
                    L[i, j] = (Y[i] if np.random.rand() <= true_accs[j] else
                               np.abs(Y[i] - 1))

        label_model = LabelModel(cardinality=2)
        label_model.fit(L, n_epochs=1000, seed=123)

        accs = label_model.get_weights()
        for i in range(len(accs)):
            true_acc = true_accs[i]
            self.assertAlmostEqual(accs[i], true_acc, delta=0.1)
Example #25
    def test_labeling_convergence(self) -> None:
        """Test convergence of end to end labeling pipeline."""
        # Apply LFs
        labeling_functions = ([f] + [
            get_positive_labeling_function(divisor) for divisor in range(2, 9)
        ] + [
            get_negative_labeling_function(divisor) for divisor in range(2, 9)
        ])
        applier = PandasLFApplier(labeling_functions)
        L_train = applier.apply(self.df_train, progress_bar=False)

        self.assertEqual(L_train.shape,
                         (self.N_TRAIN, len(labeling_functions)))

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
        Y_lm = label_model.predict_proba(L_train).argmax(axis=1)
        Y = self.df_train.y
        err = np.where(Y != Y_lm, 1, 0).sum() / self.N_TRAIN
        self.assertLess(err, 0.05)
Example #26
def label_dataset(
    task: Task,
    dataset: Dataset,
    path_config: Optional[PathConfig] = None,
    debug: bool = False,
):
    path_config = path_config or PathConfig.load()

    applied_heuristics_df = pd.read_pickle(
        str(path_config.generated / task.name /
            f"heuristic_matrix_{dataset.name}.pkl"))

    label_model = LabelModel()
    label_model.load(str(path_config.generated / task.name /
                         "label_model.pkl"))
    df = dataset.load()
    df_labeled = do_labeling(label_model, applied_heuristics_df.to_numpy(), df,
                             task.labels)

    if debug:
        for (
                heuristic_name,
                applied_heuristic_series,
        ) in applied_heuristics_df.items():
            applied_heuristics_df[
                heuristic_name] = applied_heuristic_series.map({
                    0: heuristic_name,
                    1: heuristic_name,
                    -1: ""
                })
        col_lfs = applied_heuristics_df.apply(
            lambda row: ";".join([elm for elm in row if elm]), axis=1)
        df_labeled["lfs"] = col_lfs

    labeled_data_path = path_config.labeled_data / task.name
    if not labeled_data_path.exists():
        labeled_data_path.mkdir(parents=True)
    target_file = labeled_data_path / f"{dataset.name}.labeled.csv"
    df_labeled.to_csv(target_file, index=False)
    print(f"Labeled dataset has been written to {target_file}.")
Example #27
    def test_label_model_sparse(self) -> None:
        """Test the LabelModel's estimate of P and Y on a sparse synthetic dataset.

        This tests the common setting where LFs abstain most of the time, which can
        cause issues, for example, if the parameter clamping is set too high (see
        Issue #1422).
        """
        np.random.seed(123)
        P, Y, L = generate_simple_label_matrix(self.n,
                                               self.m,
                                               self.cardinality,
                                               abstain_multiplier=1000.0)

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L, n_epochs=1000, lr=0.01, seed=123)

        # Test estimated LF conditional probabilities
        P_lm = label_model.get_conditional_probs()
        np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

        # Test predicted labels *only on non-abstained data points*
        Y_pred = label_model.predict(L, tie_break_policy="abstain")
        (idx, ) = np.where(Y_pred != -1)
        acc = np.where(Y_pred[idx] == Y[idx], 1, 0).sum() / len(idx)
        self.assertGreaterEqual(acc, 0.65)

        # Make sure that we don't output abstain when an LF votes, per issue #1422
        self.assertEqual(len(idx),
                         np.where((L + 1).sum(axis=1) != 0, 1, 0).sum())
Example #28
    def test_L_form(self):
        label_model = LabelModel(cardinality=2, verbose=False)
        L = np.array([[-1, 1, -1], [-1, 1, -1], [1, -1, -1], [-1, 1, -1]])
        label_model._set_constants(L)
        self.assertEqual(label_model.n, 4)
        self.assertEqual(label_model.m, 3)

        L = np.array([[-1, 0, 1], [-1, 0, 2], [0, -1, 2], [-1, 0, -1]])
        with self.assertRaisesRegex(ValueError, "L_train has cardinality"):
            label_model.fit(L, n_epochs=1)

        L = np.array([[0, 1], [1, 1], [0, 1]])
        with self.assertRaisesRegex(ValueError, "L_train should have at least 3"):
            label_model.fit(L, n_epochs=1)
Example #29
    def test_warmup(self):
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel()

        lr_scheduler_config = {"warmup_steps": 3, "warmup_unit": "epochs"}
        label_model.fit(L, lr_scheduler_config=lr_scheduler_config, n_epochs=5)
        self.assertEqual(label_model.warmup_steps, 3)

        lr_scheduler_config = {"warmup_percentage": 3 / 5}
        label_model.fit(L, lr_scheduler_config=lr_scheduler_config, n_epochs=5)
        self.assertEqual(label_model.warmup_steps, 3)

        with self.assertRaisesRegex(ValueError, "LabelModel does not support"):
            lr_scheduler_config = {"warmup_steps": 1, "warmup_unit": "batches"}
            label_model.fit(L, lr_scheduler_config=lr_scheduler_config)
Example #30
    def train(self):
        '''
        Train the logistic regression discriminative model
        '''
        # We pull out the label vectors for ease of use later
        Y_test = self.df_test.label.values

        applier = PandasLFApplier(lfs=self.lfs)
        L_train = applier.apply(df=self.df_train)

        # Use the label model to combine the labeling function outputs
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

        # Make predictions
        probs_train = label_model.predict_proba(L=L_train)

        # Filter abstained inputs
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=self.df_train, y=probs_train, L=L_train)

        # Represent each data point as a one-hot vector
        vectorizer = CountVectorizer(ngram_range=(1, 5))
        X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
        X_test = vectorizer.transform(self.df_test.text.tolist())

        # Turn probs into preds
        preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

        # Train logistic regression model
        sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
        sklearn_model.fit(X=X_train, y=preds_train_filtered)

        print(
            f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%"
        )
        dump(sklearn_model, 'sklearn_model.joblib')
        dump(vectorizer, 'vectorizer.joblib')