Example 1
def startSnorkelLabeling(df, keyword_groups={}, label=IRRELEVANT, l_type='SnorkelFilter'):
    '''
    Function: Filter tweets for the user with Snorkel labeling functions
    Inputs:
        - df: tweets DataFrame (columns: [id, text])
        - keyword_groups: keyword group name mapped to its relevant keywords
          E.g. {'usps': ['postal service', 'usps'], 'invest': ['invest', 'portfolio', 'stock']}
    Outputs:
        - df: categorised data (e.g. columns = ['id', 'tweets', 'Refund', 'COVID'])
        - analysis: Snorkel labeling function statistics
    '''

    lfs = []
    for name, keywords in keyword_groups.items():
        lfs.append(make_keyword_lf(lf_name=name, keywords=keywords, label=label))

    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df)

    if l_type == 'SnorkelFilter': # For spam detection (Step 2)
        L_final = get_L_final_filter(L_train)
        df['relevance'] = L_final

    elif l_type == 'SnorkelCategorise': # For categorising tweets (Step 3)
        L_final = get_L_final_categorise(L_train)

        L_final_with_names = dict(zip(keyword_groups.keys(), L_final))
        for name, L_values in L_final_with_names.items():
            df[name] = L_values

    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    return df, analysis
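Example 1 relies on a make_keyword_lf factory (as well as get_L_final_filter and get_L_final_categorise) defined elsewhere. A minimal sketch of that factory, assuming tweets expose a text field and building on Snorkel's LabelingFunction:

from snorkel.labeling import LabelingFunction

ABSTAIN = -1

def keyword_lookup(x, keywords, label):
    # Emit `label` when any keyword occurs in the tweet text, else abstain.
    return label if any(k in x.text.lower() for k in keywords) else ABSTAIN

def make_keyword_lf(lf_name, keywords, label):
    return LabelingFunction(
        name=lf_name,
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )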
Example 2
    def train(self, dataset):
        # Apply labeler functions to training set
        lfs_applier = PandasLFApplier(lfs=self.lfs)
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            lfs_train = lfs_applier.apply(df=dataset)

        # Build probabilistic label model
        label_model = LabelModel(cardinality=3, verbose=True)
        label_model.fit(L_train=lfs_train, n_epochs=500, log_freq=100, seed=42)
        label_probs = label_model.predict_proba(lfs_train)

        # Filter unlabeled data points
        df_filtered, probs_filtered = filter_unlabeled_dataframe(X=dataset,
                                                                 y=label_probs,
                                                                 L=lfs_train)

        # Featurize data using scikit
        self.vectorizer = CountVectorizer(ngram_range=(1, 5))
        dataset_train = self.vectorizer.fit_transform(
            df_filtered.sentence.tolist())

        # Replace probabilistic labels with most likely label
        preds_filtered = probs_to_preds(probs=probs_filtered)

        # Train scikit model
        self.model = LogisticRegression(C=1e3,
                                        solver="liblinear",
                                        multi_class='auto')
        self.model.fit(X=dataset_train, y=preds_filtered)
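Example 2's train stores self.vectorizer and self.model, but no inference path is shown. A minimal predict counterpart, assuming those attribute names:

    def predict(self, sentences):
        # Featurize with the vectorizer fitted in train() and return
        # hard labels from the logistic regression model.
        X = self.vectorizer.transform(sentences)
        return self.model.predict(X)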
Example 3
def snorkel_process(keylist, dataframe, allweaklabf):
    def func(x):
        # Keep only the highest-probability class per row; zero out the rest.
        idx = (-x).argsort()[1:]
        x[idx] = 0
        return x

    cardinalitynu = len(keylist)
    applier = PandasLFApplier(lfs=allweaklabf)
    all_train_l = applier.apply(df=dataframe)
    report = LFAnalysis(L=all_train_l, lfs=allweaklabf).lf_summary()
    print(report)
    label_model = LabelModel(cardinality=cardinalitynu, verbose=False)
    label_model.fit(all_train_l)
    predt = label_model.predict(all_train_l)
    predt1 = label_model.predict_proba(all_train_l)
    keylist1 = keylist.copy()
    #keylist1.append('Not_relevent')
    predt2 = pd.DataFrame(predt1, columns=keylist1)
    dataframe['L_label'] = predt
    dataframe1 = dataframe.join(predt2, how='outer')
    dataframe1 = dataframe1[dataframe1.L_label >= 0]

    train, test = train_test_split(dataframe1, test_size=0.2)

    trainsent = train.sent.values
    trainlabel = train[keylist].values
    trainlabel2 = trainlabel.copy()
    trainlabel2 = np.apply_along_axis(func, 1, trainlabel2)
    trainlabel2 = np.where(trainlabel2 > 0, 1, 0)
    testsent = test.sent.values
    testlabel = test[keylist].values
    testlabel2 = testlabel.copy()
    testlabel2 = np.apply_along_axis(func, 1, testlabel2)
    testlabel2 = np.where(testlabel2 > 0, 1, 0)
    return trainsent, trainlabel2, testsent, testlabel2, keylist, report
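The nested func above keeps only the highest-probability class in each row. A quick check of that behaviour on a toy row:

import numpy as np

row = np.array([0.2, 0.5, 0.3])
idx = (-row).argsort()[1:]  # indices of all but the largest entry
row[idx] = 0
print(row)  # [0.  0.5 0. ]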
Example 4
def generate_labels_with_snorkel(dataframe):
    """
    Labels the full data using Snorkel
    :param dataframe: Pandas dataframe containing all data
    :return: dataframe extended with a label column
    """

    # Define the set of labeling functions (LFs)
    lfs = [
        lf_ubo_is_company, lf_troika_company, lf_uk_blacklisted_company,
        lf_non_uk_blacklisted_company
    ]

    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(lfs)
    L_train = applier.apply(dataframe)

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    dataframe["label"] = label_model.predict(L=L_train,
                                             tie_break_policy="abstain")

    # Filter out the abstain data points
    dataframe = dataframe[dataframe.label != ABSTAIN]

    return dataframe
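The labeling functions referenced in Example 4 (lf_ubo_is_company, lf_troika_company, ...) are defined elsewhere. A hypothetical sketch of one of them, assuming a binary scheme and a company_name column; the watchlist set is an assumption:

from snorkel.labeling import labeling_function

ABSTAIN = -1
POSITIVE = 1

TROIKA_COMPANIES = {"example holding ltd"}  # hypothetical watchlist

@labeling_function()
def lf_troika_company(x):
    # Hypothetical reconstruction: flag companies on the watchlist.
    return POSITIVE if x.company_name.lower() in TROIKA_COMPANIES else ABSTAIN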
Example 5
 def test_lf_applier_pandas(self) -> None:
     df = pd.DataFrame(dict(num=DATA))
     applier = PandasLFApplier([f, g])
     L = applier.apply(df, progress_bar=False)
     np.testing.assert_equal(L, L_EXPECTED)
     L = applier.apply(df, progress_bar=True)
     np.testing.assert_equal(L, L_EXPECTED)
Example 6
 def lf_examples(self, lf_id, n=5):
     lf = self.lfs[lf_id]
     applier = PandasLFApplier(lfs=[lf])
     L_train = applier.apply(df=self.df_train)
     labeled_examples = self.df_train[L_train != -1]
     samples = labeled_examples.sample(min(n, len(labeled_examples)),
                                       random_state=13)
     return [{"text": t} for t in samples["text"].values]
Example 7
    def train_model(self,
                    df_train: pd.DataFrame,
                    application_area_lfs: list,
                    analysis_path: str = "output",
                    label_output_path: str = "labels.jsonl",
                    save_model_path: str = None):
        """Using our labeling functions, we can train a probabilistic model which is able to generate weak labels for our data points

        :param df_train: The training data for the model
        :type df_train: pd.DataFrame
        :param application_area_lfs: A list of labeling functions to use in training the Label Model
        :type application_area_lfs: list
        :param analysis_path: File name prefix for the LF analysis CSV written under `PROJECT_ROOT/output`, defaults to "output"
        :type analysis_path: str, optional
        :param label_output_path: Path to file where probabilistic labels generated by the model should be stored, defaults to "labels.jsonl"
        :type label_output_path: str, optional
        :param save_model_path: Path where the Label Model should be saved. If no path is provided, the model is not saved
        :type save_model_path: str, optional
        """
        file_name_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        applier = PandasLFApplier(lfs=application_area_lfs)
        L_train = applier.apply(df=df_train)

        model = LabelModel(cardinality=2, verbose=True)
        model.fit(L_train=L_train, n_epochs=800, log_freq=100)
        if save_model_path is not None:
            model.save(save_model_path)

        int_labels, prob_labels = model.predict(L=L_train,
                                                return_probs=True,
                                                tie_break_policy="abstain")
        probs_df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=prob_labels, L=L_train)

        int_df_train_filtered, int_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=int_labels, L=L_train)
        # write out both labels. In the probability outputs, p_rel is the second probability listed
        assert list(probs_df_train_filtered["paperid"]) == list(
            int_df_train_filtered["paperid"])
        with open(f"{label_output_path}", mode="w") as out:
            for idx, paper_id in enumerate(probs_df_train_filtered["paperid"]):
                out.write(
                    json.dumps({
                        "id": paper_id,
                        # cast to int and float to get rid of nonserializable numpy types
                        "is_rel": int(int_train_filtered[idx]),
                        "p_rel": float(probs_train_filtered[idx][1])
                    }) + "\n")

        # output LF analysis to csv file sorted by coverage
        lf_analysis = LFAnalysis(L=L_train,
                                 lfs=application_area_lfs).lf_summary()
        with open(
                f"{self.PROJECT_ROOT}/output/{analysis_path}_{file_name_timestamp}.csv",
                "w") as outfile:
            lf_analysis = lf_analysis.sort_values("Coverage")
            lf_analysis.to_csv(outfile, encoding="utf-8", index=True)
Example 8
 def lf_mistakes(self, lf_id, n=5):
     lf = self.lfs[lf_id]
     applier = PandasLFApplier(lfs=[lf])
     L_dev = applier.apply(df=self.df_dev).squeeze()
     labeled_examples = self.df_dev[(L_dev != -1)
                                    & (L_dev != self.df_dev["label"])]
     samples = labeled_examples.sample(min(n, len(labeled_examples)),
                                       random_state=13)
     return [{"text": t} for t in samples["text"].values]
Example 9
 def test_lf_applier_pandas(self) -> None:
     df = pd.DataFrame(dict(num=DATA))
     applier = PandasLFApplier([f, g])
     L = applier.apply(df, progress_bar=False)
     np.testing.assert_equal(L, L_EXPECTED)
     L = applier.apply(df, progress_bar=True)
     np.testing.assert_equal(L, L_EXPECTED)
     L, meta = applier.apply(df, return_meta=True)
     np.testing.assert_equal(L, L_EXPECTED)
     self.assertEqual(meta, ApplierMetadata(dict()))
Example 10
def apply_lfs_to_dataset(
    lfs: List[LabelingFunction],
    artifact_df: pd.DataFrame,
    save_to: AbsolutePath,
) -> np.ndarray:
    applier = PandasLFApplier(lfs=lfs)
    applied_lf_matrix = applier.apply(df=artifact_df)
    df = pd.DataFrame(applied_lf_matrix, columns=[lf.name for lf in lfs])
    df.to_pickle(str(save_to))
    return applied_lf_matrix
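A hypothetical call, where my_lfs and artifact_df stand in for the caller's labeling functions and artifact table:

from pathlib import Path

lf_matrix = apply_lfs_to_dataset(lfs=my_lfs,
                                 artifact_df=artifact_df,
                                 save_to=Path("output/lf_matrix.pkl"))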
Example 11
def get_snorkel_labels(train_df, lfs, labels):
    applier = PandasLFApplier(
        [labeling_function(name=lf.__name__)(lf) for lf in lfs])
    label_model = LabelModel(cardinality=len(labels), verbose=True)
    L_train = applier.apply(df=train_df)
    label_model.fit(L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123)
    L_probs = label_model.predict_proba(L=L_train)

    df_filtered, probs_filtered = filter_unlabeled_dataframe(X=train_df,
                                                             y=L_probs,
                                                             L=L_train)
    return df_filtered, probs_filtered
Example 12
def get_majority_vote_label(train_df, lfs, labels):
    applier = PandasLFApplier(
        [labeling_function(name=lf.__name__)(lf) for lf in lfs])
    L_train = applier.apply(df=train_df)
    majority_model = MajorityLabelVoter(cardinality=len(labels))
    preds_train = majority_model.predict(L=L_train)

    non_abstain_idxs = np.argwhere(preds_train >= 0).flatten()
    df_filtered = train_df.iloc[non_abstain_idxs]
    probs_filtered = preds_train[non_abstain_idxs]
    return df_filtered, probs_filtered
Example 13
def LF_applier(df_train: pd.DataFrame, df_test: pd.DataFrame):
    """Create the labeling functions and apply them to the data points

    :param df_train: The training dataset
    :type df_train: pd.DataFrame
    :param df_test: The test dataset (with gold labels)
    :type df_test: pd.DataFrame
    :return: The label matrices emitted by the labeling functions and the LFs themselves, as [L_train, L_test, lfns]
    :rtype: list
    """
    # Make keywords
    keyword_vehicle_detection = make_keyword_lf(
        keywords=["vehicle detection", "vehicle detector"])
    keyword_driver_identification = make_keyword_lf(
        keywords=["driver identification", "driver identifier"])
    keyword_human_detection = make_keyword_lf(
        keywords=["human detection", "human detector"])
    keyword_license_info = make_keyword_lf(
        keywords=["license plate", "license number"])
    keyword_vehicle_recognition = make_keyword_lf(
        keywords=["vehicle recognition", "vehicle identification"])
    keyword_driving_system = make_keyword_lf(keywords=["driving system"])
    keyword_autonomous_vehicle = make_keyword_lf(
        keywords=["autonomous vehicle"], label=IRRELEVANT)
    keyword_driverless_vehicle = make_keyword_lf(
        keywords=["driverless cars", "driverless vehicle", "unmanned vehicle"],
        label=IRRELEVANT)
    keyword_lidar = make_keyword_lf(keywords=["lidar", "laser detection"],
                                    label=IRRELEVANT)
    keyword_radar = make_keyword_lf(keywords=["radar", "vehicle radar"],
                                    label=IRRELEVANT)
    keyword_computer_vision = make_keyword_lf(
        keywords=["computer vision", "opencv"], label=IRRELEVANT)

    # Apply LFs
    lfns = [
        keyword_vehicle_detection, keyword_human_detection,
        keyword_driver_identification, keyword_vehicle_recognition,
        keyword_driving_system, keyword_autonomous_vehicle,
        keyword_driverless_vehicle, keyword_lidar, keyword_radar,
        keyword_computer_vision
    ]

    applier = PandasLFApplier(lfs=lfns)
    apply_train_time_start = time()
    L_train = applier.apply(df=df_train)
    apply_train_time_end = time()
    print(
        f"LF Application Time: {apply_train_time_end - apply_train_time_start} seconds"
    )
    L_test = applier.apply(df=df_test)
    return [L_train, L_test, lfns]
Example 14
    def test_lf_applier_pandas_spacy_preprocessor(self) -> None:
        spacy = SpacyPreprocessor(text_field="text", doc_field="doc")

        @labeling_function(pre=[spacy])
        def first_is_name(x: DataPoint) -> int:
            return 0 if x.doc[0].pos_ == "PROPN" else -1

        @labeling_function(pre=[spacy])
        def has_verb(x: DataPoint) -> int:
            return 0 if sum(t.pos_ == "VERB" for t in x.doc) > 0 else -1

        df = pd.DataFrame(dict(text=TEXT_DATA))
        applier = PandasLFApplier([first_is_name, has_verb])
        L = applier.apply(df, progress_bar=False)
        np.testing.assert_equal(L, L_TEXT_EXPECTED)
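SpacyPreprocessor needs a spaCy language model at runtime. A self-contained version of the same pattern, assuming en_core_web_sm is installed:

import pandas as pd
from snorkel.labeling import PandasLFApplier, labeling_function
from snorkel.preprocess.nlp import SpacyPreprocessor

spacy = SpacyPreprocessor(text_field="text", doc_field="doc", memoize=True)

@labeling_function(pre=[spacy])
def has_verb(x):
    return 0 if any(t.pos_ == "VERB" for t in x.doc) else -1

df = pd.DataFrame(dict(text=["She runs daily.", "A quiet morning."]))
L = PandasLFApplier([has_verb]).apply(df, progress_bar=False)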
Example 15
    def test_lf_applier_pandas_preprocessor_memoized(self) -> None:
        square_hit_tracker = SquareHitTracker()

        @preprocessor(memoize=True)
        def square_memoize(x: DataPoint) -> DataPoint:
            x.num_squared = square_hit_tracker(x.num)
            return x

        @labeling_function(pre=[square_memoize])
        def fp_memoized(x: DataPoint) -> int:
            return 0 if x.num_squared > 42 else -1

        df = pd.DataFrame(dict(num=DATA))
        applier = PandasLFApplier([f, fp_memoized])
        L = applier.apply(df, progress_bar=False)
        np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)
        self.assertEqual(square_hit_tracker.n_hits, 4)
Example 16
def get_snorkel_labels(frame_to_train, pkl_name):
    print(
        "==============================Labeling is now started======================================="
    )
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=frame_to_train)
    (date_parser_coverage, currency_coverage, zipcode_coverage,
     state_coverage, quantity_coverage, phonenumber_coverage,
     SSN_coverage, first_name_coverage, last_name_coverage,
     percent_coverage) = (L_train != ABSTAIN).mean(axis=0)
    frame_to_train.rename(columns={
        "word_id": "word_tokens",
        "text": "ocr",
        "label_number": "preds"
    },
                          inplace=True)
    print(
        "==============================Labeling is now complete======================================="
    )
    print(
        "==============================Summary Stats=================================================="
    )
    print(f"date_parser_coverage: {date_parser_coverage * 100:.1f}%")
    print(f"currency_coverage: {currency_coverage * 100:.1f}%")
    print(f"zipcode_coverage: {zipcode_coverage * 100:.1f}%")
    print(f"state_coverage: {state_coverage * 100:.1f}%")
    print(f"quntity_coverage: {quntity_coverage * 100:.1f}%")
    print(f"phonenumber_coverage: {phonenumber_coverage * 100:.1f}%")
    print(f"SSN_coverage: {SSN_coverage * 100:.1f}%")
    print(f"first_name_coverage: {first_name_coverage * 100:.1f}%")
    print(f"last_name_coverage: {last_name_coverage * 100:.1f}%")
    #print(f"alpha_number_coverage: {alpha_number_coverage * 100:.1f}%")
    lol = f"pickle_files/{pkl_name}.pkl"
    print("File name I got:", lol)
    print(f"percent_coverage: {percent_coverge * 100:.1f}%")
    with open(lol, 'rb') as f:
        label_model = pickle.load(f)
    #label_model = LabelModel(cardinality=15, verbose=True)
    #label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
    frame_to_train["label_number"] = label_model.predict(
        L=L_train, tie_break_policy="abstain")
    frame_to_train.label_number.fillna(0, inplace=True)
    frame_to_train['pred_names'] = frame_to_train.label_number.map(inv_et_dct)
    return frame_to_train
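The commented-out lines in Example 16 hint at how the pickled label model was produced. A minimal sketch consistent with the loading code above, assuming L_train comes from the same applier:

import pickle
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=15, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
with open(f"pickle_files/{pkl_name}.pkl", "wb") as f:
    pickle.dump(label_model, f)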
Example 17
def apply_lf_on_data(df_train, df_dev, sentences_number):
    """
    Apply the labeling functions (from labeled_function.py)
    to the given training data frame.
    Other parameters: df_dev (for further developing the LFs) and sentences_number for internal use.
    Return the training df with the tagging.
    """
    print("")
    print("Labeling Functions:")

    # Y_dev = df_dev.tag.values
    lfs = [labeled_function.masechet_then_parans, labeled_function.perek_then_parans,
           labeled_function.daf_in_parntes, labeled_function.no_double_parans,
           labeled_function.no_mishna]
    applier = PandasLFApplier(lfs=lfs)

    print("-Applying the labeling functions...")
    l_train = applier.apply(df=df_train)
    # l_dev = applier.apply(df=df_dev)

    print_analysis(l_train, lfs)

    print("-Applying the MajorityLabelVoter...")
    majority_model = MajorityLabelVoter()
    preds_train = majority_model.predict(L=l_train)

    #put predicted labels in df train
    print("-Removing unnecessary n-grams...")
    df_train['tag'] = preds_train
    for i in range(sentences_number):
        df_filter_by_sentences = df_train.loc[df_train['sentence_index'] == i]
        df_filter = df_filter_by_sentences.loc[df_filter_by_sentences['tag'] == 1]
        # this section handles cases of positively tagged ngram within a bigger positively tagged ngram, and removes it.
        for row_checked in df_filter.index:
            for row_other in df_filter.index:
                if df_filter['n_gram_id'][row_checked] != df_filter['n_gram_id'][row_other] and \
                        df_filter['text'][row_checked] in df_filter['text'][row_other]:
                    df_train = df_train[df_train.n_gram_id != df_filter['n_gram_id'][row_checked]]
                    break

    print("-Dropping the abstained and extra columns...")
    df_train = df_train.drop(["sentence_index", "n_gram_id"], axis=1)
    df_train = df_train[df_train['tag'] != ABSTAIN]
    print("DONE")
    return df_train
Example 18
    def test_labeling_convergence(self) -> None:
        """Test convergence of end to end labeling pipeline."""
        # Apply LFs
        labeling_functions = (
            [f]
            + [get_positive_labeling_function(divisor) for divisor in range(2, 9)]
            + [get_negative_labeling_function(divisor) for divisor in range(2, 9)]
        )
        applier = PandasLFApplier(labeling_functions)
        L_train = applier.apply(self.df_train, progress_bar=False)

        self.assertEqual(L_train.shape, (self.N_TRAIN, len(labeling_functions)))

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
        Y_lm = label_model.predict_proba(L_train).argmax(axis=1)
        Y = self.df_train.y
        err = np.where(Y != Y_lm, 1, 0).sum() / self.N_TRAIN
        self.assertLess(err, 0.05)
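get_positive_labeling_function and get_negative_labeling_function are assumed fixtures; a plausible reconstruction in which divisibility decides the vote:

from snorkel.labeling import LabelingFunction

def get_positive_labeling_function(divisor: int) -> LabelingFunction:
    # Vote for class 1 when num is divisible by divisor, else abstain.
    return LabelingFunction(
        f"lf_pos_{divisor}",
        lambda x, d=divisor: 1 if x.num % d == 0 else -1,
    )

def get_negative_labeling_function(divisor: int) -> LabelingFunction:
    # Vote for class 0 when num is not divisible by divisor, else abstain.
    return LabelingFunction(
        f"lf_neg_{divisor}",
        lambda x, d=divisor: 0 if x.num % d != 0 else -1,
    )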
Example 19
def curate_twitter(save_name='../../pandafied_data/curated_twitter.csv'):
    df_train = pd.read_csv('../../pandafied_data/pandafied_twitter.csv')
    #from utils import load_unlabeled_spam_dataset
    #df_train = load_unlabeled_spam_dataset()

    # Define the set of labeling functions (LFs)
    #lfs = [lf_keyword_wateroverlast,lf_keyword_voertuig,lf_keyword_aanrijding,lf_keyword_te_water,lf_keyword_persoon,lf_keyword_brand,lf_keyword_mps,lf_keyword_kps,lf_keyword_luchtdr]

    #lfs = [lf_keyword_keywords]

    lfs = [lf_keyword_wateroverlast]

    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(lfs)
    L_train = applier.apply(df_train)

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    df_train["label"] = label_model.predict(L=L_train,
                                            tie_break_policy="abstain")
    #tie_break_policy="true-random"
    #tie_break_policy="abstain"
    counter = 0
    for i in range(len(df_train["label"])):
        if df_train["label"][i] == WATER:
            print()
            print(df_train["text"][i])
            print(df_train["label"][i])
            print()
            counter += 1

    print("num entries total: " + str(len(df_train["label"])))
    print("num entries water: " + str(counter))

    #df_train = df_train[df_train.label != ABSTAIN]

    twitter_curated = df_train[df_train.label == WATER]
    twitter_curated = twitter_curated.drop(columns='label')
    twitter_curated.to_csv(save_name, index=False)
Example 20
    def apply_metamap(self, dataset: DataFrame):
        """
		Apply metamap to a list of sentences
		"""
        train_data = dataset.loc[dataset["split"] == "train"]
        test_data = dataset.loc[dataset["split"] == "test"]

        applier = PandasLFApplier(lfs=[metamap])

        l_metamap_train = applier.apply(df=train_data.head(10))
        l_metamap_test = applier.apply(df=test_data.head(10))

        return l_metamap_train, l_metamap_test
Example 21
def label_model_creator(df_dev, Y_dev, df_train, df_test, Y_test):

    # Accumulate all the labeling_functions for supply
    supply_lfs = [
        lf_supply, lf_customer, lf_sales_to, lf_our_customer, lf_acquisition,
        lf_people, lf_sold, lf_relation, lf_competition
    ]

    # Apply the above labeling functions to the data in Pandas dataframe formats
    applier = PandasLFApplier(supply_lfs)

    # Use the applier of the labeling functions to both development set and train set
    L_dev = applier.apply(df_dev)
    L_train = applier.apply(df_train)
    L_test = applier.apply(df_test)

    # cardinality: 2 (True and False)
    label_model = LabelModel(cardinality=2, verbose=True)

    # Fit the label_model
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500)

    # accuracy for the label model using the test set
    label_model_acc = label_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="random")["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

    # check the F-1 score and ROC_AUC score
    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    print(
        f"Label model f1 score: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')}"
    )
    print(
        f"Label model roc-auc: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')}"
    )

    return label_model, L_train
Example 22
    def train(self):
        '''
        Train the logistic regression discriminative model
        '''
        # We pull out the label vectors for ease of use later
        Y_test = self.df_test.label.values

        applier = PandasLFApplier(lfs=self.lfs)
        L_train = applier.apply(df=self.df_train)

        # Use the LabelModel to combine the labeling function outputs
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

        # Make predictions
        probs_train = label_model.predict_proba(L=L_train)

        # Filter abstained inputs
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=self.df_train, y=probs_train, L=L_train)

        # Represent each data point as a one-hot vector
        vectorizer = CountVectorizer(ngram_range=(1, 5))
        X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
        X_test = vectorizer.transform(self.df_test.text.tolist())

        # Turn probs into preds
        preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

        # Train logistic regression model
        sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
        sklearn_model.fit(X=X_train, y=preds_train_filtered)

        print(
            f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%"
        )
        dump(sklearn_model, 'sklearn_model.joblib')
        dump(vectorizer, 'vectorizer.joblib')
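The model and vectorizer are persisted with joblib at the end of train. A minimal load-and-predict sketch using those file names; the sample comment is made up:

from joblib import load

sklearn_model = load('sklearn_model.joblib')
vectorizer = load('vectorizer.joblib')
preds = sklearn_model.predict(
    vectorizer.transform(["free tickets!! check out my channel"]))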
Example 23
def labeling_evaluation(df_train, df_test, label_model):
    lfs = [
        LabelingFunction.lf_ind_keyword, LabelingFunction.lf_short,
        LabelingFunction.lf_cmp_re, LabelingFunction.lf_industry_keyword,
        LabelingFunction.lf_surname_re, LabelingFunction.industry_cls
    ]

    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train)
    L_test = applier.apply(df=df_test)
    Y_test = df_test.label.values
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    if label_model == "majority":
        majority_model = MajorityLabelVoter()
        preds_train = majority_model.predict(L=L_train)
        majority_acc = majority_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

        df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=preds_train, L=L_train)
        return df_train_filtered, preds_train_filtered, analysis

    if label_model == "weighted":
        label_model = LabelModel(cardinality=len(
            [c for c in dir(Polarity) if not c.startswith("__")]),
                                 verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
        probs_train = label_model.predict_proba(L_train)
        label_model_acc = label_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=probs_train, L=L_train)
        preds_train_filtered = probs_to_preds(probs_train_filtered)
        return df_train_filtered, probs_train_filtered, preds_train_filtered, analysis
Example 24
def weak_supervisor(dataframe, model_type):
    labeling_functions = [positive_labeling_function, positive1_labeling_function, negative_labeling_function,
                          negative1_labeling_function]
    pandasApplier = PandasLFApplier(lfs=labeling_functions)
    label_training_matrix = pandasApplier.apply(df=dataframe)

    if model_type == "label_model":
        # constructing a probabilistic label model
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train=label_training_matrix, n_epochs=300, log_freq=50, seed=123)
        dataframe["weak_labels"] = label_model.predict(L=label_training_matrix)
        print("dataframe shape: ", dataframe.shape)
        dataframe = dataframe[dataframe["weak_labels"] != -1]
        print("dataframe shape after filtering: ", dataframe.shape)
        return dataframe

    else:
        majorityLabelVoter = MajorityLabelVoter()
        dataframe["weak_labels"] = majorityLabelVoter.predict(L=label_training_matrix)
        print("dataframe shape: ", dataframe.shape)
        dataframe = dataframe[dataframe["weak_labels"] != -1]
        print("dataframe shape after filtering: ", dataframe.shape)
        return dataframe
Example 25
    def getTrainedModel2(self):
        # Apply the LFs to the unlabeled training data
        applier = PandasLFApplier(self.LFs)
        # apply() expects a DataFrame, so select the column as a frame
        L_train = applier.apply(self.train[['comments']])

        # Train the label model and compute the training labels
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
        self.train['resolution'] = label_model.predict(
            L=L_train, tie_break_policy="abstain")
        df_train = self.train[self.train.resolution != self.ABSTAIN]

        train_text = df_train.comments.tolist()
        vectorizer = CountVectorizer(ngram_range=(1, 2))
        X_train = vectorizer.fit_transform(train_text)

        clf = LogisticRegression(solver="lbfgs")
        clf.fit(X=X_train, y=df_train.resolution.values)
        # Reuse the fitted vectorizer so the test comments share the same features
        prob = clf.predict_proba(vectorizer.transform(self.test.comments.tolist()))

        if torch.cuda.is_available():
            device = 'cuda'
        else:
            device = 'cpu'
        end_model = EndModel([1000, 10, 2], seed=123, device=device)

        end_model.train_model(
            (self.train['comments'], self.test['comments']),
            valid_data=(self.train['resolution'], self.test['comments']),
            lr=0.01,
            l2=0.01,
            batch_size=256,
            n_epochs=5,
            checkpoint_metric='accuracy',
            checkpoint_metric_mode='max')

        return prob
Example 26
 def test_lf_applier_pandas_fault(self) -> None:
     df = pd.DataFrame(dict(num=DATA))
     applier = PandasLFApplier([f, f_bad])
     with self.assertRaises(AttributeError):
         applier.apply(df, progress_bar=False)
     L = applier.apply(df, progress_bar=False, fault_tolerant=True)
     np.testing.assert_equal(L, L_EXPECTED_BAD)
     L, meta = applier.apply(
         df, progress_bar=False, fault_tolerant=True, return_meta=True
     )
     np.testing.assert_equal(L, L_EXPECTED_BAD)
     self.assertEqual(meta, ApplierMetadata(dict(f_bad=5)))
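The f_bad fixture is assumed to raise during a plain apply(); a plausible reconstruction in which it touches a misspelled attribute:

from snorkel.labeling import labeling_function
from snorkel.types import DataPoint

@labeling_function()
def f_bad(x: DataPoint) -> int:
    # "mum" is a deliberate typo for "num": plain apply() raises
    # AttributeError, while fault_tolerant=True records the failures.
    return 0 if x.mum > 42 else -1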
Example 27
def predict_documents(documents: pd.DataFrame, trigger_label_model: LabelModel,
                      role_label_model: LabelModel):
    if 'event_triggers' not in documents and 'event_roles' not in documents:
        documents = documents.apply(pipeline.add_default_events, axis=1)

    # 1. Get trigger probabilities
    df_predict_triggers, _ = pipeline.build_event_trigger_examples(documents)
    trigger_lf_applier = PandasLFApplier(pipeline.get_trigger_list_lfs())
    L_predict_triggers = trigger_lf_applier.apply(df_predict_triggers)
    event_trigger_probs = trigger_label_model.predict_proba(L_predict_triggers)

    merged_event_trigger_examples = pipeline.merge_event_trigger_examples(
        df_predict_triggers,
        utils.zero_out_abstains(event_trigger_probs, L_predict_triggers))

    # 2. Get role probabilities
    df_predict_roles, _ = pipeline.build_event_role_examples(documents)
    role_lf_applier = PandasLFApplier(pipeline.get_role_list_lfs())
    L_predict_roles = role_lf_applier.apply(df_predict_roles)
    event_roles_probs = role_label_model.predict_proba(L_predict_roles)

    merged_event_role_examples = pipeline.merge_event_role_examples(
        df_predict_roles,
        utils.zero_out_abstains(event_roles_probs, L_predict_roles))

    # 3. Update documents with trigger & role probabilities
    labeled_documents: pd.DataFrame = documents.copy()
    # Make sure to remove event_triggers and roles that were built per default
    # (assign columns directly; writes to iterrows() rows do not persist)
    labeled_documents['event_triggers'] = [[] for _ in range(len(labeled_documents))]
    labeled_documents['event_roles'] = [[] for _ in range(len(labeled_documents))]
    if 'id' in labeled_documents:
        labeled_documents.set_index('id', inplace=True)

    triggers = merged_event_trigger_examples[['event_triggers']]
    roles = merged_event_role_examples[['event_roles']]

    labeled_documents.update(triggers)
    labeled_documents.update(roles)

    labeled_documents.reset_index(level=0, inplace=True)

    # 4. Add ACE events
    labeled_documents = ace_formatter.snorkel_to_ace_format(labeled_documents)
    return labeled_documents
Example 28
def apply_heuristics(args) -> Dict[str, Any]:
    stats: Dict[str, Any] = {}
    train_filename = PROJECT_DIR / args.dataset_path_train
    test_filename = PROJECT_DIR / args.dataset_path_test

    df_train = pd.read_csv(train_filename,
                           sep=';',
                           index_col=0,
                           nrows=args.rows_train)
    df_test = pd.read_csv(test_filename,
                          sep=';',
                          index_col=0,
                          nrows=args.rows_test)

    stats['commits_train'] = len(df_train.index)
    stats['bugs_fraction'] = (df_train.label.values == BUG).mean()
    stats['nonbugs_fraction'] = (df_train.label.values == BUGLESS).mean()

    clean_text_columns(df_train)
    clean_text_columns(df_test)

    lfs = bugs.heuristics + nonbugs.heuristics

    stats['n_labeling_functions'] = len(lfs)

    applier = PandasLFApplier(lfs=lfs)
    L_dev = applier.apply(df=df_train)
    L_dev.dump(PROJECT_DIR / args.save_heuristics_matrix_train_to)

    L_test = applier.apply(df=df_test)
    L_test.dump(PROJECT_DIR / args.save_heuristics_matrix_test_to)

    stats['coverage_train'] = sum((L_dev != -1).any(axis=1)) / len(L_dev)
    stats['coverage_test'] = sum((L_test != -1).any(axis=1)) / len(L_test)

    stats['majority_accuracy_train'] = majority_acc(L_dev, df_train)
    stats['majority_accuracy_test'] = majority_acc(L_test, df_test)

    return stats
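majority_acc is referenced but not shown; a plausible helper built on MajorityLabelVoter.score, assuming gold labels live in the DataFrame's label column:

import numpy as np
import pandas as pd
from snorkel.labeling.model import MajorityLabelVoter

def majority_acc(L: np.ndarray, df: pd.DataFrame) -> float:
    # Accuracy of the majority vote against gold labels, random tie-breaks.
    voter = MajorityLabelVoter()
    return voter.score(L=L, Y=df.label.values,
                       tie_break_policy="random")["accuracy"]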
Example 29
# Our next step is to apply the labeling functions we wrote to the unlabeled training data.
# The result is a *label matrix*, `L_train`, where each row corresponds to a data point and each column corresponds to a labeling function.
# Since the labeling functions have unknown accuracies and correlations, their output labels may overlap and conflict.
# We use the `LabelModel` to automatically estimate their accuracies and correlations, reweight and combine their labels, and produce our final set of clean, integrated training labels:

# %%
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

# Define the set of labeling functions (LFs)
lfs = [
    lf_keyword_my, lf_regex_check_out, lf_short_comment, lf_textblob_polarity
]

# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)

# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
df_train["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

# %% [markdown]
# Note that we used the `LabelModel` to label data; however, on many data points, all the labeling functions abstain, and so the `LabelModel` abstains as well.
# We'll filter these data points out of our training set now:

# %%
df_train = df_train[df_train.label != ABSTAIN]

# %% [markdown]
Example 30
# [`PandasLFApplier`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/labeling/snorkel.labeling.PandasLFApplier.html).
# Correspondingly, a single data point `x` that's passed into our LFs will be a [Pandas `Series` object](https://pandas.pydata.org/pandas-docs/stable/reference/series.html).
#
# It's important to note that these LFs will work for any object with an attribute named `text`, not just Pandas objects.
# Snorkel has several other appliers for different data point collection types which you can browse in the [API documentation](https://snorkel.readthedocs.io/en/master/packages/labeling.html).
#
# The output of the `apply(...)` method is a ***label matrix***, a fundamental concept in Snorkel.
# It's a NumPy array `L` with one column for each LF and one row for each data point, where `L[i, j]` is the label that the `j`th labeling function output for the `i`th data point.
# We'll create one label matrix for the `train` set and one for the `dev` set.

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import PandasLFApplier

lfs = [check_out, check]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)
L_dev = applier.apply(df=df_dev)

# %%
L_train

# %% [markdown]
# ### c) Evaluate performance on training and development sets

# %% [markdown]
# We can easily calculate the coverage of these LFs (i.e., the percentage of the dataset that they label) as follows:

# %%
coverage_check_out, coverage_check = (L_train != ABSTAIN).mean(axis=0)
print(f"check_out coverage: {coverage_check_out * 100:.1f}%")