Example #1
 def test_pattern_lookups(self):
     """Out-of-range lookups in a Pattern are filled with the configured out_value."""
     extractor = HtmlFeatureExtractor(
         token_features=[token_lower, token_identity],
         global_features=[
             Pattern((0, 'lower'), (1, 'token'), out_value='OUT'),
         ],
     )
     rows = extractor.transform_single(self.html_tokens)
     observed = [row['lower/token[+1]'] for row in rows]
     expected = ['hello/John', 'john/Doe', 'doe/Mary', 'mary/said', 'said/OUT']
     self.assertListEqual(observed, expected)
 def test_pattern_lookups(self):
     """The last token's (+1) lookup falls off the sequence and yields 'OUT'."""
     pattern = Pattern((0, 'lower'), (1, 'token'), out_value='OUT')
     extractor = HtmlFeatureExtractor(
         token_features=[token_lower, token_identity],
         global_features=[pattern],
     )
     rows = extractor.transform_single(self.html_tokens)
     self.assertListEqual(
         [row['lower/token[+1]'] for row in rows],
         ['hello/John', 'john/Doe', 'doe/Mary', 'mary/said', 'said/OUT'],
     )
Example #3
    def test_pattern(self):
        """Negative-offset patterns: missing for the first token, '?'-padded after."""
        extractor = HtmlFeatureExtractor(
            token_features=[token_lower, token_identity],
            global_features=[Pattern((-2, 'lower'), (-1, 'lower'))],
        )
        rows = extractor.transform_single(self.html_tokens)

        key = 'lower[-2]/lower[-1]'
        # The very first token has no preceding tokens, so the feature is absent.
        self.assertNotIn(key, rows[0])
        expected = ['?/hello', 'hello/john', 'john/doe', 'doe/mary']
        self.assertListEqual([row[key] for row in rows[1:]], expected)
    def test_pattern(self):
        """A (-2, -1) lower/lower pattern pads unavailable positions with '?'."""
        pattern = Pattern((-2, 'lower'), (-1, 'lower'))
        extractor = HtmlFeatureExtractor(
            token_features=[token_lower, token_identity],
            global_features=[pattern],
        )
        rows = extractor.transform_single(self.html_tokens)

        key = 'lower[-2]/lower[-1]'
        # No lookbehind exists for the first token at all.
        self.assertNotIn(key, rows[0])
        observed = [row[key] for row in rows[1:]]
        self.assertListEqual(
            observed,
            ['?/hello', 'hello/john', 'john/doe', 'doe/mary'],
        )
Example #5
def create_wapiti_pipeline(model_filename=None,
                           token_features=None,
                           global_features=None,
                           min_df=1,
                           **crf_kwargs):
    """
    Build a scikit-learn Pipeline for HTML tagging backed by Wapiti.

    The pipeline consumes data produced by :class:`~.HtmlTokenizer` and
    emits sequences of IOB2 tags. Extra keyword arguments are forwarded
    to :class:`WapitiCRF`.

    Example::

        import webstruct
        from webstruct.features import EXAMPLE_TOKEN_FEATURES

        # load train data
        html_tokenizer = webstruct.HtmlTokenizer()
        train_trees = webstruct.load_trees(
            "train/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_train, y_train = html_tokenizer.tokenize(train_trees)

        # train
        model = webstruct.create_wapiti_pipeline(
            model_filename = 'model.wapiti',
            token_features = EXAMPLE_TOKEN_FEATURES,
            train_args = '--algo l-bfgs --maxiter 50 --nthread 8 --jobsize 1 --stopwin 10',
        )
        model.fit(X_train, y_train)

        # load test data
        test_trees = webstruct.load_trees(
            "test/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_test, y_test = html_tokenizer.tokenize(test_trees)

        # do a prediction
        y_pred = model.predict(X_test)

    """
    # Default to no token features rather than a shared mutable default.
    features = [] if token_features is None else token_features

    extractor = HtmlFeatureExtractor(features, global_features, min_df=min_df)
    crf = WapitiCRF(model_filename, **crf_kwargs)
    return Pipeline([('fe', extractor), ('crf', crf)])
Example #6
def create_crfsuite_pipeline(token_features=None,
                             global_features=None,
                             min_df=1,
                             **crf_kwargs):
    """
    Build a :class:`CRFsuitePipeline` for HTML tagging using CRFsuite.

    The pipeline consumes data produced by :class:`~.HtmlTokenizer` and
    emits sequences of IOB2 tags. Extra keyword arguments are forwarded
    to :class:`sklearn_crfsuite.CRF`.

    Example::

        import webstruct
        from webstruct.features import EXAMPLE_TOKEN_FEATURES

        # load train data
        html_tokenizer = webstruct.HtmlTokenizer()
        train_trees = webstruct.load_trees(
            "train/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_train, y_train = html_tokenizer.tokenize(train_trees)

        # train
        model = webstruct.create_crfsuite_pipeline(
            token_features = EXAMPLE_TOKEN_FEATURES,
        )
        model.fit(X_train, y_train)

        # load test data
        test_trees = webstruct.load_trees(
            "test/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_test, y_test = html_tokenizer.tokenize(test_trees)

        # do a prediction
        y_pred = model.predict(X_test)

    """
    # Imported lazily so the module works without sklearn_crfsuite installed.
    from sklearn_crfsuite import CRF

    features = [] if token_features is None else token_features
    extractor = HtmlFeatureExtractor(features, global_features, min_df=min_df)
    return CRFsuitePipeline(extractor, CRF(**crf_kwargs))