def test_pattern_lookups(self):
    """A (0, 'lower')/(1, 'token') pattern pairs each token's lowercase form
    with the next raw token; past the sequence end the configured
    ``out_value`` ('OUT') is substituted.

    NOTE(review): an identical ``test_pattern_lookups`` is defined again
    later in this class and shadows this one — only one copy actually runs;
    the duplicate should be removed.
    """
    extractor = HtmlFeatureExtractor(
        token_features=[token_lower, token_identity],
        global_features=[
            Pattern((0, 'lower'), (1, 'token'), out_value='OUT'),
        ],
    )
    features = extractor.transform_single(self.html_tokens)
    observed = [feat['lower/token[+1]'] for feat in features]
    self.assertListEqual(
        observed,
        ['hello/John', 'john/Doe', 'doe/Mary', 'mary/said', 'said/OUT'],
    )
def test_pattern_lookups(self):
    """Check forward-looking pattern features: 'lower/token[+1]' combines the
    current token's lowercase value with the following raw token, and the
    out-of-range slot at the last position becomes the ``out_value`` 'OUT'.

    NOTE(review): this definition duplicates an earlier
    ``test_pattern_lookups`` in the same class (this one shadows it);
    one of the two should be deleted.
    """
    fe = HtmlFeatureExtractor(
        token_features=[token_lower, token_identity],
        global_features=[
            Pattern((0, 'lower'), (1, 'token'), out_value='OUT'),
        ],
    )
    rows = fe.transform_single(self.html_tokens)
    expected = ['hello/John', 'john/Doe', 'doe/Mary', 'mary/said', 'said/OUT']
    self.assertListEqual([row['lower/token[+1]'] for row in rows], expected)
def test_pattern(self):
    """Backward-looking patterns: 'lower[-2]/lower[-1]' is absent for the
    first token (both lookups out of range) and uses '?' for the single
    missing slot at the second token.

    NOTE(review): an identical ``test_pattern`` is defined again later in
    this class and shadows this one — the duplicate should be removed.
    """
    extractor = HtmlFeatureExtractor(
        token_features=[token_lower, token_identity],
        global_features=[Pattern((-2, 'lower'), (-1, 'lower'))],
    )
    features = extractor.transform_single(self.html_tokens)
    key = 'lower[-2]/lower[-1]'
    # First token: both offsets fall outside the sequence, so no feature.
    self.assertNotIn(key, features[0])
    observed = [feat[key] for feat in features[1:]]
    self.assertListEqual(
        observed,
        ['?/hello', 'hello/john', 'john/doe', 'doe/mary'],
    )
def test_pattern(self):
    """Verify negative-offset pattern features over lowercase tokens: the
    key is missing entirely at position 0 and '?' marks the one unresolved
    lookup at position 1.

    NOTE(review): this definition duplicates an earlier ``test_pattern``
    in the same class (this one shadows it); delete one of the two.
    """
    fe = HtmlFeatureExtractor(
        token_features=[token_lower, token_identity],
        global_features=[Pattern((-2, 'lower'), (-1, 'lower'))],
    )
    rows = fe.transform_single(self.html_tokens)
    key = 'lower[-2]/lower[-1]'
    self.assertNotIn(key, rows[0])
    expected = ['?/hello', 'hello/john', 'john/doe', 'doe/mary']
    self.assertListEqual([row[key] for row in rows[1:]], expected)
def create_wapiti_pipeline(model_filename=None,
                           token_features=None,
                           global_features=None,
                           min_df=1,
                           **crf_kwargs):
    """
    Build a scikit-learn :class:`Pipeline` for HTML tagging with Wapiti.

    The pipeline consumes the output of :class:`~.HtmlTokenizer` and emits
    sequences of IOB2 tags. ``token_features`` and ``global_features`` are
    forwarded to :class:`HtmlFeatureExtractor` (together with ``min_df``);
    ``model_filename`` and any extra keyword arguments go to
    :class:`WapitiCRF`.

    Example::

        import webstruct
        from webstruct.features import EXAMPLE_TOKEN_FEATURES

        # load train data
        html_tokenizer = webstruct.HtmlTokenizer()
        train_trees = webstruct.load_trees(
            "train/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_train, y_train = html_tokenizer.tokenize(train_trees)

        # train
        model = webstruct.create_wapiti_pipeline(
            model_filename = 'model.wapiti',
            token_features = EXAMPLE_TOKEN_FEATURES,
            train_args = '--algo l-bfgs --maxiter 50 --nthread 8 --jobsize 1 --stopwin 10',
        )
        model.fit(X_train, y_train)

        # load test data
        test_trees = webstruct.load_trees(
            "test/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_test, y_test = html_tokenizer.tokenize(test_trees)

        # do a prediction
        y_pred = model.predict(X_test)
    """
    # Avoid a mutable default; None means "no token features".
    features = token_features if token_features is not None else []
    extractor = HtmlFeatureExtractor(features, global_features, min_df=min_df)
    crf = WapitiCRF(model_filename, **crf_kwargs)
    return Pipeline([
        ('fe', extractor),
        ('crf', crf),
    ])
def create_crfsuite_pipeline(token_features=None,
                             global_features=None,
                             min_df=1,
                             **crf_kwargs):
    """
    Build a :class:`CRFsuitePipeline` for HTML tagging with CRFsuite.

    The pipeline consumes the output of :class:`~.HtmlTokenizer` and emits
    sequences of IOB2 tags. ``token_features`` and ``global_features`` are
    forwarded to :class:`HtmlFeatureExtractor` (together with ``min_df``);
    any extra keyword arguments configure :class:`sklearn_crfsuite.CRF`.

    Example::

        import webstruct
        from webstruct.features import EXAMPLE_TOKEN_FEATURES

        # load train data
        html_tokenizer = webstruct.HtmlTokenizer()
        train_trees = webstruct.load_trees(
            "train/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_train, y_train = html_tokenizer.tokenize(train_trees)

        # train
        model = webstruct.create_crfsuite_pipeline(
            token_features = EXAMPLE_TOKEN_FEATURES,
        )
        model.fit(X_train, y_train)

        # load test data
        test_trees = webstruct.load_trees(
            "test/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_test, y_test = html_tokenizer.tokenize(test_trees)

        # do a prediction
        y_pred = model.predict(X_test)
    """
    # Imported lazily so sklearn_crfsuite is only required when this
    # factory is actually used.
    from sklearn_crfsuite import CRF

    features = token_features if token_features is not None else []
    extractor = HtmlFeatureExtractor(features, global_features, min_df=min_df)
    return CRFsuitePipeline(extractor, CRF(**crf_kwargs))