def test_get_and_union_features_tuple(self):
    """Named (name, transformer) tuples yield a FeatureUnion keeping those names.

    Passes a list of two explicitly named transformers and checks that the
    result is a ``FeatureUnion`` whose ``transformer_list`` names are
    ``'feat1'`` and ``'feat2'``, in that order.
    """
    named_transformers = [
        ('feat1', WeningerFeatures()),
        ('feat2', KohlschuetterFeatures()),
    ]
    union = util.get_and_union_features(named_transformers)

    self.assertIsInstance(union, FeatureUnion)
    step_names = [entry[0] for entry in union.transformer_list]
    self.assertEqual(step_names, ['feat1', 'feat2'])
def test_get_and_union_features_tuple():
    """Named (name, transformer) tuples yield a FeatureUnion keeping those names.

    Passes a list of two explicitly named transformers and checks that the
    result is a ``FeatureUnion`` whose ``transformer_list`` names are
    ``'feat1'`` and ``'feat2'``, in that order.
    """
    named_transformers = [
        ('feat1', WeningerFeatures()),
        ('feat2', KohlschuetterFeatures()),
    ]
    union = util.get_and_union_features(named_transformers)

    assert isinstance(union, FeatureUnion)
    step_names = [entry[0] for entry in union.transformer_list]
    assert step_names == ['feat1', 'feat2']
def test_get_and_union_features_instances():
    """A list of bare transformer instances yields a FeatureUnion.

    The names found in the resulting ``transformer_list`` are expected to be
    ``'weningerfeatures'`` and ``'kohlschuetterfeatures'``, in that order.
    """
    transformers = [WeningerFeatures(), KohlschuetterFeatures()]
    union = util.get_and_union_features(transformers)

    assert isinstance(union, FeatureUnion)
    step_names = [entry[0] for entry in union.transformer_list]
    assert step_names == ['weningerfeatures', 'kohlschuetterfeatures']
def test_get_and_union_features_instance():
    """A single transformer instance yields a ``WeningerFeatures`` result."""
    result = util.get_and_union_features(WeningerFeatures())

    assert isinstance(result, WeningerFeatures)
def test_get_and_union_features_str():
    """The string ``'weninger'`` yields a ``WeningerFeatures`` instance."""
    result = util.get_and_union_features('weninger')

    assert isinstance(result, WeningerFeatures)
def test_extractor(html):
    """Check that ``Extractor.extract`` matches running its components by hand.

    Builds a blockifier, a three-part feature set, and a ``LogisticRegression``
    whose attributes are assigned from hard-coded values instead of being fit.
    The content returned by ``Extractor.extract`` must be non-``None`` and equal
    to the text obtained by blockifying, transforming, thresholding the
    predicted probabilities, and joining the selected blocks manually.

    Args:
        html: HTML input handed to both the extractor and the blockifier
            (presumably supplied by a pytest fixture -- confirm).
    """
    prob_threshold = 0.5
    blockifier = TagCountNoCSSReadabilityBlockifier()
    features = get_and_union_features(
        ['weninger', 'kohlschuetter', 'readability'])

    # Pre-fit attribute values assigned onto the model below, so no
    # training happens inside the test.
    model_attrs = {
        'C': 1.0,
        'class_weight': None,
        'classes_': [0, 1],
        'coef_': [[
            0.00501458328421719,
            -0.0006331822163374379,
            -0.6699789320373452,
            0.026069227973339763,
            -1.5552477377277252,
            0.02980432745983307,
            -0.965575689884716,
            0.019509367890934326,
            -0.35692924115362307
        ]],
        'dual': False,
        'fit_intercept': True,
        'intercept_': [-1.2071425754440765],
        'intercept_scaling': 1,
        'max_iter': 100,
        'multi_class': 'ovr',
        'n_iter_': [12],
        'n_jobs': 1,
        'penalty': 'l2',
        'solver': 'liblinear',
        'tol': 0.0001,
        'warm_start': False
    }
    model = LogisticRegression()
    for attr_name, attr_value in model_attrs.items():
        # List-valued attributes are stored as numpy arrays; others as-is.
        setattr(
            model,
            attr_name,
            np.array(attr_value) if isinstance(attr_value, list) else attr_value,
        )

    # Path 1: let the Extractor class do everything.
    extractor = Extractor(
        blockifier,
        features=features,
        model=model,
        to_extract='content',
        prob_threshold=prob_threshold,
    )
    extractor_content = extractor.extract(html)

    # Path 2: run the individual components by hand.
    blocks = blockifier.blockify(html)
    features_mat = features.transform(blocks)
    positive_idx = list(model.classes_).index(1)
    probabilities = model.predict_proba(features_mat)
    above_threshold = probabilities > prob_threshold
    preds = above_threshold[:, positive_idx].astype(int)
    selected_indices = np.flatnonzero(preds)
    components_content = '\n'.join(
        str_cast(blocks[ind].text) for ind in selected_indices)

    assert extractor_content is not None
    assert extractor_content == components_content
def test_get_and_union_features_instances(self):
    """A list of bare transformer instances yields a FeatureUnion.

    The names found in the resulting ``transformer_list`` are expected to be
    ``'weningerfeatures'`` and ``'kohlschuetterfeatures'``, in that order.
    """
    transformers = [WeningerFeatures(), KohlschuetterFeatures()]
    union = util.get_and_union_features(transformers)

    self.assertIsInstance(union, FeatureUnion)
    step_names = [entry[0] for entry in union.transformer_list]
    self.assertEqual(
        step_names, ['weningerfeatures', 'kohlschuetterfeatures'])
def test_get_and_union_features_instance(self):
    """A single transformer instance yields a ``WeningerFeatures`` result."""
    result = util.get_and_union_features(WeningerFeatures())

    self.assertIsInstance(result, WeningerFeatures)
def test_get_and_union_features_str(self):
    """The string ``'weninger'`` yields a ``WeningerFeatures`` instance."""
    result = util.get_and_union_features('weninger')

    self.assertIsInstance(result, WeningerFeatures)