Example #1
    def iter_annotations(self, drop_duplicates=True, verbose=False, leave=False):
        """ Return an iterator over (form, type) tuples. """

        # Sort pages by domain so the iteration order is deterministic.
        sorted_items = sorted(
            self.get_index().items(),
            key=lambda it: get_domain(it[1]["url"])
        )

        if verbose:
            sorted_items = tqdm(sorted_items, desc="Loading",
                                mininterval=0, leave=leave)

        seen = set()  # fingerprints of forms already yielded (deduplication)
        for filename, info in sorted_items:
            with open(os.path.join(self.folder, filename), "rb") as f:
                tree = load_html(f.read(), info["url"])

            for form, tp in zip(tree.xpath("//form"), info["types"]):

                # forms annotated with the special 'X' type are skipped
                if tp == 'X':
                    continue

                if drop_duplicates:
                    fp = self.get_fingerprint(form)
                    if fp in seen:
                        continue
                    seen.add(fp)

                yield form, tp

        if verbose and leave:
            print("")
Example #2
    def iter_trees(self, index=None):
        """
        Return an iterator over ``(filename, tree, info)`` tuples,
        where ``filename`` is a relative file name, ``tree`` is an lxml tree
        and ``info`` is a dictionary with annotation data.
        """
        if index is None:
            index = self.get_index()
        # Sort by (domain, filename) for a stable, domain-grouped order.
        sorted_items = sorted(index.items(),
                              key=lambda it: (get_domain(it[1]["url"]), it[0]))
        for path, info in sorted_items:
            tree = self.get_tree(path, info)
            yield path, tree, info
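
A similar sketch for consuming this iterator (again assuming the hypothetical storage instance from the previous example):

for path, tree, info in storage.iter_trees():
    # count the forms on each annotated page
    print(path, info["url"], len(tree.xpath("//form")))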
Example #3
def get_realistic_form_labels(annotations,
                              n_splits=10,
                              model=None,
                              full_type_names=True):
    """
    Return form type labels which form type detection model
    is likely to produce.
    """
    if model is None:
        model = get_model()

    X, y = get_Xy(annotations, full_type_names)
    group_kfold = GroupKFold(n_splits=n_splits)
    groups = [get_domain(ann.url) for ann in annotations]
    # Out-of-fold predictions: each form is labelled by a model trained
    # without any pages from its own domain.
    return cross_val_predict(model, X, y, cv=group_kfold, groups=groups)
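
The pattern used above, out-of-fold predictions with domain-based groups, can be illustrated with a self-contained scikit-learn sketch on synthetic data:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold, cross_val_predict

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = rng.randint(0, 2, size=100)
groups = rng.randint(0, 10, size=100)  # e.g. one id per domain

# Each sample is predicted by a model that never saw its group, so
# y_pred reflects realistic out-of-sample behaviour of the model.
y_pred = cross_val_predict(LogisticRegression(), X, y,
                           cv=GroupKFold(n_splits=5), groups=groups)
print((y_pred == y).mean())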
Example #4
def get_annotation_folds(annotations, n_folds):
    """
    Return (train_indices, test_indices) folds iterator.
    It is guaranteed forms from the same website can't be both in
    train and test parts.

    We must be careful when splitting the dataset into training and
    evaluation parts: forms from the same domain should be in the same
    "bin". There could be several pages from the same domain, and these
    pages may have duplicate or similar forms (e.g. a search form on each
    page). If we put one such form in training dataset and another in
    evaluation dataset then the metrics will be too optimistic, and they
    can make us to choose wrong features/models. For example,
    train_test_split from scikit-learn shouldn't be used here. To fix it
    LabelKFold from scikit-learn is used.
    """
    return LabelKFold(labels=[get_domain(ann.url) for ann in annotations],
                      n_folds=n_folds)
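
LabelKFold lived in sklearn.cross_validation and was removed in scikit-learn 0.20. A sketch of the same grouping idea with the modern GroupKFold API (illustrative, not part of the original code):

from sklearn.model_selection import GroupKFold

def get_annotation_folds(annotations, n_folds):
    # GroupKFold.split only uses X for its length, so the annotations
    # list itself can be passed; groups carries the per-sample domain.
    groups = [get_domain(ann.url) for ann in annotations]
    return GroupKFold(n_splits=n_folds).split(annotations, groups=groups)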
Example #5
def print_classification_report(annotations, n_splits=10, model=None):
    """ Evaluate model, print classification report """
    if model is None:
        # FIXME: we're overfitting on hyperparameters - they should be chosen
        # using inner cross-validation, not set to fixed values beforehand.
        model = get_model(use_precise_form_types=True)

    annotations = [a for a in annotations if a.fields_annotated]
    form_types = formtype_model.get_realistic_form_labels(
        annotations=annotations,
        n_splits=n_splits,
        full_type_names=False
    )

    X, y = get_Xy(
        annotations=annotations,
        form_types=form_types,
        full_type_names=True,
    )
    group_kfold = GroupKFold(n_splits=n_splits)
    groups = [get_domain(ann.url) for ann in annotations]
    y_pred = cross_val_predict(model, X, y, cv=group_kfold, groups=groups,
                               n_jobs=-1)

    # Order labels consistently with the field schema so the report
    # is easy to read.
    all_labels = list(annotations[0].field_schema.types.keys())
    labels = sorted(set(flatten(y_pred)), key=lambda k: all_labels.index(k))
    print(flat_classification_report(y, y_pred, digits=2,
                                     labels=labels, target_names=labels))

    print("{:0.1f}% of fields are classified correctly.".format(
        flat_accuracy_score(y, y_pred) * 100
    ))
    print("All fields are classified correctly in {:0.1f}% of forms.".format(
        sequence_accuracy_score(y, y_pred) * 100
    ))
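
The flat_* and sequence metrics used in this example match the names exported by sklearn_crfsuite.metrics; assuming the snippet's unshown imports follow that convention, they would be:

from sklearn_crfsuite.metrics import (
    flat_accuracy_score,
    flat_classification_report,
    sequence_accuracy_score,
)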
Example #6
def print_classification_report(annotations, n_splits=10, model=None):
    """ Evaluate the model and print a form-level classification report. """
    if model is None:
        # FIXME: we're overfitting on hyperparameters - they should be chosen
        # using inner cross-validation, not set to fixed values beforehand.
        model = get_model()

    X, y = get_Xy(annotations, full_type_names=True)
    group_kfold = GroupKFold(n_splits=n_splits)
    groups = [get_domain(ann.url) for ann in annotations]
    y_pred = cross_val_predict(model, X, y, cv=group_kfold, groups=groups)

    # hack to format report nicely
    all_labels = list(annotations[0].form_schema.types.keys())
    labels = sorted(set(y_pred), key=lambda k: all_labels.index(k))
    print(classification_report(y,
                                y_pred,
                                digits=2,
                                labels=labels,
                                target_names=labels))

    print("{:0.1f}% of forms are classified correctly.".format(
        accuracy_score(y, y_pred) * 100))
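
Unlike the field-level report above, this variant scores a single label per form, so the plain scikit-learn metrics suffice (again an assumption, since the imports are not shown):

from sklearn.metrics import accuracy_score, classification_report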
Example #7
def train(annotations,
          use_precise_form_types=True,
          optimize_hyperparameters_iters=0,
          optimize_hyperparameters_folds=5,
          optimize_hyperparameters_jobs=-1,
          full_form_type_names=False,
          full_field_type_names=True,
          verbose=True):
    def log(msg):
        if verbose:
            print(msg)

    annotations = [
        a for a in annotations if a.fields_annotated and (
            a.form_annotated or not use_precise_form_types)
    ]
    log("Training on {} forms".format(len(annotations)))

    if use_precise_form_types:
        log("Using precise form types")
        if full_form_type_names:
            form_types = np.asarray([a.type_full for a in annotations])
        else:
            form_types = np.asarray([a.type for a in annotations])
    else:
        log("Computing realistic form types")
        form_types = formtype_model.get_realistic_form_labels(
            annotations=annotations,
            n_splits=10,
            full_type_names=full_form_type_names)

    log("Extracting features")
    X, y = get_Xy(
        annotations=annotations,
        form_types=form_types,
        full_type_names=full_field_type_names,
    )

    crf = get_model(use_precise_form_types)

    if optimize_hyperparameters_iters != 0:
        if optimize_hyperparameters_iters < 50:
            warnings.warn(
                "RandomizedSearchCV n_iter is low, results may be unstable. "
                "Consider increasing optimize_hyperparameters_iters.")

        log("Finding best hyperparameters using randomized search")
        params_space = {
            # c1/c2 are the CRF's L1/L2 regularization weights.
            'c1': scipy.stats.expon(scale=0.5),
            'c2': scipy.stats.expon(scale=0.05),
        }
        rs = RandomizedSearchCV(
            crf,
            params_space,
            cv=GroupKFold(n_splits=optimize_hyperparameters_folds),
            verbose=verbose,
            n_jobs=optimize_hyperparameters_jobs,
            n_iter=optimize_hyperparameters_iters,
            iid=False,  # `iid` was removed in scikit-learn 0.24
            scoring=scorer)  # `scorer` is defined at module level (not shown)
        rs.fit(X, y, groups=[get_domain(ann.url) for ann in annotations])
        crf = rs.best_estimator_
        log("Best hyperparameters: c1={:0.5f}, c2={:0.5f}".format(
            crf.c1, crf.c2))
    else:
        crf.fit(X, y)

    return crf
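
A sketch of a typical call (the annotations list and the surrounding helpers come from the same codebase; the numbers are illustrative):

crf = train(
    annotations,
    optimize_hyperparameters_iters=50,  # 0 (the default) skips the search
)
# sklearn-crfsuite models expose predict() for new feature sequences:
# y_pred = crf.predict(X_new)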