Example #1
    def fit(self, dpacks, targets):
        # stack the per-document datapacks and targets into single
        # arrays, fit the underlying learner, and remember its labels
        dpack = DataPack.vstack(dpacks)
        target = np.concatenate(targets)
        self._learner.fit(dpack.data, target)
        self._labels = [dpack.get_label(x) for x in self._learner.classes_]
        self._fitted = True
        return self
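To show the stack-then-fit pattern in isolation, here is a minimal, self-contained sketch using plain numpy arrays and scikit-learn in place of attelo's DataPack and learner wrapper; the toy data and the LogisticRegression choice are illustrative assumptions, not part of the example:

import numpy as np
from sklearn.linear_model import LogisticRegression

# toy stand-ins: one feature matrix and one target vector per document
doc_feats = [np.ones((10, 4)), np.zeros((8, 4))]
doc_targets = [np.ones(10, dtype=int), np.zeros(8, dtype=int)]

data = np.vstack(doc_feats)           # analogous to DataPack.vstack(dpacks)
target = np.concatenate(doc_targets)  # same concatenation as in fit() above
learner = LogisticRegression().fit(data, target)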
Example #2
    def fit(self, dpacks, targets, nonfixed_pairs=None):
        # WIP: select only the nonfixed pairs
        if nonfixed_pairs is not None:
            dpacks = [dpack.selected(nf_pairs)
                      for dpack, nf_pairs in zip(dpacks, nonfixed_pairs)]
            targets = [target[nf_pairs]
                       for target, nf_pairs in zip(targets, nonfixed_pairs)]

        dpack = DataPack.vstack(dpacks)
        target = np.concatenate(targets)
        self._learner.fit(dpack.data, target)
        self._fitted = True
        return self
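The target[nf_pairs] step above relies on numpy integer-array indexing; a tiny self-contained sketch of just that selection, with made-up values:

import numpy as np

target = np.array([10, 11, 12, 13, 14])
nf_pairs = [0, 2, 4]      # indices of the nonfixed pairs
print(target[nf_pairs])   # [10 12 14]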
Example #3
    def fit(self, dpacks, targets, nonfixed_pairs=None):
        # WIP: select only the nonfixed pairs
        if nonfixed_pairs is not None:
            dpacks = [
                dpack.selected(nf_pairs)
                for dpack, nf_pairs in zip(dpacks, nonfixed_pairs)
            ]
            targets = [
                target[nf_pairs]
                for target, nf_pairs in zip(targets, nonfixed_pairs)
            ]

        dpack = DataPack.vstack(dpacks)
        target = np.concatenate(targets)
        self._learner.fit(dpack.data, target)
        self._fitted = True
        return self
Example #4
    train_packs = select_training(mpack, fold_dict, fold).values()
    parser.fit(train_packs,
               [x.target for x in train_packs])

    fold_predictions = []
    # decode each document separately
    test_pack = select_testing(mpack, fold_dict, fold)
    for onedoc, dpack in test_pack.items():
        print("decoding on file : ", onedoc, file=sys.stderr)
        dpack = parser.transform(dpack)
        prediction = prediction_to_triples(dpack)
        # print("Predictions: ", prediction)
        # record the prediction score
        scores.append(score_edges(dpack, prediction))
        # optional: save the predictions for further inspection
        fold_predictions.extend(prediction)

    # optional: write predictions for this fold
    output_file = fp.join(TMP_OUTPUT, 'fold-%d' % fold)
    print("writing: %s" % output_file, file=sys.stderr)
    write_predictions_output(DataPack.vstack(test_pack.values()),
                             fold_predictions, output_file)

report = EdgeReport(scores)

# a combined report provides scores for multiple configurations
# here, we are only using it for the single config
combined_report = CombinedReport(EdgeReport,
                                 {('maxent', 'mst'): report})
print(combined_report.table())
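The indented part of this example reads like the body of a per-fold loop; below is a hedged sketch of scaffolding it could sit in. The round-robin fold assignment and the num_folds value are assumptions for illustration, not taken from the example:

import sys

num_folds = 10  # arbitrary illustrative choice
# hypothetical: map each document name to a fold index, round-robin
fold_dict = {doc: i % num_folds for i, doc in enumerate(sorted(mpack))}

scores = []
for fold in range(num_folds):
    print("fold %d ..." % fold, file=sys.stderr)
    # per-fold body as in the example above: fit on the training packs,
    # decode each test document, and extend `scores`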
Example #5
def full_report(mpack, fold_dict, slices, metrics, adjust_pack=None):
    """Generate a report across a set of folds and configurations.

    This is a bit tricky, as we have to accumulate per-configuration
    results over the folds.

    Here configurations are just arbitrary strings.

    Parameters
    ----------
    mpack : TODO
        TODO

    fold_dict : TODO
        TODO

    slices : iterable of Slice
        Predictions for each configuration, for each fold.
        Folds should be contiguous for maximum efficiency.
        It may be worthwhile to generate this lazily.

    metrics : iterable of {'edges', 'edges_by_label', 'edus', 'cspans'}
        Set of selected metrics.
        For the RST corpus, 'cspans' should not be selected for evaluation
        on the intra, inter and root subsets until I find out how to restrict
        RSTTrees along DataPack.selected().

    adjust_pack : function from DataPack to DataPack, optional
        Function that modifies a DataPack, for example by picking out a
        subset of the pairings.


    Returns
    -------
    rpack : ReportPack
        Group of reports on a set of folds and configurations.

    TODO
    ----
    * [ ] refactor ReportPack so that it contains reports only for the
      metrics selected by the harness
    """
    if not mpack:
        raise ValueError("Can't report with empty multipack")
    edge_count = defaultdict(list)
    cspan_count = defaultdict(list)
    edge_lab_count = defaultdict(lambda: defaultdict(list))
    edu_reports = defaultdict(EduReport)
    dpack0 = next(iter(mpack.values()))  # first datapack (py2/py3 safe)
    confusion = defaultdict(lambda: empty_confusion_matrix(dpack0))

    fold = None
    is_first_slice = True

    # avoid slicing the predictions if we can help it (slow);
    # NB: pick adjust_predictions before defaulting adjust_pack, else
    # the slow slicing path would always be taken
    adjust_predictions = select_in_pack if adjust_pack else (lambda _, x: x)
    adjust_pack = adjust_pack or (lambda x: x)

    num_edges = {}
    for slc in slices:
        if is_first_slice and slc.fold is None:
            dpacks = [adjust_pack(x) for x in mpack.values()]  # WIP
            fpack = DataPack.vstack(dpacks)
            is_first_slice = False
            num_edges[None] = len(fpack)
        elif is_first_slice or slc.fold != fold:
            f_mpack = select_testing(mpack, fold_dict, slc.fold)
            dpacks = [adjust_pack(x) for x in f_mpack.values()]  # WIP
            fpack = DataPack.vstack(dpacks)
            fold = slc.fold
            num_edges[fold] = len(fpack)
            is_first_slice = False
        key = slc.configuration
        # accumulate scores
        predictions = adjust_predictions(fpack, slc.predictions)
        dpredictions = [
            adjust_predictions(dpack, slc.predictions) for dpack in dpacks
        ]
        # apply selected metrics
        # * on (dependency) edges
        if 'edges' in metrics:
            edge_count[key].append(score_edges(fpack, predictions))
        # * on constituency tree spans
        if 'cspans' in metrics:
            sc_cspans = score_cspans(dpacks, dpredictions)
            cspan_count[key].append(sc_cspans)
        # * on EDUs
        if 'edus' in metrics:
            edu_reports[key].add(score_edus(fpack, predictions))
        # * (confusion matrix)
        confusion[key] += build_confusion_matrix(fpack, predictions)

        if slc.enable_details:
            # * on edges, by label
            if 'edges_by_label' in metrics:
                details = score_edges_by_label(fpack, predictions)
                for label, lab_count in details:
                    edge_lab_count[key][label].append(lab_count)

    # build combined reports
    # * on edges
    if 'edges' in metrics:
        edge_report = CombinedReport(
            EdgeReport, {k: EdgeReport(v)
                         for k, v in edge_count.items()})
    else:
        edge_report = None

    # * on cspans
    if 'cspans' in metrics:
        cspan_report = CombinedReport(
            CSpanReport, {k: CSpanReport(v)
                          for k, v in cspan_count.items()})
    else:
        cspan_report = None

    # * on EDUs
    if 'edus' in metrics:
        edu_report = CombinedReport(EduReport, edu_reports)
    else:
        edu_report = None

    # * on edges, by label
    if 'edges_by_label' in metrics:
        edge_by_label_report = {}
        for key, counts in edge_lab_count.items():
            report = CombinedReport(LabelReport,
                                    {(label, ): LabelReport(vs)
                                     for label, vs in counts.items()})
            edge_by_label_report[key] = report
    else:
        edge_by_label_report = None

    return ReportPack(edge=edge_report,
                      cspan=cspan_report,
                      edge_by_label=edge_by_label_report,
                      edu=edu_report,
                      confusion=confusion,
                      confusion_labels=dpack0.labels,
                      num_edges=sum(num_edges.values()))
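A hedged sketch of how full_report might be invoked once slices have been gathered during decoding; the metric selection and the rpack.edge.table() access are assumptions inferred from the code above (ReportPack is built with edge=..., and CombinedReport.table() appears in Example #4):

# hypothetical invocation: score dependency edges and EDUs over the
# slices (one Slice per configuration per fold)
rpack = full_report(mpack, fold_dict, slices,
                    metrics=['edges', 'edus'])
print(rpack.edge.table())  # assumed attribute, per ReportPack(edge=...)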
Example #6
    print("training ... ", file=sys.stderr)
    # learn a model for the training data for this fold
    train_packs = select_training(mpack, fold_dict, fold).values()
    parser.fit(train_packs, [x.target for x in train_packs])

    fold_predictions = []
    # decode each document separately
    test_pack = select_testing(mpack, fold_dict, fold)
    for onedoc, dpack in test_pack.items():
        print("decoding on file : ", onedoc, file=sys.stderr)
        dpack = parser.transform(dpack)
        prediction = prediction_to_triples(dpack)
        # print("Predictions: ", prediction)
        # record the prediction score
        scores.append(score_edges(dpack, prediction))
        # optional: save the predictions for further inspection
        fold_predictions.extend(prediction)

    # optional: write predictions for this fold
    output_file = fp.join(TMP_OUTPUT, 'fold-%d' % fold)
    print("writing: %s" % output_file, file=sys.stderr)
    write_predictions_output(DataPack.vstack(test_pack.values()),
                             fold_predictions, output_file)

report = EdgeReport(scores)

# a combined report provides scores for multiple configurations
# here, we are only using it for the single config
combined_report = CombinedReport(EdgeReport, {('maxent', 'mst'): report})
print(combined_report.table())
Example #7
def full_report(mpack, fold_dict, slices, metrics,
                adjust_pack=None):
    """Generate a report across a set of folds and configurations.

    This is a bit tricky, as we have to accumulate per-configuration
    results over the folds.

    Here configurations are just arbitrary strings.

    Parameters
    ----------
    mpack: TODO
        TODO
    fold_dict: TODO
        TODO
    slices: iterable of Slice
        Predictions for each configuration, for each fold.
        Folds should be contiguous for maximum efficiency.
        It may be worthwhile to generate this lazily.
    metrics: iterable of {'edges', 'edges_by_label', 'edus', 'cspans'}
        Set of selected metrics.
        For the RST corpus, 'cspans' should not be selected for evaluation
        on the intra, inter and root subsets until I find out how to restrict
        RSTTrees along DataPack.selected().
    adjust_pack: function from DataPack to DataPack, optional
        Function that modifies a DataPack, for example by picking out a
        subset of the pairings.


    Returns
    -------
    rpack: ReportPack
        Group of reports on a set of folds and configurations.

    TODO
    ----
    * [ ] refactor ReportPack so that it contains reports only for the
      metrics selected by the harness
    """
    if not mpack:
        raise ValueError("Can't report with empty multipack")
    edge_count = defaultdict(list)
    cspan_count = defaultdict(list)
    edge_lab_count = defaultdict(lambda: defaultdict(list))
    edu_reports = defaultdict(EduReport)
    dpack0 = next(iter(mpack.values()))  # first datapack (py2/py3 safe)
    confusion = defaultdict(lambda: empty_confusion_matrix(dpack0))

    fold = None
    is_first_slice = True

    # avoid slicing the predictions if we can help it (slow);
    # NB: pick adjust_predictions before defaulting adjust_pack, else
    # the slow slicing path would always be taken
    adjust_predictions = select_in_pack if adjust_pack else (lambda _, x: x)
    adjust_pack = adjust_pack or (lambda x: x)

    num_edges = {}
    for slc in slices:
        if is_first_slice and slc.fold is None:
            dpacks = [adjust_pack(x) for x in mpack.values()]  # WIP
            fpack = DataPack.vstack(dpacks)
            is_first_slice = False
            num_edges[None] = len(fpack)
        elif is_first_slice or slc.fold != fold:
            f_mpack = select_testing(mpack, fold_dict, slc.fold)
            dpacks = [adjust_pack(x) for x in f_mpack.values()]  # WIP
            fpack = DataPack.vstack(dpacks)
            fold = slc.fold
            num_edges[fold] = len(fpack)
            is_first_slice = False
        key = slc.configuration
        # accumulate scores
        predictions = adjust_predictions(fpack, slc.predictions)
        dpredictions = [adjust_predictions(dpack, slc.predictions)
                        for dpack in dpacks]
        # apply selected metrics
        # * on (dependency) edges
        if 'edges' in metrics:
            edge_count[key].append(score_edges(fpack, predictions))
        # * on constituency tree spans
        if 'cspans' in metrics:
            sc_cspans = score_cspans(dpacks, dpredictions)
            cspan_count[key].append(sc_cspans)
        # * on EDUs
        if 'edus' in metrics:
            edu_reports[key].add(score_edus(fpack, predictions))
        # * (confusion matrix)
        confusion[key] += build_confusion_matrix(fpack, predictions)

        if slc.enable_details:
            # * on edges, by label
            if 'edges_by_label' in metrics:
                details = score_edges_by_label(fpack, predictions)
                for label, lab_count in details:
                    edge_lab_count[key][label].append(lab_count)

    # build combined reports
    # * on edges
    if 'edges' in metrics:
        edge_report = CombinedReport(EdgeReport,
                                     {k: EdgeReport(v)
                                      for k, v in edge_count.items()})
    else:
        edge_report = None

    # * on cspans
    if 'cspans' in metrics:
        cspan_report = CombinedReport(CSpanReport,
                                      {k: CSpanReport(v)
                                       for k, v in cspan_count.items()})
    else:
        cspan_report = None

    # * on EDUs
    if 'edus' in metrics:
        edu_report = CombinedReport(EduReport, edu_reports)
    else:
        edu_report = None

    # * on edges, by label
    if 'edges_by_label' in metrics:
        edge_by_label_report = {}
        for key, counts in edge_lab_count.items():
            report = CombinedReport(LabelReport,
                                    {(label,): LabelReport(vs)
                                     for label, vs in counts.items()})
            edge_by_label_report[key] = report
    else:
        edge_by_label_report = None

    return ReportPack(edge=edge_report,
                      cspan=cspan_report,
                      edge_by_label=edge_by_label_report,
                      edu=edu_report,
                      confusion=confusion,
                      confusion_labels=dpack0.labels,
                      num_edges=sum(num_edges.values()))
Example #8
def full_report(mpack, fold_dict, slices,
                adjust_pack=None):
    """
    Generate a report across a set of folds and configurations.

    This is a bit tricky, as we have to accumulate per-configuration
    results over the folds.

    Here configurations are just arbitrary strings.

    :param slices: the predictions for each configuration, for each fold.
                   Folds should be contiguous for maximum efficiency.
                   It may be worthwhile to generate this lazily
    :type slices: iterable(:py:class:`Slice`)

    :param adjust_pack: (optional) function that modifies a DataPack, for
                        example by picking out a subset of the pairings.
    :type adjust_pack: (DataPack -> DataPack) or None
    """
    if not mpack:
        raise ValueError("Can't report with empty multipack")
    edge_count = defaultdict(list)
    edge_lab_count = defaultdict(lambda: defaultdict(list))
    edu_reports = defaultdict(EduReport)
    dpack0 = next(iter(mpack.values()))  # first datapack (py2/py3 safe)
    confusion = defaultdict(lambda: empty_confusion_matrix(dpack0))

    fold = None
    is_first_slice = True

    # avoid slicing the predictions if we can help it (slow);
    # NB: pick adjust_predictions before defaulting adjust_pack, else
    # the slow slicing path would always be taken
    adjust_predictions = select_in_pack if adjust_pack else (lambda _, x: x)
    adjust_pack = adjust_pack or (lambda x: x)

    num_edges = {}
    for slc in slices:
        if is_first_slice and slc.fold is None:
            fpack = DataPack.vstack([adjust_pack(x)
                                     for x in mpack.values()])
            is_first_slice = False
            num_edges[None] = len(fpack)
        elif is_first_slice or slc.fold != fold:
            f_mpack = select_testing(mpack, fold_dict, slc.fold)
            fpack = DataPack.vstack([adjust_pack(x)
                                     for x in f_mpack.values()])
            fold = slc.fold
            num_edges[fold] = len(fpack)
            is_first_slice = False
        key = slc.configuration
        # accumulate scores
        predictions = adjust_predictions(fpack, slc.predictions)
        edge_count[key].append(score_edges(fpack, predictions))
        edu_reports[key].add(score_edus(fpack, predictions))
        confusion[key] += build_confusion_matrix(fpack, predictions)
        if slc.enable_details:
            details = score_edges_by_label(fpack, predictions)
            for label, lab_count in details:
                edge_lab_count[key][label].append(lab_count)

    edge_report = CombinedReport(EdgeReport,
                                 {k: EdgeReport(v)
                                  for k, v in edge_count.items()})
    # combine
    edge_by_label_report = {}
    for key, counts in edge_lab_count.items():
        report = CombinedReport(LabelReport,
                                {(label,): LabelReport(vs)
                                 for label, vs in counts.items()})
        edge_by_label_report[key] = report

    return ReportPack(edge=edge_report,
                      edge_by_label=edge_by_label_report or None,
                      edu=CombinedReport(EduReport, edu_reports),
                      confusion=confusion,
                      confusion_labels=dpack0.labels,
                      num_edges=sum(num_edges.values()))
Example #9
    def fit(self, dpacks, targets):
        # stack the per-document datapacks and targets into single
        # arrays, then fit the underlying learner
        dpack = DataPack.vstack(dpacks)
        target = np.concatenate(targets)
        self._learner.fit(dpack.data, target)
        self._fitted = True
        return self