def fit(self, dpacks, targets):
    """Fit the underlying learner on a sequence of documents.

    `dpacks` holds one DataPack per document and `targets` the
    matching target vectors; they are stacked into a single
    learning problem.
    """
    dpack = DataPack.vstack(dpacks)
    target = np.concatenate(targets)
    self._learner.fit(dpack.data, target)
    # remember the label string behind each class index the
    # learner has seen
    self._labels = [dpack.get_label(x) for x in self._learner.classes_]
    self._fitted = True
    return self
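# Usage sketch for the method above. Hedged: `mpack` (a mapping from
# document names to DataPacks) and `learner` (an instance of the class
# this `fit` belongs to) are illustrative stand-ins, not names defined
# here; per-pack targets are read off `dpack.target` as in the
# evaluation snippet further down.
dpacks = list(mpack.values())
targets = [dpack.target for dpack in dpacks]
learner.fit(dpacks, targets)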
def fit(self, dpacks, targets, nonfixed_pairs=None):
    # WIP select only the nonfixed pairs
    if nonfixed_pairs is not None:
        dpacks = [dpack.selected(nf_pairs)
                  for dpack, nf_pairs in zip(dpacks, nonfixed_pairs)]
        targets = [target[nf_pairs]
                   for target, nf_pairs in zip(targets, nonfixed_pairs)]
    dpack = DataPack.vstack(dpacks)
    target = np.concatenate(targets)
    self._learner.fit(dpack.data, target)
    self._fitted = True
    return self
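# Illustration of the `nonfixed_pairs` argument above. Hedged: the
# feature is marked WIP in the code; it is read here as one array of
# row indices per DataPack, picking out the candidate pairs whose
# attachment is not already fixed. The concrete indices are made up.
nonfixed_pairs = [np.array([0, 2, 5]),  # rows kept from dpacks[0]
                  np.array([1, 3])]     # rows kept from dpacks[1]
learner.fit(dpacks, targets, nonfixed_pairs=nonfixed_pairs)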
def full_report(mpack, fold_dict, slices, metrics, adjust_pack=None):
    """Generate a report across a set of folds and configurations.

    This is a bit tricky as the idea is that we have to accumulate
    per-configuration results over the folds.

    Here configurations are just arbitrary strings.

    Parameters
    ----------
    mpack : TODO
        TODO
    fold_dict : TODO
        TODO
    slices : iterable of Slice
        Predictions for each configuration, for each fold.
        Folds should be contiguous for maximum efficiency.
        It may be worthwhile to generate this lazily.
    metrics : iterable of {'edges', 'edges_by_label', 'edus', 'cspans'}
        Set of selected metrics.
        For the RST corpus, 'cspans' should not be selected for
        evaluation on the intra, inter and root subsets until I find
        out how to restrict RSTTrees along DataPack.selected().
    adjust_pack : function from DataPack to DataPack, optional
        Function that modifies a DataPack, for example by picking out
        a subset of the pairings.

    Returns
    -------
    rpack : ReportPack
        Group of reports on a set of folds and configurations.

    TODO
    ----
    * [ ] refactor ReportPack so that it contains reports only for
      the metrics selected by the harness
    """
    if not mpack:
        raise ValueError("Can't report with empty multipack")
    edge_count = defaultdict(list)
    cspan_count = defaultdict(list)
    edge_lab_count = defaultdict(lambda: defaultdict(list))
    edu_reports = defaultdict(EduReport)
    dpack0 = mpack.values()[0]
    confusion = defaultdict(lambda: empty_confusion_matrix(dpack0))

    fold = None
    is_first_slice = True

    # avoid slicing the predictions if we can help it (slow):
    # only select within the predictions when a pack adjustment
    # was actually requested
    adjust_predictions = select_in_pack if adjust_pack else (lambda _, x: x)
    adjust_pack = adjust_pack or (lambda x: x)

    num_edges = {}
    for slc in slices:
        if is_first_slice and slc.fold is None:
            dpacks = [adjust_pack(x) for x in mpack.values()]  # WIP
            fpack = DataPack.vstack(dpacks)
            is_first_slice = False
            num_edges[None] = len(fpack)
        elif is_first_slice or slc.fold != fold:
            f_mpack = select_testing(mpack, fold_dict, slc.fold)
            dpacks = [adjust_pack(x) for x in f_mpack.values()]  # WIP
            fpack = DataPack.vstack(dpacks)
            fold = slc.fold
            num_edges[fold] = len(fpack)
            is_first_slice = False
        key = slc.configuration
        # accumulate scores
        predictions = adjust_predictions(fpack, slc.predictions)
        dpredictions = [adjust_predictions(dpack, slc.predictions)
                        for dpack in dpacks]
        # apply selected metrics
        # * on (dependency) edges
        if 'edges' in metrics:
            edge_count[key].append(score_edges(fpack, predictions))
        # * on constituency tree spans
        if 'cspans' in metrics:
            sc_cspans = score_cspans(dpacks, dpredictions)
            cspan_count[key].append(sc_cspans)
        # * on EDUs
        if 'edus' in metrics:
            edu_reports[key].add(score_edus(fpack, predictions))
        # * (confusion matrix)
        confusion[key] += build_confusion_matrix(fpack, predictions)
        if slc.enable_details:
            # * on edges, by label
            if 'edges_by_label' in metrics:
                details = score_edges_by_label(fpack, predictions)
                for label, lab_count in details:
                    edge_lab_count[key][label].append(lab_count)

    # build combined reports
    # * on edges
    if 'edges' in metrics:
        edge_report = CombinedReport(
            EdgeReport,
            {k: EdgeReport(v) for k, v in edge_count.items()})
    else:
        edge_report = None
    # * on cspans
    if 'cspans' in metrics:
        cspan_report = CombinedReport(
            CSpanReport,
            {k: CSpanReport(v) for k, v in cspan_count.items()})
    else:
        cspan_report = None
    # * on EDUs
    if 'edus' in metrics:
        edu_report = CombinedReport(EduReport, edu_reports)
    else:
        edu_report = None
    # * on edges, by label
    if 'edges_by_label' in metrics:
        edge_by_label_report = {}
        for key, counts in edge_lab_count.items():
            report = CombinedReport(
                LabelReport,
                {(label,): LabelReport(vs)
                 for label, vs in counts.items()})
            edge_by_label_report[key] = report
    else:
        edge_by_label_report = None

    return ReportPack(edge=edge_report,
                      cspan=cspan_report,
                      edge_by_label=edge_by_label_report,
                      edu=edu_report,
                      confusion=confusion,
                      confusion_labels=dpack0.labels,
                      num_edges=sum(num_edges.values()))
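# Hypothetical call of full_report above. Hedged: building `slices`
# (one Slice per configuration and fold) is outside this snippet, and
# reading the edge report back off `rpack.edge` merely mirrors the
# keyword arguments passed to ReportPack.
rpack = full_report(mpack, fold_dict, slices,
                    metrics=['edges', 'edus'])
print(rpack.edge.table())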
print("training ... ", file=sys.stderr) # learn a model for the training data for this fold train_packs = select_training(mpack, fold_dict, fold).values() parser.fit(train_packs, [x.target for x in train_packs]) fold_predictions = [] # decode each document separately test_pack = select_testing(mpack, fold_dict, fold) for onedoc, dpack in test_pack.items(): print("decoding on file : ", onedoc, file=sys.stderr) dpack = parser.transform(dpack) prediction = prediction_to_triples(dpack) # print("Predictions: ", prediction) # record the prediction score scores.append(score_edges(dpack, prediction)) # optional: save the predictions for further inspection fold_predictions.extend(prediction) # optional: write predictions for this fold output_file = fp.join(TMP_OUTPUT, 'fold-%d' % fold) print("writing: %s" % output_file, file=sys.stderr) write_predictions_output(DataPack.vstack(test_pack.values()), fold_predictions, output_file) report = EdgeReport(scores) # a combined report provides scores for multiple configurations # here, we are only using it for the single config combined_report = CombinedReport(EdgeReport, {('maxent', 'mst'): report}) print(combined_report.table())
def full_report(mpack, fold_dict, slices, adjust_pack=None):
    """Generate a report across a set of folds and configurations.

    This is a bit tricky as the idea is that we have to accumulate
    per-configuration results over the folds. Here configurations
    are just arbitrary strings.

    :param slices: the predictions for each configuration, for each
                   fold. Folds should be contiguous for maximum
                   efficiency. It may be worthwhile to generate this
                   lazily
    :type slices: iterable(:py:class:`Slice`)

    :param adjust_pack: (optional) function that modifies a DataPack,
                        for example by picking out a subset of the
                        pairings.
    :type adjust_pack: (DataPack -> DataPack) or None
    """
    if not mpack:
        raise ValueError("Can't report with empty multipack")
    edge_count = defaultdict(list)
    edge_lab_count = defaultdict(lambda: defaultdict(list))
    edu_reports = defaultdict(EduReport)
    dpack0 = mpack.values()[0]
    confusion = defaultdict(lambda: empty_confusion_matrix(dpack0))

    fold = None
    is_first_slice = True

    # avoid slicing the predictions if we can help it (slow):
    # only select within the predictions when a pack adjustment
    # was actually requested
    adjust_predictions = select_in_pack if adjust_pack else (lambda _, x: x)
    adjust_pack = adjust_pack or (lambda x: x)

    num_edges = {}
    for slc in slices:
        if is_first_slice and slc.fold is None:
            fpack = DataPack.vstack([adjust_pack(x)
                                     for x in mpack.values()])
            is_first_slice = False
            num_edges[None] = len(fpack)
        elif is_first_slice or slc.fold != fold:
            f_mpack = select_testing(mpack, fold_dict, slc.fold)
            fpack = DataPack.vstack([adjust_pack(x)
                                     for x in f_mpack.values()])
            fold = slc.fold
            num_edges[fold] = len(fpack)
            is_first_slice = False
        key = slc.configuration
        # accumulate scores
        predictions = adjust_predictions(fpack, slc.predictions)
        edge_count[key].append(score_edges(fpack, predictions))
        edu_reports[key].add(score_edus(fpack, predictions))
        confusion[key] += build_confusion_matrix(fpack, predictions)
        if slc.enable_details:
            details = score_edges_by_label(fpack, predictions)
            for label, lab_count in details:
                edge_lab_count[key][label].append(lab_count)

    edge_report = CombinedReport(EdgeReport,
                                 {k: EdgeReport(v)
                                  for k, v in edge_count.items()})
    # combine
    edge_by_label_report = {}
    for key, counts in edge_lab_count.items():
        report = CombinedReport(LabelReport,
                                {(label,): LabelReport(vs)
                                 for label, vs in counts.items()})
        edge_by_label_report[key] = report

    return ReportPack(edge=edge_report,
                      edge_by_label=edge_by_label_report or None,
                      edu=CombinedReport(EduReport, edu_reports),
                      confusion=confusion,
                      confusion_labels=dpack0.labels,
                      num_edges=sum(num_edges.values()))
def fit(self, dpacks, targets):
    dpack = DataPack.vstack(dpacks)
    target = np.concatenate(targets)
    self._learner.fit(dpack.data, target)
    self._fitted = True
    return self