def for_intra(dpack, target):
    """Adapt a datapack to intrasentential decoding.

    An intrasentential datapack is almost identical to its original,
    except that we set the label for each ('ROOT', edu) pairing to
    'ROOT' if that edu is a subgrouping head (if it has no parents
    other than 'ROOT' within its subgrouping).

    This should be done before either `for_labelling` or
    `for_attachment`.

    Returns
    -------
    dpack : DataPack

    target : array(int)
    """
    # map EDUs to subgroup ids ; intra = pairs of EDUs with same subgroup id
    grp = {e.id: e.subgrouping for e in dpack.edus}
    # find all edus that have intra incoming edges (to rule out)
    unrelated = dpack.label_number(UNRELATED)
    intra_tgts = defaultdict(set)
    for i, (edu1, edu2) in enumerate(dpack.pairings):
        # skip pairings from the fake root, which has no entry in grp
        if (edu1.id != FAKE_ROOT_ID and
                grp[edu1.id] == grp[edu2.id] and
                target[i] != unrelated):
            # edu2 has an incoming relation => not an (intra) root
            intra_tgts[grp[edu2.id]].add(edu2.id)
    # pick out the (fakeroot, edu) pairs where edu does not have
    # incoming intra edges
    all_heads = [i for i, (edu1, edu2) in enumerate(dpack.pairings)
                 if (edu1.id == FAKE_ROOT_ID and
                     edu2.id not in intra_tgts[grp[edu2.id]])]
    # NEW pick out the original inter-sentential links, for removal
    inter_links = [i for i, (edu1, edu2) in enumerate(dpack.pairings)
                   if (edu1.id != FAKE_ROOT_ID and
                       grp[edu1.id] != grp[edu2.id] and
                       target[i] != unrelated)]
    # update datapack and target accordingly
    new_target = np.copy(dpack.target)
    new_target[all_heads] = dpack.label_number('ROOT')
    new_target[inter_links] = unrelated  # NEW
    # WIP ctarget
    new_ctarget = {grp_name: ctgt
                   for grp_name, ctgt in dpack.ctarget.items()}
    # FIXME replace each ctgt with the list of intra-sentential
    # RST (sub)trees
    # end WIP ctarget
    dpack = DataPack(edus=dpack.edus,
                     pairings=dpack.pairings,
                     data=dpack.data,
                     target=new_target,
                     ctarget=new_ctarget,
                     labels=dpack.labels,
                     vocab=dpack.vocab,
                     graph=dpack.graph)
    target = np.copy(target)
    target[all_heads] = dpack.label_number('ROOT')
    target[inter_links] = unrelated  # NEW
    return dpack, target
def _dpack_1():
    "example datapack for testing"
    # pylint: disable=invalid-name
    a1 = EDU('a1', '', 0, 0, 'a', 's1')
    a2 = EDU('a2', '', 0, 0, 'a', 's1')
    a3 = EDU('a3', '', 0, 0, 'a', 's1')
    b1 = EDU('b1', '', 0, 0, 'a', 's2')
    b2 = EDU('b2', '', 0, 0, 'a', 's2')
    b3 = EDU('b3', '', 0, 0, 'a', 's2')
    # pylint: enable=invalid-name
    orig_classes = ['__UNK__', 'UNRELATED', 'ROOT', 'x']
    dpack = DataPack.load(
        edus=[a1, a2, a3, b1, b2, b3],
        pairings=[(FAKE_ROOT, a1), (FAKE_ROOT, a2), (FAKE_ROOT, a3),
                  (a1, a2), (a1, a3), (a2, a3),
                  (a2, a1), (a3, a1), (a3, a2),
                  (FAKE_ROOT, b1), (FAKE_ROOT, b2), (FAKE_ROOT, b3),
                  (b1, b2), (b1, b3), (b2, b3),
                  (b2, b1), (b3, b1), (b3, b2),
                  (a1, b1)],
        data=scipy.sparse.csr_matrix(
            [[1], [1], [1], [1], [1], [1], [1], [1], [1], [1],
             [1], [1], [1], [1], [1], [1], [1], [1], [1]]),
        target=np.array(
            [2, 1, 1, 3, 1, 3, 1, 1, 1,
             1, 1, 2, 3, 1, 3, 1, 1, 1, 3]),
        ctarget=dict(),  # WIP
        labels=orig_classes,
        vocab=None)
    return dpack
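# A hedged usage sketch for the `for_intra` variant above (the one that
# also removes inter-sentential links). A sketch only: index positions
# assume the pairing order in `_dpack_1`, and label numbers follow
# `orig_classes`, where 'UNRELATED' is 1 and 'ROOT' is 2.
dpack = _dpack_1()
ipack, itarget = for_intra(dpack, dpack.target)
assert itarget[0] == 2    # (FAKE_ROOT, a1): a1 heads subgroup 's1'
assert itarget[9] == 2    # (FAKE_ROOT, b1): b1 heads subgroup 's2'
assert itarget[18] == 1   # (a1, b1): inter-sentential link removed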
def fit(self, dpacks, targets):
    dpack = DataPack.vstack(dpacks)
    target = np.concatenate(targets)
    self._learner.fit(dpack.data, target)
    self._labels = [dpack.get_label(x) for x in self._learner.classes_]
    self._fitted = True
    return self
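# For context, a minimal self-contained sketch of the stacking pattern
# that `fit` relies on. Assumptions: `self._learner` is any
# scikit-learn-style estimator; LogisticRegression below is only an
# illustrative stand-in, not necessarily what the harness uses.
import numpy as np
import scipy.sparse
from sklearn.linear_model import LogisticRegression

# per-document feature matrices and target vectors ...
datas = [scipy.sparse.csr_matrix(np.eye(4)),
         scipy.sparse.csr_matrix(np.eye(4)[:3])]
targets = [np.array([0, 1, 1, 0]), np.array([1, 0, 1])]

# ... are stacked into a single training problem, as in `fit` above;
# scipy.sparse.vstack plays the role of DataPack.vstack here
data = scipy.sparse.vstack(datas)
target = np.concatenate(targets)
learner = LogisticRegression().fit(data, target)
print(learner.classes_)   # -> [0 1]; `fit` maps these back to labels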
def for_intra(dpack, target):
    """Adapt a datapack to intrasentential decoding.

    An intrasentential datapack is almost identical to its original,
    except that we set the label for each ('ROOT', edu) pairing to
    'ROOT' if that edu is a subgrouping head (if it has no parents
    other than 'ROOT' within its subgrouping).

    This should be done before either `for_labelling` or
    `for_attachment`.

    Returns
    -------
    dpack : DataPack

    target : array(int)
    """
    # find all edus that have intra incoming edges (to rule out)
    unrelated = dpack.label_number(UNRELATED)
    intra_tgts = defaultdict(set)
    for i, (edu1, edu2) in enumerate(dpack.pairings):
        subg = edu2.subgrouping
        if edu1.subgrouping == subg and target[i] != unrelated:
            intra_tgts[subg].add(edu2.id)
    # pick out the (fakeroot, edu) pairs where edu does not have
    # incoming intra edges
    all_heads = []
    for i, (edu1, edu2) in enumerate(dpack.pairings):
        subg = edu2.subgrouping
        if edu1.id == FAKE_ROOT_ID and edu2.id not in intra_tgts[subg]:
            all_heads.append(i)
    # update datapack and target accordingly
    new_target = np.copy(dpack.target)
    new_target[all_heads] = dpack.label_number('ROOT')
    dpack = DataPack(edus=dpack.edus,
                     pairings=dpack.pairings,
                     data=dpack.data,
                     target=new_target,
                     labels=dpack.labels,
                     vocab=dpack.vocab,
                     graph=dpack.graph)
    target = np.copy(target)
    target[all_heads] = dpack.label_number('ROOT')
    return dpack, target
def fit(self, dpacks, targets, nonfixed_pairs=None):
    # WIP select only the nonfixed pairs
    if nonfixed_pairs is not None:
        dpacks = [dpack.selected(nf_pairs)
                  for dpack, nf_pairs in zip(dpacks, nonfixed_pairs)]
        targets = [target[nf_pairs]
                   for target, nf_pairs in zip(targets, nonfixed_pairs)]
    dpack = DataPack.vstack(dpacks)
    target = np.concatenate(targets)
    self._learner.fit(dpack.data, target)
    self._fitted = True
    return self
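# The nonfixed_pairs filtering above is plain index selection; a tiny
# sketch with made-up indices:
import numpy as np

target = np.array([2, 1, 3, 1, 3])
nf_pairs = [0, 2, 4]        # indices of the non-fixed pairings
print(target[nf_pairs])     # -> [2 3 3]
# dpack.selected(nf_pairs) is assumed to apply the same row selection
# to the pairings and feature matrix, keeping both views aligned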
def full_report(mpack, fold_dict, slices, metrics, adjust_pack=None):
    """Generate a report across a set of folds and configurations.

    This is a bit tricky as the idea is that we have to accumulate
    per-configuration results over the folds.

    Here configurations are just arbitrary strings.

    Parameters
    ----------
    mpack : TODO
        TODO
    fold_dict : TODO
        TODO
    slices : iterable of Slice
        Predictions for each configuration, for each fold.
        Folds should be contiguous for maximum efficiency.
        It may be worthwhile to generate this lazily.
    metrics : iterable of {'edges', 'edges_by_label', 'edus', 'cspans'}
        Set of selected metrics.
        For the RST corpus, 'cspans' should not be selected for
        evaluation on the intra, inter and root subsets until I find
        out how to restrict RSTTrees along DataPack.selected().
    adjust_pack : function from DataPack to DataPack, optional
        Function that modifies a DataPack, for example by picking out
        a subset of the pairings.

    Returns
    -------
    rpack : ReportPack
        Group of reports on a set of folds and configurations.

    TODO
    ----
    * [ ] refactor ReportPack so that it contains reports only for
      the metrics selected by the harness
    """
    if not mpack:
        raise ValueError("Can't report with empty multipack")
    edge_count = defaultdict(list)
    cspan_count = defaultdict(list)
    edge_lab_count = defaultdict(lambda: defaultdict(list))
    edu_reports = defaultdict(EduReport)
    dpack0 = mpack.values()[0]
    confusion = defaultdict(lambda: empty_confusion_matrix(dpack0))

    fold = None
    is_first_slice = True

    # avoid slicing the predictions if we can help it (slow)
    adjust_pack = adjust_pack or (lambda x: x)
    adjust_predictions = select_in_pack if adjust_pack else (lambda _, x: x)

    num_edges = {}
    for slc in slices:
        if is_first_slice and slc.fold is None:
            dpacks = [adjust_pack(x) for x in mpack.values()]  # WIP
            fpack = DataPack.vstack(dpacks)
            is_first_slice = False
            num_edges[None] = len(fpack)
        elif is_first_slice or slc.fold != fold:
            f_mpack = select_testing(mpack, fold_dict, slc.fold)
            dpacks = [adjust_pack(x) for x in f_mpack.values()]  # WIP
            fpack = DataPack.vstack(dpacks)
            fold = slc.fold
            num_edges[fold] = len(fpack)
            is_first_slice = False
        key = slc.configuration
        # accumulate scores
        predictions = adjust_predictions(fpack, slc.predictions)
        dpredictions = [adjust_predictions(dpack, slc.predictions)
                        for dpack in dpacks]
        # apply selected metrics
        # * on (dependency) edges
        if 'edges' in metrics:
            edge_count[key].append(score_edges(fpack, predictions))
        # * on constituency tree spans
        if 'cspans' in metrics:
            sc_cspans = score_cspans(dpacks, dpredictions)
            cspan_count[key].append(sc_cspans)
        # * on EDUs
        if 'edus' in metrics:
            edu_reports[key].add(score_edus(fpack, predictions))
        # * (confusion matrix)
        confusion[key] += build_confusion_matrix(fpack, predictions)
        if slc.enable_details:
            # * on edges, by label
            if 'edges_by_label' in metrics:
                details = score_edges_by_label(fpack, predictions)
                for label, lab_count in details:
                    edge_lab_count[key][label].append(lab_count)

    # build combined reports
    # * on edges
    if 'edges' in metrics:
        edge_report = CombinedReport(
            EdgeReport,
            {k: EdgeReport(v) for k, v in edge_count.items()})
    else:
        edge_report = None
    # * on cspans
    if 'cspans' in metrics:
        cspan_report = CombinedReport(
            CSpanReport,
            {k: CSpanReport(v) for k, v in cspan_count.items()})
    else:
        cspan_report = None
    # * on EDUs
    if 'edus' in metrics:
        edu_report = CombinedReport(EduReport, edu_reports)
    else:
        edu_report = None
    # * on edges, by label
    if 'edges_by_label' in metrics:
        edge_by_label_report = {}
        for key, counts in edge_lab_count.items():
            report = CombinedReport(
                LabelReport,
                {(label,): LabelReport(vs)
                 for label, vs in counts.items()})
            edge_by_label_report[key] = report
    else:
        edge_by_label_report = None
    return ReportPack(edge=edge_report,
                      cspan=cspan_report,
                      edge_by_label=edge_by_label_report,
                      edu=edu_report,
                      confusion=confusion,
                      confusion_labels=dpack0.labels,
                      num_edges=sum(num_edges.values()))
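# Why the docstring asks for fold-contiguous slices: the expensive
# DataPack.vstack that builds `fpack` runs only when the fold changes.
# A toy illustration (names and values here are illustrative only):
from collections import namedtuple

Slice = namedtuple('Slice', 'fold configuration')
slices = [Slice(0, 'maxent'), Slice(0, 'mst'),
          Slice(1, 'maxent'), Slice(1, 'mst')]

fold = None
rebuilds = 0
for slc in slices:
    if fold != slc.fold:    # contiguous folds: one rebuild per fold
        rebuilds += 1       # stands in for the expensive vstack
        fold = slc.fold
print(rebuilds)             # -> 2 rebuilds for 4 slices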
print("training ... ", file=sys.stderr) # learn a model for the training data for this fold train_packs = select_training(mpack, fold_dict, fold).values() parser.fit(train_packs, [x.target for x in train_packs]) fold_predictions = [] # decode each document separately test_pack = select_testing(mpack, fold_dict, fold) for onedoc, dpack in test_pack.items(): print("decoding on file : ", onedoc, file=sys.stderr) dpack = parser.transform(dpack) prediction = prediction_to_triples(dpack) # print("Predictions: ", prediction) # record the prediction score scores.append(score_edges(dpack, prediction)) # optional: save the predictions for further inspection fold_predictions.extend(prediction) # optional: write predictions for this fold output_file = fp.join(TMP_OUTPUT, 'fold-%d' % fold) print("writing: %s" % output_file, file=sys.stderr) write_predictions_output(DataPack.vstack(test_pack.values()), fold_predictions, output_file) report = EdgeReport(scores) # a combined report provides scores for multiple configurations # here, we are only using it for the single config combined_report = CombinedReport(EdgeReport, {('maxent', 'mst'): report}) print(combined_report.table())
def full_report(mpack, fold_dict, slices, adjust_pack=None):
    """
    Generate a report across a set of folds and configurations.

    This is a bit tricky as the idea is that we have to accumulate
    per-configuration results over the folds.

    Here configurations are just arbitrary strings

    :param slices: the predictions for each configuration, for each
                   fold. Folds should be contiguous for maximum
                   efficiency. It may be worthwhile to generate this
                   lazily
    :type slices: iterable(:py:class:`Slice`)

    :param adjust_pack: (optional) function that modifies a DataPack,
                        for example by picking out a subset of the
                        pairings.
    :type adjust_pack: (DataPack -> DataPack) or None
    """
    if not mpack:
        raise ValueError("Can't report with empty multipack")
    edge_count = defaultdict(list)
    edge_lab_count = defaultdict(lambda: defaultdict(list))
    edu_reports = defaultdict(EduReport)
    dpack0 = mpack.values()[0]
    confusion = defaultdict(lambda: empty_confusion_matrix(dpack0))

    fold = None
    is_first_slice = True

    # avoid slicing the predictions if we can help it (slow)
    adjust_pack = adjust_pack or (lambda x: x)
    adjust_predictions = select_in_pack if adjust_pack else (lambda _, x: x)

    num_edges = {}
    for slc in slices:
        if is_first_slice and slc.fold is None:
            fpack = DataPack.vstack([adjust_pack(x)
                                     for x in mpack.values()])
            is_first_slice = False
            num_edges[None] = len(fpack)
        elif is_first_slice or slc.fold != fold:
            f_mpack = select_testing(mpack, fold_dict, slc.fold)
            fpack = DataPack.vstack([adjust_pack(x)
                                     for x in f_mpack.values()])
            fold = slc.fold
            num_edges[fold] = len(fpack)
            is_first_slice = False
        key = slc.configuration
        # accumulate scores
        predictions = adjust_predictions(fpack, slc.predictions)
        edge_count[key].append(score_edges(fpack, predictions))
        edu_reports[key].add(score_edus(fpack, predictions))
        confusion[key] += build_confusion_matrix(fpack, predictions)
        if slc.enable_details:
            details = score_edges_by_label(fpack, predictions)
            for label, lab_count in details:
                edge_lab_count[key][label].append(lab_count)

    edge_report = CombinedReport(EdgeReport,
                                 {k: EdgeReport(v)
                                  for k, v in edge_count.items()})
    # combine
    edge_by_label_report = {}
    for key, counts in edge_lab_count.items():
        report = CombinedReport(LabelReport,
                                {(label,): LabelReport(vs)
                                 for label, vs in counts.items()})
        edge_by_label_report[key] = report
    return ReportPack(edge=edge_report,
                      edge_by_label=edge_by_label_report or None,
                      edu=CombinedReport(EduReport, edu_reports),
                      confusion=confusion,
                      confusion_labels=dpack0.labels,
                      num_edges=sum(num_edges.values()))
def fit(self, dpacks, targets):
    dpack = DataPack.vstack(dpacks)
    target = np.concatenate(targets)
    self._learner.fit(dpack.data, target)
    self._fitted = True
    return self