Example 1
def for_intra(dpack, target):
    """Adapt a datapack to intrasentential decoding.

    An intrasentential datapack is almost identical to its original,
    except that we set the label for each ('ROOT', edu) pairing to
    'ROOT' if that edu is a subgrouping head (if it has no parents other
    than 'ROOT' within its subgrouping).

    This should be done before either `for_labelling` or `for_attachment`

    Returns
    -------
    dpack : DataPack

    target : array(int)

    """
    # map EDUs to subgroup ids; intra = pairs of EDUs with same subgroup id
    grp = {e.id: e.subgrouping for e in dpack.edus}
    # find all edus that have intra incoming edges (to rule out)
    unrelated = dpack.label_number(UNRELATED)
    intra_tgts = defaultdict(set)
    for i, (edu1, edu2) in enumerate(dpack.pairings):
        if (grp[edu1.id] == grp[edu2.id]
            and target[i] != unrelated):
            # edu2 has an incoming relation => not an (intra) root
            intra_tgts[grp[edu2.id]].add(edu2.id)
    # pick out the (fakeroot, edu) pairs where edu does not have
    # incoming intra edges
    all_heads = [i for i, (edu1, edu2) in enumerate(dpack.pairings)
                 if (edu1.id == FAKE_ROOT_ID
                     and edu2.id not in intra_tgts[grp[edu2.id]])]
    # NEW pick out the original inter-sentential links, for removal
    inter_links = [i for i, (edu1, edu2) in enumerate(dpack.pairings)
                   if (edu1.id != FAKE_ROOT_ID
                       and grp[edu1.id] != grp[edu2.id]
                       and target[i] != unrelated)]

    # update datapack and target accordingly
    new_target = np.copy(dpack.target)
    new_target[all_heads] = dpack.label_number('ROOT')
    new_target[inter_links] = unrelated  # NEW
    # WIP ctarget
    new_ctarget = {grp_name: ctgt
                   for grp_name, ctgt in dpack.ctarget.items()}
    # FIXME replace each ctgt with the list of intra-sentential
    # RST (sub)trees
    # end WIP ctarget
    dpack = DataPack(edus=dpack.edus,
                     pairings=dpack.pairings,
                     data=dpack.data,
                     target=new_target,
                     ctarget=new_ctarget,
                     labels=dpack.labels,
                     vocab=dpack.vocab,
                     graph=dpack.graph)
    target = np.copy(target)
    target[all_heads] = dpack.label_number('ROOT')
    target[inter_links] = unrelated  # NEW
    return dpack, target
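
The docstring above prescribes a calling order: apply the intra-sentential adaptation first, then the attachment or labelling one. Below is a hedged usage sketch of that order, assuming `for_attachment` follows the same `(dpack, target) -> (dpack, target)` convention, which is not shown in this snippet.

# hedged sketch: given some datapack `dpack` and its label vector `target`,
# adapt for intra-sentential decoding first, then for attachment
# (for_attachment is assumed to share for_intra's calling convention)
dpack_intra, target_intra = for_intra(dpack, target)
dpack_attach, target_attach = for_attachment(dpack_intra, target_intra)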
Example 2
    def _dpack_1():
        "example datapack for testing"
        # pylint: disable=invalid-name
        a1 = EDU('a1', '', 0, 0, 'a', 's1')
        a2 = EDU('a2', '', 0, 0, 'a', 's1')
        a3 = EDU('a3', '', 0, 0, 'a', 's1')
        b1 = EDU('b1', '', 0, 0, 'a', 's2')
        b2 = EDU('b2', '', 0, 0, 'a', 's2')
        b3 = EDU('b3', '', 0, 0, 'a', 's2')
        # pylint: enable=invalid-name

        orig_classes = ['__UNK__', 'UNRELATED', 'ROOT', 'x']
        dpack = DataPack.load(
            edus=[a1, a2, a3, b1, b2, b3],
            pairings=[(FAKE_ROOT, a1), (FAKE_ROOT, a2), (FAKE_ROOT, a3),
                      (a1, a2), (a1, a3), (a2, a3), (a2, a1), (a3, a1),
                      (a3, a2), (FAKE_ROOT, b1), (FAKE_ROOT, b2),
                      (FAKE_ROOT, b3), (b1, b2), (b1, b3), (b2, b3), (b2, b1),
                      (b3, b1), (b3, b2), (a1, b1)],
            data=scipy.sparse.csr_matrix([[1], [1], [1], [1], [1], [1], [1],
                                          [1], [1], [1], [1], [1], [1], [1],
                                          [1], [1], [1], [1], [1]]),
            target=np.array(
                [2, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 2, 3, 1, 3, 1, 1, 1, 3]),
            ctarget=dict(),  # WIP
            labels=orig_classes,
            vocab=None)
        return dpack
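
To make the fixture easier to read, here is a minimal sketch (assuming `_dpack_1` is callable as a plain function and `get_label` behaves as in Example 3) that lists each pairing together with its gold label.

# print every pairing of the test fixture with its gold label, e.g. 'a1 -> a2: x'
# (the first pairings of each sentence involve the fake root EDU)
dpack = _dpack_1()
for (edu1, edu2), lbl in zip(dpack.pairings, dpack.target):
    print('{} -> {}: {}'.format(edu1.id, edu2.id, dpack.get_label(lbl)))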
Example 3
    def fit(self, dpacks, targets):
        dpack = DataPack.vstack(dpacks)
        target = np.concatenate(targets)
        self._learner.fit(dpack.data, target)
        self._labels = [dpack.get_label(x) for x in self._learner.classes_]
        self._fitted = True
        return self
Example 4
def for_intra(dpack, target):
    """Adapt a datapack to intrasentential decoding.

    An intrasentential datapack is almost identical to its original, except
    that we set the label for each ('ROOT', edu) pairing to 'ROOT' if that edu
    is a subgrouping head (if it has no parents other than 'ROOT' within its
    subgrouping).

    This should be done before either `for_labelling` or `for_attachment`

    Returns
    -------
    dpack: DataPack
    target: array(int)
    """
    # find all edus that have intra incoming edges (to rule out)
    unrelated = dpack.label_number(UNRELATED)
    intra_tgts = defaultdict(set)
    for i, (edu1, edu2) in enumerate(dpack.pairings):
        subg = edu2.subgrouping
        if (edu1.subgrouping == subg and
                target[i] != unrelated):
            intra_tgts[subg].add(edu2.id)
    # pick out the (fakeroot, edu) pairs where edu does not have
    # incoming intra edges
    all_heads = []
    for i, (edu1, edu2) in enumerate(dpack.pairings):
        subg = edu2.subgrouping
        if (edu1.id == FAKE_ROOT_ID and
                edu2.id not in intra_tgts[subg]):
            all_heads.append(i)

    # update datapack and target accordingly
    new_target = np.copy(dpack.target)
    new_target[all_heads] = dpack.label_number('ROOT')
    dpack = DataPack(edus=dpack.edus,
                     pairings=dpack.pairings,
                     data=dpack.data,
                     target=new_target,
                     labels=dpack.labels,
                     vocab=dpack.vocab,
                     graph=dpack.graph)
    target = np.copy(target)
    target[all_heads] = dpack.label_number('ROOT')
    return dpack, target
Example 5
    def fit(self, dpacks, targets, nonfixed_pairs=None):
        # WIP select only the nonfixed pairs
        if nonfixed_pairs is not None:
            dpacks = [dpack.selected(nf_pairs)
                      for dpack, nf_pairs in zip(dpacks, nonfixed_pairs)]
            targets = [target[nf_pairs]
                       for target, nf_pairs in zip(targets, nonfixed_pairs)]

        dpack = DataPack.vstack(dpacks)
        target = np.concatenate(targets)
        self._learner.fit(dpack.data, target)
        self._fitted = True
        return self
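
A hedged usage sketch of the `nonfixed_pairs` hook (the `learner` instance and the index lists below are hypothetical): only the selected rows of each datapack take part in the fit.

# hypothetical usage: 'learner' is an instance of the wrapper class above,
# 'dpacks' and 'targets' are parallel lists of DataPacks and label arrays;
# here we train on the first three pairings of each datapack only
nonfixed = [[0, 1, 2] for _ in dpacks]
learner.fit(dpacks, targets, nonfixed_pairs=nonfixed)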
Example 6
    def _dpack_1():
        "example datapack for testing"
        # pylint: disable=invalid-name
        a1 = EDU('a1', '', 0, 0, 'a', 's1')
        a2 = EDU('a2', '', 0, 0, 'a', 's1')
        a3 = EDU('a3', '', 0, 0, 'a', 's1')
        b1 = EDU('b1', '', 0, 0, 'a', 's2')
        b2 = EDU('b2', '', 0, 0, 'a', 's2')
        b3 = EDU('b3', '', 0, 0, 'a', 's2')
        # pylint: enable=invalid-name

        orig_classes = ['__UNK__', 'UNRELATED', 'ROOT', 'x']
        dpack = DataPack.load(edus=[a1, a2, a3,
                                    b1, b2, b3],
                              pairings=[(FAKE_ROOT, a1),
                                        (FAKE_ROOT, a2),
                                        (FAKE_ROOT, a3),
                                        (a1, a2),
                                        (a1, a3),
                                        (a2, a3),
                                        (a2, a1),
                                        (a3, a1),
                                        (a3, a2),
                                        (FAKE_ROOT, b1),
                                        (FAKE_ROOT, b2),
                                        (FAKE_ROOT, b3),
                                        (b1, b2),
                                        (b1, b3),
                                        (b2, b3),
                                        (b2, b1),
                                        (b3, b1),
                                        (b3, b2),
                                        (a1, b1)],
                              data=scipy.sparse.csr_matrix([[1], [1], [1],
                                                            [1], [1], [1],
                                                            [1], [1], [1],
                                                            [1], [1], [1],
                                                            [1], [1], [1],
                                                            [1], [1], [1],
                                                            [1]]),
                              target=np.array([2, 1, 1, 3, 1, 3, 1, 1, 1,
                                               1, 1, 2, 3, 1, 3, 1, 1, 1,
                                               3]),
                              ctarget=dict(),  # WIP
                              labels=orig_classes,
                              vocab=None)
        return dpack
Example 7
    def fit(self, dpacks, targets, nonfixed_pairs=None):
        # WIP select only the nonfixed pairs
        if nonfixed_pairs is not None:
            dpacks = [
                dpack.selected(nf_pairs)
                for dpack, nf_pairs in zip(dpacks, nonfixed_pairs)
            ]
            targets = [
                target[nf_pairs]
                for target, nf_pairs in zip(targets, nonfixed_pairs)
            ]

        dpack = DataPack.vstack(dpacks)
        target = np.concatenate(targets)
        self._learner.fit(dpack.data, target)
        self._fitted = True
        return self
Example 8
    train_packs = select_training(mpack, fold_dict, fold).values()
    parser.fit(train_packs,
               [x.target for x in train_packs])

    fold_predictions = []
    # decode each document separately
    test_pack = select_testing(mpack, fold_dict, fold)
    for onedoc, dpack in test_pack.items():
        print("decoding on file : ", onedoc, file=sys.stderr)
        dpack = parser.transform(dpack)
        prediction = prediction_to_triples(dpack)
        # print("Predictions: ", prediction)
        # record the prediction score
        scores.append(score_edges(dpack, prediction))
        # optional: save the predictions for further inspection
        fold_predictions.extend(prediction)

    # optional: write predictions for this fold
    output_file = fp.join(TMP_OUTPUT, 'fold-%d' % fold)
    print("writing: %s" % output_file, file=sys.stderr)
    write_predictions_output(DataPack.vstack(test_pack.values()),
                             fold_predictions, output_file)

report = EdgeReport(scores)

# a combined report provides scores for multiple configurations
# here, we are only using it for the single config
combined_report = CombinedReport(EdgeReport,
                                 {('maxent', 'mst'): report})
print(combined_report.table())
Example 9
def full_report(mpack, fold_dict, slices, metrics, adjust_pack=None):
    """Generate a report across a set of folds and configurations.

    This is a bit tricky as the idea is that we have to accumulate
    per-configuration results over the folds.

    Here configurations are just arbitrary strings.

    Parameters
    ----------
    mpack : TODO
        TODO

    fold_dict : TODO
        TODO

    slices : iterable of Slice
        Predictions for each configuration, for each fold.
        Folds should be contiguous for maximum efficiency.
        It may be worthwhile to generate this lazily.

    metrics : iterable of {'edges', 'edges_by_label', 'edus', 'cspans'}
        Set of selected metrics.
        For the RST corpus, 'cspans' should not be selected for evaluation
        on the intra, inter and root subsets until I find out how to restrict
        RSTTrees along DataPack.selected().

    adjust_pack : function from DataPack to DataPack, optional
        Function that modifies a DataPack, for example by picking out a
        subset of the pairings.


    Returns
    -------
    rpack : ReportPack
        Group of reports on a set of folds and configurations.

    TODO
    ----
    * [ ] refactor ReportPack so that it contains reports only for the
      metrics selected by the harness
    """
    if not mpack:
        raise ValueError("Can't report with empty multipack")
    edge_count = defaultdict(list)
    cspan_count = defaultdict(list)
    edge_lab_count = defaultdict(lambda: defaultdict(list))
    edu_reports = defaultdict(EduReport)
    dpack0 = mpack.values()[0]
    confusion = defaultdict(lambda: empty_confusion_matrix(dpack0))

    fold = None
    is_first_slice = True

    # avoid slicing the predictions if we can help it (slow)
    adjust_predictions = select_in_pack if adjust_pack else (lambda _, x: x)
    adjust_pack = adjust_pack or (lambda x: x)

    num_edges = {}
    for slc in slices:
        if is_first_slice and slc.fold is None:
            dpacks = [adjust_pack(x) for x in mpack.values()]  # WIP
            fpack = DataPack.vstack(dpacks)
            is_first_slice = False
            num_edges[None] = len(fpack)
        elif is_first_slice or slc.fold != fold:
            f_mpack = select_testing(mpack, fold_dict, slc.fold)
            dpacks = [adjust_pack(x) for x in f_mpack.values()]  # WIP
            fpack = DataPack.vstack(dpacks)
            fold = slc.fold
            num_edges[fold] = len(fpack)
            is_first_slice = False
        key = slc.configuration
        # accumulate scores
        predictions = adjust_predictions(fpack, slc.predictions)
        dpredictions = [
            adjust_predictions(dpack, slc.predictions) for dpack in dpacks
        ]
        # apply selected metrics
        # * on (dependency) edges
        if 'edges' in metrics:
            edge_count[key].append(score_edges(fpack, predictions))
        # * on constituency tree spans
        if 'cspans' in metrics:
            sc_cspans = score_cspans(dpacks, dpredictions)
            cspan_count[key].append(sc_cspans)
        # * on EDUs
        if 'edus' in metrics:
            edu_reports[key].add(score_edus(fpack, predictions))
        # * (confusion matrix)
        confusion[key] += build_confusion_matrix(fpack, predictions)

        if slc.enable_details:
            # * on edges, by label
            if 'edges_by_label' in metrics:
                details = score_edges_by_label(fpack, predictions)
                for label, lab_count in details:
                    edge_lab_count[key][label].append(lab_count)

    # build combined reports
    # * on edges
    if 'edges' in metrics:
        edge_report = CombinedReport(
            EdgeReport, {k: EdgeReport(v)
                         for k, v in edge_count.items()})
    else:
        edge_report = None

    # * on cspans
    if 'cspans' in metrics:
        cspan_report = CombinedReport(
            CSpanReport, {k: CSpanReport(v)
                          for k, v in cspan_count.items()})
    else:
        cspan_report = None

    # * on EDUs
    if 'edus' in metrics:
        edu_report = CombinedReport(EduReport, edu_reports)
    else:
        edu_report = None

    # * on edges, by label
    if 'edges_by_label' in metrics:
        edge_by_label_report = {}
        for key, counts in edge_lab_count.items():
            report = CombinedReport(LabelReport,
                                    {(label, ): LabelReport(vs)
                                     for label, vs in counts.items()})
            edge_by_label_report[key] = report
    else:
        edge_by_label_report = None

    return ReportPack(edge=edge_report,
                      cspan=cspan_report,
                      edge_by_label=edge_by_label_report,
                      edu=edu_report,
                      confusion=confusion,
                      confusion_labels=dpack0.labels,
                      num_edges=sum(num_edges.values()))
Example 10
    print("training ... ", file=sys.stderr)
    # learn a model for the training data for this fold
    train_packs = select_training(mpack, fold_dict, fold).values()
    parser.fit(train_packs, [x.target for x in train_packs])

    fold_predictions = []
    # decode each document separately
    test_pack = select_testing(mpack, fold_dict, fold)
    for onedoc, dpack in test_pack.items():
        print("decoding on file : ", onedoc, file=sys.stderr)
        dpack = parser.transform(dpack)
        prediction = prediction_to_triples(dpack)
        # print("Predictions: ", prediction)
        # record the prediction score
        scores.append(score_edges(dpack, prediction))
        # optional: save the predictions for further inspection
        fold_predictions.extend(prediction)

    # optional: write predictions for this fold
    output_file = fp.join(TMP_OUTPUT, 'fold-%d' % fold)
    print("writing: %s" % output_file, file=sys.stderr)
    write_predictions_output(DataPack.vstack(test_pack.values()),
                             fold_predictions, output_file)

report = EdgeReport(scores)

# a combined report provides scores for multiple configurations
# here, we are only using it for the single config
combined_report = CombinedReport(EdgeReport, {('maxent', 'mst'): report})
print(combined_report.table())
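
As the comment notes, `CombinedReport` is designed to hold several configurations at once. A hedged sketch of what that looks like with a second, hypothetical configuration (`report_eisner` would be built the same way as `report`):

# two configurations side by side (the 'eisner' entry is hypothetical)
combined_report = CombinedReport(EdgeReport,
                                 {('maxent', 'mst'): report,
                                  ('maxent', 'eisner'): report_eisner})
print(combined_report.table())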
Example 11
def full_report(mpack, fold_dict, slices, metrics,
                adjust_pack=None):
    """Generate a report across a set of folds and configurations.

    This is a bit tricky as the idea is that we have to accumulate
    per-configuration results over the folds.

    Here configurations are just arbitrary strings.

    Parameters
    ----------
    mpack: TODO
        TODO
    fold_dict: TODO
        TODO
    slices: iterable of Slice
        Predictions for each configuration, for each fold.
        Folds should be contiguous for maximum efficiency.
        It may be worthwhile to generate this lazily.
    metrics: iterable of {'edges', 'edges_by_label', 'edus', 'cspans'}
        Set of selected metrics.
        For the RST corpus, 'cspans' should not be selected for evaluation
        on the intra, inter and root subsets until I find out how to restrict
        RSTTrees along DataPack.selected().
    adjust_pack: function from DataPack to DataPack, optional
        Function that modifies a DataPack, for example by picking out a
        subset of the pairings.


    Returns
    -------
    rpack: ReportPack
        Group of reports on a set of folds and configurations.

    TODO
    ----
    * [ ] refactor ReportPack so that it contains reports only for the
      metrics selected by the harness
    """
    if not mpack:
        raise ValueError("Can't report with empty multipack")
    edge_count = defaultdict(list)
    cspan_count = defaultdict(list)
    edge_lab_count = defaultdict(lambda: defaultdict(list))
    edu_reports = defaultdict(EduReport)
    dpack0 = mpack.values()[0]
    confusion = defaultdict(lambda: empty_confusion_matrix(dpack0))

    fold = None
    is_first_slice = True

    # avoid slicing the predictions if we can help it (slow)
    adjust_predictions = select_in_pack if adjust_pack else (lambda _, x: x)
    adjust_pack = adjust_pack or (lambda x: x)

    num_edges = {}
    for slc in slices:
        if is_first_slice and slc.fold is None:
            dpacks = [adjust_pack(x) for x in mpack.values()]  # WIP
            fpack = DataPack.vstack(dpacks)
            is_first_slice = False
            num_edges[None] = len(fpack)
        elif is_first_slice or slc.fold != fold:
            f_mpack = select_testing(mpack, fold_dict, slc.fold)
            dpacks = [adjust_pack(x) for x in f_mpack.values()]  # WIP
            fpack = DataPack.vstack(dpacks)
            fold = slc.fold
            num_edges[fold] = len(fpack)
            is_first_slice = False
        key = slc.configuration
        # accumulate scores
        predictions = adjust_predictions(fpack, slc.predictions)
        dpredictions = [adjust_predictions(dpack, slc.predictions)
                        for dpack in dpacks]
        # apply selected metrics
        # * on (dependency) edges
        if 'edges' in metrics:
            edge_count[key].append(score_edges(fpack, predictions))
        # * on constituency tree spans
        if 'cspans' in metrics:
            sc_cspans = score_cspans(dpacks, dpredictions)
            cspan_count[key].append(sc_cspans)
        # * on EDUs
        if 'edus' in metrics:
            edu_reports[key].add(score_edus(fpack, predictions))
        # * (confusion matrix)
        confusion[key] += build_confusion_matrix(fpack, predictions)

        if slc.enable_details:
            # * on edges, by label
            if 'edges_by_label' in metrics:
                details = score_edges_by_label(fpack, predictions)
                for label, lab_count in details:
                    edge_lab_count[key][label].append(lab_count)

    # build combined reports
    # * on edges
    if 'edges' in metrics:
        edge_report = CombinedReport(EdgeReport,
                                     {k: EdgeReport(v)
                                      for k, v in edge_count.items()})
    else:
        edge_report = None

    # * on cspans
    if 'cspans' in metrics:
        cspan_report = CombinedReport(CSpanReport,
                                      {k: CSpanReport(v)
                                       for k, v in cspan_count.items()})
    else:
        cspan_report = None

    # * on EDUs
    if 'edus' in metrics:
        edu_report = CombinedReport(EduReport, edu_reports)
    else:
        edu_report = None

    # * on edges, by label
    if 'edges_by_label' in metrics:
        edge_by_label_report = {}
        for key, counts in edge_lab_count.items():
            report = CombinedReport(LabelReport,
                                    {(label,): LabelReport(vs)
                                     for label, vs in counts.items()})
            edge_by_label_report[key] = report
    else:
        edge_by_label_report = None

    return ReportPack(edge=edge_report,
                      cspan=cspan_report,
                      edge_by_label=edge_by_label_report,
                      edu=edu_report,
                      confusion=confusion,
                      confusion_labels=dpack0.labels,
                      num_edges=sum(num_edges.values()))
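
A hedged sketch of the `adjust_pack` hook, assuming `mpack`, `fold_dict` and `slices` as described in the parameters above: the `intra_only` helper below is hypothetical, built from `DataPack.selected` as used in Example 5, and keeps only pairings whose EDUs share a subgrouping before reporting on edges and EDUs.

def intra_only(dpack):
    "hypothetical helper: keep only intra-sentential pairings"
    # note: this also drops the fake-root pairings if the fake root
    # carries no subgrouping of its own
    idxes = [i for i, (edu1, edu2) in enumerate(dpack.pairings)
             if edu1.subgrouping == edu2.subgrouping]
    return dpack.selected(idxes)

rpack = full_report(mpack, fold_dict, slices,
                    metrics=['edges', 'edus'],
                    adjust_pack=intra_only)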
Example 12
def full_report(mpack, fold_dict, slices,
                adjust_pack=None):
    """
    Generate a report across a set of folds and configurations.

    This is a bit tricky as the idea is that we have to accumulate
    per-configuration results over the folds.

    Here configurations are just arbitrary strings.

    :param slices: the predictions for each configuration, for each fold.
                   Folds should be contiguous for maximum efficiency.
                   It may be worthwhile to generate this lazily
    :type slices: iterable(:py:class:`Slice`)

    :param adjust_pack: (optional) function that modifies a DataPack, for
                        example by picking out a subset of the pairings.
    :type adjust_pack: (DataPack -> DataPack) or None
    """
    if not mpack:
        raise ValueError("Can't report with empty multipack")
    edge_count = defaultdict(list)
    edge_lab_count = defaultdict(lambda: defaultdict(list))
    edu_reports = defaultdict(EduReport)
    dpack0 = mpack.values()[0]
    confusion = defaultdict(lambda: empty_confusion_matrix(dpack0))

    fold = None
    is_first_slice = True

    # avoid slicing the predictions if we can help it (slow)
    adjust_predictions = select_in_pack if adjust_pack else (lambda _, x: x)
    adjust_pack = adjust_pack or (lambda x: x)

    num_edges = {}
    for slc in slices:
        if is_first_slice and slc.fold is None:
            fpack = DataPack.vstack([adjust_pack(x)
                                     for x in mpack.values()])
            is_first_slice = False
            num_edges[None] = len(fpack)
        elif is_first_slice or slc.fold != fold:
            f_mpack = select_testing(mpack, fold_dict, slc.fold)
            fpack = DataPack.vstack([adjust_pack(x)
                                     for x in f_mpack.values()])
            fold = slc.fold
            num_edges[fold] = len(fpack)
            is_first_slice = False
        key = slc.configuration
        # accumulate scores
        predictions = adjust_predictions(fpack, slc.predictions)
        edge_count[key].append(score_edges(fpack, predictions))
        edu_reports[key].add(score_edus(fpack, predictions))
        confusion[key] += build_confusion_matrix(fpack, predictions)
        if slc.enable_details:
            details = score_edges_by_label(fpack, predictions)
            for label, lab_count in details:
                edge_lab_count[key][label].append(lab_count)

    edge_report = CombinedReport(EdgeReport,
                                 {k: EdgeReport(v)
                                  for k, v in edge_count.items()})
    # combine
    edge_by_label_report = {}
    for key, counts in edge_lab_count.items():
        report = CombinedReport(LabelReport,
                                {(label,): LabelReport(vs)
                                 for label, vs in counts.items()})
        edge_by_label_report[key] = report

    return ReportPack(edge=edge_report,
                      edge_by_label=edge_by_label_report or None,
                      edu=CombinedReport(EduReport, edu_reports),
                      confusion=confusion,
                      confusion_labels=dpack0.labels,
                      num_edges=sum(num_edges.values()))
Example 13
def for_intra(dpack, target):
    """Adapt a datapack to intrasentential decoding.

    An intrasentential datapack is almost identical to its original,
    except that we set the label for each ('ROOT', edu) pairing to
    'ROOT' if that edu is a subgrouping head (if it has no parents other
    than 'ROOT' within its subgrouping).

    This should be done before either `for_labelling` or `for_attachment`

    Returns
    -------
    dpack : DataPack

    target : array(int)

    """
    # map EDUs to subgroup ids; intra = pairs of EDUs with same subgroup id
    grp = {e.id: e.subgrouping for e in dpack.edus}
    # find all edus that have intra incoming edges (to rule out)
    unrelated = dpack.label_number(UNRELATED)
    intra_tgts = defaultdict(set)
    for i, (edu1, edu2) in enumerate(dpack.pairings):
        if (grp[edu1.id] == grp[edu2.id] and target[i] != unrelated):
            # edu2 has an incoming relation => not an (intra) root
            intra_tgts[grp[edu2.id]].add(edu2.id)
    # pick out the (fakeroot, edu) pairs where edu does not have
    # incoming intra edges
    all_heads = [
        i for i, (edu1, edu2) in enumerate(dpack.pairings)
        if (edu1.id == FAKE_ROOT_ID and edu2.id not in intra_tgts[grp[edu2.id]]
            )
    ]
    # NEW pick out the original inter-sentential links, for removal
    inter_links = [
        i for i, (edu1, edu2) in enumerate(dpack.pairings)
        if (edu1.id != FAKE_ROOT_ID and grp[edu1.id] != grp[edu2.id]
            and target[i] != unrelated)
    ]

    # update datapack and target accordingly
    new_target = np.copy(dpack.target)
    new_target[all_heads] = dpack.label_number('ROOT')
    new_target[inter_links] = unrelated  # NEW
    # WIP ctarget
    new_ctarget = {grp_name: ctgt for grp_name, ctgt in dpack.ctarget.items()}
    # FIXME replace each ctgt with the list of intra-sentential
    # RST (sub)trees
    # end WIP ctarget
    dpack = DataPack(edus=dpack.edus,
                     pairings=dpack.pairings,
                     data=dpack.data,
                     target=new_target,
                     ctarget=new_ctarget,
                     labels=dpack.labels,
                     vocab=dpack.vocab,
                     graph=dpack.graph)
    target = np.copy(target)
    target[all_heads] = dpack.label_number('ROOT')
    target[inter_links] = unrelated  # NEW
    return dpack, target
Example 14
    def fit(self, dpacks, targets):
        dpack = DataPack.vstack(dpacks)
        target = np.concatenate(targets)
        self._learner.fit(dpack.data, target)
        self._fitted = True
        return self