def convert_prediction(dpack, triples):
    """Populate a datapack prediction array from a list of triples.

    Parameters
    ----------
    dpack : DataPack
        Datapack whose pairings should be labelled.
    triples : list of (string, string, string)
        List of (EDU id, EDU id, label) triples; any pairing absent
        from this list is labelled UNRELATED.

    Returns
    -------
    dpack : DataPack
        A copy of the original DataPack with predictions set.
    """
    link_map = {(id1, id2): lab for id1, id2, lab in triples}

    def get_lbl(pair):
        'from edu pair to label number'
        edu1, edu2 = pair
        key = edu1.id, edu2.id
        # pairs not in the triple list are considered unattached
        lbl = link_map.get(key, UNRELATED)
        return dpack.label_number(lbl)

    prediction = np.fromiter((get_lbl(pair) for pair in dpack.pairings),
                             dtype=np.dtype(np.int16))
    # keep the existing attachment/labelling scores, only swap predictions
    graph = Graph(prediction=prediction,
                  attach=dpack.graph.attach,
                  label=dpack.graph.label)
    return dpack.set_graph(graph)
def _classify(self, dpack, X, W, nonfixed_pairs=None):
    """Return the predicted tree as a list of edge triples.

    Only the pairs listed in `nonfixed_pairs` (all pairs, by default)
    have their scores recomputed; other pairs keep the scores already
    stored in the datapack's graph.
    """
    n_pairs = len(dpack)
    if nonfixed_pairs is None:
        nonfixed_pairs = np.arange(n_pairs)

    # start from the existing graph if there is one, else from blanks
    if dpack.graph is not None:
        attach = np.copy(dpack.graph.attach)
        lbl_scores = np.copy(dpack.graph.label)
        pred = np.copy(dpack.graph.prediction)
    else:
        attach = np.zeros(n_pairs)
        lbl_scores = np.zeros((n_pairs, len(dpack.labels)))
        pred = np.empty(n_pairs)

    # attachment scores for the non-fixed EDU pairs
    # TODO should this be self.decision_function?
    # reshape drops the singleton 2nd dim (shape[1] == 1) of the dot product
    attach[nonfixed_pairs] = (
        X[nonfixed_pairs].dot(W.T).reshape(len(nonfixed_pairs)))

    # dummy labelling for unlabelled parsing: UNKNOWN gets score 1.0,
    # every other label 0.0, and UNKNOWN is the placeholder prediction
    # (the decoder overwrites it)
    unk = dpack.label_number(UNKNOWN)
    lbl_scores[nonfixed_pairs] = 0.0
    lbl_scores[nonfixed_pairs, unk] = 1.0
    pred[nonfixed_pairs] = unk

    dpack = dpack.set_graph(
        Graph(prediction=pred, attach=attach, label=lbl_scores))
    # hand off to the decoder and read back its edges
    return prediction_to_triples(self.decoder.transform(dpack))
def _fix_intra_edges(self, dpack, spacks):
    """Fix intra-sentential edges for inter-sentential parsing.

    Scores are set to 1.0 for both attachment and labelling, for
    intra-sentential links.

    Parameters
    ----------
    dpack : DataPack
        Original datapack.
    spacks : list of DataPack
        List of intra-sentential datapacks, containing
        intra-sentential predictions.

    Returns
    -------
    dpack_copy : DataPack
        Copy of dpack with attachment and labelling scores updated.

    FIXME
    -----
    [ ] generalize to support non-probabilistic scores
    """
    # NB this code was moved here from SoftParser._recombine()
    # it probably leaves room for improvement, notably speedups
    unrelated_lbl = dpack.label_number(UNRELATED)
    sent_lbl = self._mk_get_lbl(dpack, spacks)

    # tweak intra-sentential attachment and labelling scores
    weights_a = np.copy(dpack.graph.attach)
    weights_l = np.copy(dpack.graph.label)
    for i, (edu1, _) in enumerate(dpack.pairings):
        if edu1.id == FAKE_ROOT_ID:
            # don't confuse the inter parser with sentence roots
            continue
        lbl = sent_lbl(i)
        if lbl is not None and lbl != unrelated_lbl:
            weights_a[i] = 1.0
            weights_l[i] = np.zeros(len(dpack.labels))
            weights_l[i, lbl] = 1.0
    # NOTE(review): a dead `if False:` branch used to live here,
    # inherited from learning.oracle; it set every intra-sentential
    # attachment to 1.0 in one shot via
    # ``weights_a[idxes_intra(dpack, include_fake_root=False)] = 1.0``
    # regardless of the intra predictions.  That is NOT equivalent to
    # the loop above (which only boosts pairs actually predicted as
    # related), so it was removed rather than revived.

    graph = Graph(prediction=dpack.graph.prediction,
                  attach=weights_a,
                  label=weights_l)
    dpack_copy = dpack.set_graph(graph)
    return dpack_copy
def _classify(self, dpack, X, W):
    """Return the predicted tree as a list of edge triples."""
    n_pairs = len(dpack)

    # attachment scores for every EDU pair
    # TODO: should this be self.decision_function?
    # reshape drops the singleton 2nd dimension (shape[1] == 1)
    attach_scores = X.dot(W.T).reshape(n_pairs)

    # unlabelled parsing: give UNKNOWN a labelling score of 1.0
    # everywhere and use it as the placeholder prediction
    unk = dpack.label_number(UNKNOWN)
    lbl_scores = np.zeros((n_pairs, len(dpack.labels)))
    lbl_scores[:, unk] = 1.0
    pred = np.empty(n_pairs)
    pred[:] = unk

    dpack = dpack.set_graph(
        Graph(prediction=pred, attach=attach_scores, label=lbl_scores))
    decoded = self.decoder.transform(dpack)
    return prediction_to_triples(decoded)
def _recombine(self, dpack, spacks):
    """Soft decoding: pass sentence edges through the prob dist."""
    unrelated = dpack.label_number(UNRELATED)
    sent_lbl = self._mk_get_lbl(dpack, spacks)

    attach = np.copy(dpack.graph.attach)
    label = np.copy(dpack.graph.label)
    for idx, (src, _) in enumerate(dpack.pairings):
        # don't confuse the inter parser with sentence roots
        if src.id == FAKE_ROOT_ID:
            continue
        lbl = sent_lbl(idx)
        if lbl is None or lbl == unrelated:
            continue
        # force this intra-sentential edge: attachment certain,
        # all labelling mass on the predicted label
        attach[idx] = 1.0
        label[idx] = np.zeros(len(dpack.labels))
        label[idx, lbl] = 1.0

    graph = Graph(prediction=dpack.graph.prediction,
                  attach=attach,
                  label=label)
    return self._parsers.inter.transform(dpack.set_graph(graph))
def multiply(dpack, attach=None, label=None):
    """Combine the given scores with the datapack's existing ones.

    If the datapack is weighted, multiply its existing probabilities
    elementwise by the given ones; otherwise install the given arrays
    directly.

    Parameters
    ----------
    attach (array(float), optional)
        If unset will default to ones
    label (2D array(float), optional)
        If unset will default to ones

    Returns
    -------
    The modified datapack
    """
    n_pairs = len(dpack)
    if dpack.graph is None:
        # no existing scores: take the given arrays (or all-ones),
        # and mark every prediction as UNKNOWN
        attach = np.ones(n_pairs) if attach is None else attach
        if label is None:
            label = np.ones((n_pairs, len(dpack.labels)))
        prediction = np.empty(n_pairs)
        prediction[:] = dpack.label_number(UNKNOWN)
    else:
        # existing scores: multiply in whichever arrays were supplied
        gra = dpack.graph
        prediction = gra.prediction
        attach = (gra.attach if attach is None
                  else np.multiply(attach, gra.attach))
        label = (gra.label if label is None
                 else np.multiply(label, gra.label))
    return dpack.set_graph(Graph(prediction=prediction,
                                 attach=attach,
                                 label=label))