Esempio n. 1
0
    def train(self, conll_path):
        """Run one shuffled pass over the CoNLL data and update the model.

        Sentences are read with ``read_conll(conll_path)``, shuffled, and
        processed one at a time:

        1. Every ``ConllEntry`` gets an input vector made of its word lookup
           (randomly replaced by index 0, see below) and its POS lookup.
        2. If ``self.blstm_flag`` is set the vectors are run through
           ``self.builders``; if ``self.bibi_flag`` is also set, the two
           directions are concatenated and run through ``self.bbuilders``.
        3. Arcs are scored with ``self._evaluate`` and decoded with
           ``decoder.parse_proj`` (given the gold heads when
           ``self.costaug_flag`` is set).
        4. Each token whose predicted head differs from the gold head adds
           ``score(predicted) - score(gold)`` to the loss.  When
           ``self.labels_flag`` is set, a margin-1 hinge term on the relation
           label of every gold arc is added as well.
        5. If the sentence produced any loss term, the summed loss is
           back-propagated and ``self.trainer.update()`` is called.  The
           computation graph is renewed after every sentence.

        A progress line is printed every 100 sentences, and the average number
        of head errors per sentence is printed at the end (``0.0`` when the
        data contains no sentences).

        Args:
            conll_path: Passed unchanged to ``read_conll``.
        """
        window_loss = 0.0    # head errors since the last progress line
        total_loss = 0.0     # head errors over the whole pass
        window_errors = 0
        window_tokens = 0
        start = time.time()

        shuffled_data = list(read_conll(conll_path))
        random.shuffle(shuffled_data)
        arc_losses = []
        label_losses = []

        def run_bilstm(builders, tokens):
            # Feed the tokens left-to-right into builders[0] and right-to-left
            # into builders[1] in lockstep; slot 1 of ``lstms`` receives the
            # forward output and slot 0 the backward output.
            forward = builders[0].initial_state()
            backward = builders[1].initial_state()
            for entry, rentry in zip(tokens, reversed(tokens)):
                forward = forward.add_input(entry.vec)
                backward = backward.add_input(rentry.vec)
                entry.lstms[1] = forward.output()
                rentry.lstms[0] = backward.output()

        for i_sentence, sentence in enumerate(shuffled_data):
            if i_sentence % 100 == 0 and i_sentence != 0:
                # max(..., 1) only matters if the whole window had no tokens;
                # it prevents a ZeroDivisionError in that case.
                denom = max(window_tokens, 1)
                print('Processing sentence number:', i_sentence, 'Loss:',
                      window_loss / denom, 'Errors:',
                      float(window_errors) / denom, 'Time',
                      time.time() - start)
                start = time.time()
                window_errors = 0
                window_loss = 0.0
                window_tokens = 0

            conll_sentence = [
                entry for entry in sentence if isinstance(entry, ConllEntry)
            ]

            for entry in conll_sentence:
                # A word seen ``count`` times keeps its own embedding with
                # probability count / (0.25 + count); otherwise row 0 is used
                # (presumably the unknown-word slot -- confirm against vocab).
                count = float(self.words_count.get(entry.norm, 0))
                keep_word = random.random() < (count / (0.25 + count))
                if self.wdims > 0:
                    word_id = int(self.vocab.get(entry.norm, 0)) if keep_word else 0
                    wordvec = self.wlookup[word_id]
                else:
                    wordvec = None
                posvec = (self.plookup[int(self.pos[entry.pos])]
                          if self.pdims > 0 else None)

                # Explicit None test: do not depend on the truthiness of
                # expression objects.
                entry.vec = concatenate(
                    [vec for vec in (wordvec, posvec) if vec is not None])

                entry.lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None

                entry.rheadfov = None
                entry.rmodfov = None

            if self.blstm_flag:
                run_bilstm(self.builders, conll_sentence)

                if self.bibi_flag:
                    # Second layer consumes both directions of the first.
                    for entry in conll_sentence:
                        entry.vec = concatenate(entry.lstms)
                    run_bilstm(self.bbuilders, conll_sentence)

            scores, exprs = self._evaluate(conll_sentence)
            gold = [entry.parent_id for entry in conll_sentence]
            heads = decoder.parse_proj(scores,
                                       gold if self.costaug_flag else None)

            if self.labels_flag:
                # Index 0 is skipped; labels are scored on the gold arcs.
                for modifier, head in enumerate(gold[1:], start=1):
                    rscores, rexprs = self._evaluate_label(
                        conll_sentence, head, modifier)
                    gold_label = self.rels[conll_sentence[modifier].relation]
                    # Highest-scoring label other than the gold one.
                    wrong_label = max(
                        ((label, scr) for label, scr in enumerate(rscores)
                         if label != gold_label),
                        key=itemgetter(1))[0]
                    if rscores[gold_label] < rscores[wrong_label] + 1:
                        label_losses.append(
                            rexprs[wrong_label] - rexprs[gold_label])

            n_wrong = sum(1 for h, g in zip(heads[1:], gold[1:]) if h != g)
            window_errors += n_wrong
            if n_wrong > 0:
                arc_losses.extend(
                    exprs[h][i] - exprs[g][i]
                    for i, (h, g) in enumerate(zip(heads, gold))
                    if h != g)
                window_loss += n_wrong
                total_loss += n_wrong

            window_tokens += len(conll_sentence)

            # The original guard here (``i_sentence % 1 == 0 or errs > 0 ...``)
            # was always true, so the loss is flushed after every sentence.
            if arc_losses or label_losses:
                batch_loss = esum(arc_losses + label_losses)
                batch_loss.scalar_value()  # evaluate the loss; value unused
                batch_loss.backward()
                self.trainer.update()
                arc_losses = []
                label_losses = []

            renew_cg()

        # Both loss lists are emptied after every sentence, so there is never
        # anything left to flush at this point.
        # NOTE(review): this extra update has no fresh gradient behind it; it
        # is kept to preserve the original call sequence -- confirm intent.
        self.trainer.update()

        n_sentences = len(shuffled_data)
        print("Loss: ", total_loss / n_sentences if n_sentences else 0.0)
Esempio n. 2
0
    def train(self, conll_path):
        """Run one shuffled pass over the CoNLL data and update the model.

        Sentences are read with ``read_conll(conll_path)``, shuffled, and
        processed one at a time:

        1. Every ``ConllEntry`` gets an input vector made of its word lookup
           (randomly replaced by index 0, see below) and its POS lookup.
        2. If ``self.blstm_flag`` is set the vectors are run through
           ``self.builders``; if ``self.bibi_flag`` is also set, the two
           directions are concatenated and run through ``self.bbuilders``.
        3. Arcs are scored with ``self._evaluate`` and decoded with
           ``decoder.parse_proj`` (given the gold heads when
           ``self.costaug_flag`` is set).
        4. Each token whose predicted head differs from the gold head adds
           ``score(predicted) - score(gold)`` to the loss.  When
           ``self.labels_flag`` is set, a margin-1 hinge term on the relation
           label of every gold arc is added as well.
        5. If the sentence produced any loss term, the summed loss is
           back-propagated and ``self.trainer.update()`` is called.  The
           computation graph is renewed after every sentence.

        A progress line is printed every 100 sentences, and the average number
        of head errors per sentence is printed at the end (``0.0`` when the
        data contains no sentences).

        Args:
            conll_path: Passed unchanged to ``read_conll``.
        """
        window_loss = 0.0    # head errors since the last progress line
        total_loss = 0.0     # head errors over the whole pass
        window_errors = 0
        window_tokens = 0
        start = time.time()

        shuffled_data = list(read_conll(conll_path))
        random.shuffle(shuffled_data)
        arc_losses = []
        label_losses = []

        def run_bilstm(builders, tokens):
            # Feed the tokens left-to-right into builders[0] and right-to-left
            # into builders[1] in lockstep; slot 1 of ``lstms`` receives the
            # forward output and slot 0 the backward output.
            forward = builders[0].initial_state()
            backward = builders[1].initial_state()
            for entry, rentry in zip(tokens, reversed(tokens)):
                forward = forward.add_input(entry.vec)
                backward = backward.add_input(rentry.vec)
                entry.lstms[1] = forward.output()
                rentry.lstms[0] = backward.output()

        for i_sentence, sentence in enumerate(shuffled_data):
            if i_sentence % 100 == 0 and i_sentence != 0:
                # max(..., 1) only matters if the whole window had no tokens;
                # it prevents a ZeroDivisionError in that case.
                denom = max(window_tokens, 1)
                print('Processing sentence number:', i_sentence, 'Loss:',
                      window_loss / denom, 'Errors:',
                      float(window_errors) / denom, 'Time', time.time() - start)
                start = time.time()
                window_errors = 0
                window_loss = 0.0
                window_tokens = 0

            conll_sentence = [entry for entry in sentence if isinstance(entry, ConllEntry)]

            for entry in conll_sentence:
                # A word seen ``count`` times keeps its own embedding with
                # probability count / (0.25 + count); otherwise row 0 is used
                # (presumably the unknown-word slot -- confirm against vocab).
                count = float(self.words_count.get(entry.norm, 0))
                keep_word = random.random() < (count / (0.25 + count))
                if self.wdims > 0:
                    word_id = int(self.vocab.get(entry.norm, 0)) if keep_word else 0
                    wordvec = self.wlookup[word_id]
                else:
                    wordvec = None
                posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None

                # Explicit None test: do not depend on the truthiness of
                # expression objects.
                entry.vec = concatenate([vec for vec in (wordvec, posvec) if vec is not None])

                entry.lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None

                entry.rheadfov = None
                entry.rmodfov = None

            if self.blstm_flag:
                run_bilstm(self.builders, conll_sentence)

                if self.bibi_flag:
                    # Second layer consumes both directions of the first.
                    for entry in conll_sentence:
                        entry.vec = concatenate(entry.lstms)
                    run_bilstm(self.bbuilders, conll_sentence)

            scores, exprs = self._evaluate(conll_sentence)
            gold = [entry.parent_id for entry in conll_sentence]
            heads = decoder.parse_proj(scores, gold if self.costaug_flag else None)

            if self.labels_flag:
                # Index 0 is skipped; labels are scored on the gold arcs.
                for modifier, head in enumerate(gold[1:], start=1):
                    rscores, rexprs = self._evaluate_label(conll_sentence, head, modifier)
                    gold_label = self.rels[conll_sentence[modifier].relation]
                    # Highest-scoring label other than the gold one.
                    wrong_label = max(((label, scr) for label, scr in enumerate(rscores)
                                       if label != gold_label), key=itemgetter(1))[0]
                    if rscores[gold_label] < rscores[wrong_label] + 1:
                        label_losses.append(rexprs[wrong_label] - rexprs[gold_label])

            n_wrong = sum(1 for h, g in zip(heads[1:], gold[1:]) if h != g)
            window_errors += n_wrong
            if n_wrong > 0:
                arc_losses.extend(exprs[h][i] - exprs[g][i]
                                  for i, (h, g) in enumerate(zip(heads, gold)) if h != g)
                window_loss += n_wrong
                total_loss += n_wrong

            window_tokens += len(conll_sentence)

            # The original guard here (``i_sentence % 1 == 0 or errs > 0 ...``)
            # was always true, so the loss is flushed after every sentence.
            if arc_losses or label_losses:
                batch_loss = esum(arc_losses + label_losses)
                batch_loss.scalar_value()  # evaluate the loss; value unused
                batch_loss.backward()
                self.trainer.update()
                arc_losses = []
                label_losses = []

            renew_cg()

        # Both loss lists are emptied after every sentence, so there is never
        # anything left to flush at this point.
        # NOTE(review): this extra update has no fresh gradient behind it; it
        # is kept to preserve the original call sequence -- confirm intent.
        self.trainer.update()

        n_sentences = len(shuffled_data)
        print("Loss: ", total_loss / n_sentences if n_sentences else 0.0)
Esempio n. 3
0
    def predict(self, conll_path=None, conll=None):
        """Annotate sentences with predicted heads and relations, lazily.

        This is a generator.  For every sentence it builds the token vectors
        (word and POS lookups), runs the optional BiLSTM layers, scores arcs
        with ``self._evaluate`` and decodes them with ``decoder.parse_proj``.
        Each ``ConllEntry`` receives ``pred_parent_id`` and ``pred_relation``
        (``'_'`` unless ``self.labels_flag`` is set, in which case the
        best-scoring label from ``self._evaluate_label`` is looked up in
        ``self.irels``).  The computation graph is renewed before the
        sentence is yielded.

        Args:
            conll_path: Passed to ``read_conll`` when ``conll`` is None.
            conll: Optional iterable of sentences; takes precedence over
                ``conll_path``.

        Yields:
            The original sentence object, with its ``ConllEntry`` items
            annotated in place.
        """
        # pylint: disable=missing-docstring
        sentences = read_conll(conll_path) if conll is None else conll

        def run_bilstm(builders, tokens):
            # Forward state walks the tokens in order, backward state walks
            # them in reverse; both advance together, one token per step.
            fwd_state = builders[0].initial_state()
            bwd_state = builders[1].initial_state()
            for left, right in zip(tokens, reversed(tokens)):
                fwd_state = fwd_state.add_input(left.vec)
                bwd_state = bwd_state.add_input(right.vec)

                left.lstms[1] = fwd_state.output()
                right.lstms[0] = bwd_state.output()

        for sentence in sentences:
            tokens = [item for item in sentence if isinstance(item, ConllEntry)]

            for token in tokens:
                if self.wdims > 0:
                    wordvec = self.wlookup[int(self.vocab.get(token.norm, 0))]
                else:
                    wordvec = None
                if self.pdims > 0:
                    posvec = self.plookup[int(self.pos[token.pos])]
                else:
                    posvec = None
                # Drop the components that are switched off (falsy entries).
                token.vec = concatenate(
                    list(filter(None, [wordvec, posvec, None])))

                token.lstms = [token.vec, token.vec]
                token.headfov = token.modfov = None
                token.rheadfov = token.rmodfov = None

            if self.blstm_flag:
                run_bilstm(self.builders, tokens)

                if self.bibi_flag:
                    # The second layer reads both directions of the first.
                    for token in tokens:
                        token.vec = concatenate(token.lstms)
                    run_bilstm(self.bbuilders, tokens)

            arc_scores, _ = self._evaluate(tokens)
            heads = decoder.parse_proj(arc_scores)

            for token, head in zip(tokens, heads):
                token.pred_parent_id = head
                token.pred_relation = '_'

            if self.labels_flag:
                # Position 0 is skipped; every other token gets a label.
                for position, head in enumerate(heads[1:], start=1):
                    label_scores, _ = self._evaluate_label(
                        tokens, head, position)
                    best_label = max(enumerate(label_scores),
                                     key=itemgetter(1))[0]
                    tokens[position].pred_relation = self.irels[best_label]

            renew_cg()
            yield sentence
Esempio n. 4
0
    def predict(self, conll_path=None, conll=None):
        """Annotate sentences with predicted heads and relations, lazily.

        This is a generator.  For every sentence it builds the token vectors
        (word and POS lookups), runs the optional BiLSTM layers, scores arcs
        with ``self._evaluate`` and decodes them with ``decoder.parse_proj``.
        Each ``ConllEntry`` receives ``pred_parent_id`` and ``pred_relation``
        (``'_'`` unless ``self.labels_flag`` is set, in which case the
        best-scoring label from ``self._evaluate_label`` is looked up in
        ``self.irels``).  The computation graph is renewed before the
        sentence is yielded.

        Args:
            conll_path: Passed to ``read_conll`` when ``conll`` is None.
            conll: Optional iterable of sentences; takes precedence over
                ``conll_path``.

        Yields:
            The original sentence object, with its ``ConllEntry`` items
            annotated in place.
        """
        # pylint: disable=missing-docstring
        sentences = read_conll(conll_path) if conll is None else conll

        def run_bilstm(builders, tokens):
            # Forward state walks the tokens in order, backward state walks
            # them in reverse; both advance together, one token per step.
            fwd_state = builders[0].initial_state()
            bwd_state = builders[1].initial_state()
            for left, right in zip(tokens, reversed(tokens)):
                fwd_state = fwd_state.add_input(left.vec)
                bwd_state = bwd_state.add_input(right.vec)

                left.lstms[1] = fwd_state.output()
                right.lstms[0] = bwd_state.output()

        for sentence in sentences:
            tokens = [item for item in sentence if isinstance(item, ConllEntry)]

            for token in tokens:
                if self.wdims > 0:
                    wordvec = self.wlookup[int(self.vocab.get(token.norm, 0))]
                else:
                    wordvec = None
                if self.pdims > 0:
                    posvec = self.plookup[int(self.pos[token.pos])]
                else:
                    posvec = None
                # Drop the components that are switched off (falsy entries).
                token.vec = concatenate(
                    list(filter(None, [wordvec, posvec, None])))

                token.lstms = [token.vec, token.vec]
                token.headfov = token.modfov = None
                token.rheadfov = token.rmodfov = None

            if self.blstm_flag:
                run_bilstm(self.builders, tokens)

                if self.bibi_flag:
                    # The second layer reads both directions of the first.
                    for token in tokens:
                        token.vec = concatenate(token.lstms)
                    run_bilstm(self.bbuilders, tokens)

            arc_scores, _ = self._evaluate(tokens)
            heads = decoder.parse_proj(arc_scores)

            for token, head in zip(tokens, heads):
                token.pred_parent_id = head
                token.pred_relation = '_'

            if self.labels_flag:
                # Position 0 is skipped; every other token gets a label.
                for position, head in enumerate(heads[1:], start=1):
                    label_scores, _ = self._evaluate_label(tokens, head, position)
                    best_label = max(enumerate(label_scores), key=itemgetter(1))[0]
                    tokens[position].pred_relation = self.irels[best_label]

            renew_cg()
            yield sentence