Example #1
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP, False)):
                self.Init()
                forest = ParseForest(sentence)
                self.getWordEmbeddings(forest, False)

                for root in forest.roots:
                    root.lstms = [self.builders[0].initial_state().add_input(root.vec),
                                  self.builders[1].initial_state().add_input(root.vec)]

                while len(forest.roots) > 1:

                    self.__evaluate(forest, False)
                    bestParent, bestChild, bestScore = None, None, float("-inf")
                    bestIndex, bestOp = None, None
                    roots = forest.roots

                    for i in xrange(len(forest.roots) - 1):
                        for irel, rel in enumerate(self.irels):
                            for op in xrange(2):
                                if bestScore < roots[i].scores[irel][op] and (i + (1 - op)) > 0:
                                    bestParent, bestChild = i + op, i + (1 - op)
                                    bestScore = roots[i].scores[irel][op]
                                    bestIndex, bestOp = i, op
                                    bestRelation, bestIRelation = rel, irel

                    for j in xrange(max(0, bestIndex - self.k - 1), min(len(forest.roots), bestIndex + self.k + 2)):
                        roots[j].scores = None

                    roots[bestChild].pred_parent_id = forest.roots[bestParent].id
                    roots[bestChild].pred_relation = bestRelation

                    roots[bestParent].lstms[bestOp] = roots[bestParent].lstms[bestOp].add_input(
                        self.activation(self.lstm2lstmbias + self.lstm2lstm *
                            concatenate([roots[bestChild].lstms[0].output(),
                                         lookup(self.model["rels-lookup"], bestIRelation),
                                         roots[bestChild].lstms[1].output()])))

                    forest.Attach(bestParent, bestChild)

                renew_cg()
                yield sentence
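
A minimal, self-contained sketch (my stand-in with dummy scores, not the source's code) of the argmax grid above: for each position i, op 0 attaches roots[i+1] under roots[i], op 1 attaches roots[i] under roots[i+1], and the artificial root at index 0 may never be a child.

    # hypothetical scores[i][irel][op]; values are for illustration only
    scores = {0: [[0.1, 0.4], [0.2, 0.3]],
              1: [[0.9, 0.0], [0.5, 0.6]]}
    best = max((score, i, irel, op)
               for i, per_rel in scores.items()
               for irel, per_op in enumerate(per_rel)
               for op, score in enumerate(per_op)
               if i + (1 - op) > 0)  # the child may never be index 0
    score, i, irel, op = best
    print i + op, i + (1 - op), irel, score  # parent, child, relation -> 1 2 0 0.9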
Example #2
    def predict(self, sentences):
        self.getWordEmbeddings(sentences, False)

        for sentence in sentences:
            stack = ParseForest([])
            buf = ParseForest(sentence)
            for root in sentence:
                root.lstms = [root.vec for _ in range(self.nnvecs)]
            hoffset = 1 if self.headFlag else 0

            while not (len(buf) == 1 and len(stack) == 0):
                scores = self.__evaluate(stack, buf, False)
                best = max(chain(*scores), key=itemgetter(2))
                if best[1] == 2:
                    stack.roots.append(buf.roots[0])
                    del buf.roots[0]
                elif best[1] == 0:
                    child = stack.roots.pop()
                    parent = buf.roots[0]
                    child.pred_parent_id = parent.id
                    child.pred_relation = best[0]
                    bestOp = 0
                    if self.rlMostFlag:
                        parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                    if self.rlFlag:
                        parent.lstms[bestOp + hoffset] = child.vec
                elif best[1] == 1:
                    child = stack.roots.pop()
                    parent = stack.roots[-1]
                    child.pred_parent_id = parent.id
                    child.pred_relation = best[0]
                    bestOp = 1
                    if self.rlMostFlag:
                        parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                    if self.rlFlag:
                        parent.lstms[bestOp + hoffset] = child.vec
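
The selection max(chain(*scores), key=itemgetter(2)) relies on a (relation, transition, score, expression) tuple layout; a small self-contained sketch with made-up scores, using the transition codes seen above (0 LEFT-ARC, 1 RIGHT-ARC, 2 SHIFT):

    from itertools import chain
    from operator import itemgetter

    # scores: one list of (relation, transition, score, expr) tuples per transition
    scores = [[('nsubj', 0, 1.2, None)],
              [('obj', 1, 0.7, None)],
              [(None, 2, 2.5, None)]]
    best = max(chain(*scores), key=itemgetter(2))
    assert best == (None, 2, 2.5, None)  # SHIFT wins here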
Example #3
    def Train(self, trainData, options):
        mloss = 0.0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ninf = -float('inf')

        ts = time()
        start = ts

        random.shuffle(trainData) # in certain cases the data will already have been shuffled after being read from file or while creating dev data
        print "Length of training data: ", len(trainData)

        errs = []

        self.feature_extractor.Init(options)

        for iSentence, sentence in enumerate(trainData,1):
            if iSentence % 100 == 0:
                loss_message = 'Processing sentence number: %d'%iSentence + \
                ' Loss: %.3f'%(eloss / etotal)+ \
                ' Errors: %.3f'%((float(eerrors)) / etotal)+\
                ' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\
                ' Time: %.2gs'%(time()-start)
                print loss_message
                start = time()
                eerrors = 0
                eloss = 0.0
                etotal = 0
                lerrors = 0

            sentence = deepcopy(sentence) # ensures we are working with a clean copy of sentence and allows memory to be recycled each time round the loop

            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
            self.feature_extractor.getWordEmbeddings(conll_sentence, True, options)
            stack = ParseForest([])
            buf = ParseForest(conll_sentence)
            hoffset = 1 if self.headFlag else 0

            for root in conll_sentence:
                root.lstms = [root.vec] if self.headFlag else []
                if not self.recursive_composition:
                    root.lstms += [self.feature_extractor.paddingVec for _ in range(self.nnvecs - hoffset)]
                else:
                    root.lstms += [root.vec]
                    root.lstm = None


            while not (len(buf) == 1 and len(stack) == 0):
                scores = self.__evaluate(stack, buf, True)

                #to ensure that we have at least one wrong operation
                scores.append([(None, 4, ninf, None)])

                stack_ids = [sitem.id for sitem in stack.roots]

                s1 = [stack.roots[-2]] if len(stack) > 1 else []
                s0 = [stack.roots[-1]] if len(stack) > 0 else []
                b = [buf.roots[0]] if len(buf) > 0 else []
                beta = buf.roots[1:] if len(buf) > 1 else []

                costs, shift_case = self.calculate_cost(scores,s0,s1,b,beta,stack_ids)

                bestValid = list(( s for s in chain(*scores) if costs[s[1]] == 0 and ( s[1] == SHIFT or s[1] == SWAP or  s[0] == s0[0].relation ) ))

                bestValid = max(bestValid, key=itemgetter(2))
                bestWrong = max(( s for s in chain(*scores) if costs[s[1]] != 0 or ( s[1] != SHIFT and s[1] != SWAP and s[0] != s0[0].relation ) ), key=itemgetter(2))

                #force swap
                if costs[SWAP] == 0:
                    best = bestValid
                else:
                    #select a transition to follow
                    # + aggressive exploration
                    #1: might want to experiment with that parameter
                    if bestWrong[1] == SWAP:
                        best = bestValid
                    else:
                        best = bestValid if ( (not self.oracle) or (bestValid[2] - bestWrong[2] > 1.0) or (bestValid[2] > bestWrong[2] and random.random() > 0.1) ) else bestWrong

                if best[1] == LEFT_ARC or best[1] == RIGHT_ARC:
                    child = s0[0]

                #updates for the dynamic oracle
                if self.oracle:
                    self.oracle_updates(best,b,s0,stack_ids,shift_case)

                self.apply_transition(best,stack,buf,hoffset)

                if bestValid[2] < bestWrong[2] + 1.0:
                    loss = bestWrong[3] - bestValid[3]
                    mloss += 1.0 + bestWrong[2] - bestValid[2]
                    eloss += 1.0 + bestWrong[2] - bestValid[2]
                    errs.append(loss)

                #labeled errors
                if best[1] == LEFT_ARC or best[1] == RIGHT_ARC:
                    if (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation):
                        lerrors += 1
                        #attachment error
                        if child.pred_parent_id != child.parent_id:
                            eerrors += 1

                #??? when did this happen and why?
                if best[1] == 0 or best[1] == 2:
                    etotal += 1

            #footnote 8 in Eli's original paper
            if len(errs) > 50: # or True:
                eerrs = dy.esum(errs)
                scalar_loss = eerrs.scalar_value() #forward
                eerrs.backward()
                self.trainer.update()
                errs = []
                lerrs = []

                dy.renew_cg()
                self.feature_extractor.Init(options)

        if len(errs) > 0:
            eerrs = (dy.esum(errs))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []

            dy.renew_cg()

        self.trainer.update()
        print "Loss: ", mloss/iSentence
        print "Total training time: %.2fs"%(time()-ts)
Example #4
    def Predict(self, treebanks, datasplit, options):
        reached_max_swap = 0
        char_map = {}
        if options.char_map_file:
            char_map_fh = codecs.open(options.char_map_file,encoding='utf-8')
            char_map = json.loads(char_map_fh.read())
        # should probably use a namedtuple in get_vocab to make this prettier
        print "Collecting test data vocab"
        _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map)
        # get external embeddings for the set of words and chars in the test vocab but not in the training vocab
        test_embeddings = defaultdict(lambda:{})
        if options.word_emb_size > 0:
            new_test_words = set(test_words) - self.feature_extractor.words.viewkeys()
            print "Number of OOV word types at test time: %i (out of %i)"%(len(new_test_words),len(test_words))
            if len(new_test_words) > 0: # no point loading embeddings if there are no words to look for
                for lang in test_langs:
                    test_embeddings["words"].update(utils.get_external_embeddings(options,lang,new_test_words))
                if len(test_langs) > 1 and test_embeddings["words"]:
                    print "External embeddings found for %i words (out of %i)"%(len(test_embeddings["words"]),len(new_test_words))
        if options.char_emb_size > 0:
            new_test_chars = set(test_chars) - self.feature_extractor.chars.viewkeys()
            print "Number of OOV char types at test time: %i (out of %i)"%(len(new_test_chars),len(test_chars))
            if len(new_test_chars) > 0:
                for lang in test_langs:
                    test_embeddings["chars"].update(utils.get_external_embeddings(options,lang,new_test_chars,chars=True))
                if len(test_langs) > 1 and test_embeddings["chars"]:
                    print "External embeddings found for %i chars (out of %i)"%(len(test_embeddings["chars"]),len(new_test_chars))

        ts = time()
        data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map)
        for iSentence, osentence in enumerate(data,1):
            sentence = deepcopy(osentence)
            reached_swap_for_i_sentence = False
            max_swap = 2*len(sentence)
            iSwap = 0
            self.feature_extractor.Init(options)
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
            self.feature_extractor.getWordEmbeddings(conll_sentence, False, options, test_embeddings)
            stack = ParseForest([])
            buf = ParseForest(conll_sentence)

            hoffset = 1 if self.headFlag else 0

            for root in conll_sentence:
                root.lstms = [root.vec] if self.headFlag else []
                if not self.recursive_composition:
                    root.lstms += [self.feature_extractor.paddingVec for _ in range(self.nnvecs - hoffset)]
                else:
                    root.lstms += [root.vec]
                    root.lstm = None #only necessary for treeLSTM case
                    root.composed_rep = root.vec.value()

            while not (len(buf) == 1 and len(stack) == 0):
                scores = self.__evaluate(stack, buf, False)
                best = max(chain(*(scores if iSwap < max_swap else scores[:3] )), key = itemgetter(2) )
                if iSwap == max_swap and not reached_swap_for_i_sentence:
                    reached_max_swap += 1
                    reached_swap_for_i_sentence = True
                    print "reached max swap in %d out of %d sentences"%(reached_max_swap, iSentence)
                self.apply_transition(best,stack,buf,hoffset)
                if best[1] == SWAP:
                    iSwap += 1

            #keep in memory the information we need, not all the vectors
            oconll_sentence = [entry for entry in osentence if isinstance(entry, utils.ConllEntry)]
            oconll_sentence = oconll_sentence[1:] + [oconll_sentence[0]]
            for tok_o, tok in zip(oconll_sentence, conll_sentence):
                tok_o.pred_relation = tok.pred_relation
                tok_o.pred_parent_id = tok.pred_parent_id
                if self.recursive_composition:
                    tok_o.composed_rep = tok.composed_rep
            yield osentence

            dy.renew_cg()

        print "Total prediction time: %.2fs"%(time()-ts)
Example #5
    def Train(self, conll_path):
        mloss = 0.0
        errors = 0
        batch = 0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ltotal = 0
        ninf = -float('inf')

        hoffset = 1 if self.headFlag else 0

        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, True))
            random.shuffle(shuffledData)

            errs = []
            eeloss = 0.0

            self.Init()

            for iSentence, sentence in enumerate(shuffledData):
                # print progress info every 100 sentences
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', float(eerrors) / etotal, 'Labeled Errors:', float(lerrors) / etotal, 'Time', time.time() - start
                    # logger.debug('Processing sentence %s, Loss: %s, Errors: %s, Labeled Errors: %s, Time: %s', iSentence, eloss / etotal, float(eerrors) / etotal, float(lerrors) / etotal, time.time() - start)
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
                self.getWordEmbeddings(conll_sentence, True)
                # initialize the stack as empty
                stack = ParseForest([])
                # put the sentence into the buffer
                buf = ParseForest(conll_sentence)

                for root in conll_sentence:
                    # each word's LSTM input is self.nnvecs copies of its vector concatenated
                    root.lstms = [root.vec for _ in xrange(self.nnvecs)]

                hoffset = 1 if self.headFlag else 0

                while not (len(buf) == 1 and len(stack) == 0):
                    scores = self.__evaluate(stack, buf, True)
                    scores.append([(None, 3, ninf, None)])

                    # alpha: the rest of the stack
                    alpha = stack.roots[:-2] if len(stack) > 2 else []
                    # s1: second item from the top of the stack
                    s1 = [stack.roots[-2]] if len(stack) > 1 else []
                    # s0: item on top of the stack
                    s0 = [stack.roots[-1]] if len(stack) > 0 else []
                    # b: first item in the buffer
                    b = [buf.roots[0]] if len(buf) > 0 else []
                    # beta: the rest of the buffer
                    beta = buf.roots[1:] if len(buf) > 1 else []

                    left_cost = (
                        len([h
                             for h in s1 + beta if h.id == s0[0].parent_id]) +
                        len([d for d in b + beta if d.parent_id == s0[0].id])
                    ) if len(scores[0]) > 0 else 1
                    right_cost = (
                        len([h for h in b + beta if h.id == s0[0].parent_id]) +
                        len([d for d in b + beta if d.parent_id == s0[0].id])
                    ) if len(scores[1]) > 0 else 1
                    shift_cost = (
                        len([h
                             for h in s1 + alpha if h.id == b[0].parent_id]) +
                        len([
                            d
                            for d in s0 + s1 + alpha if d.parent_id == b[0].id
                        ])) if len(scores[2]) > 0 else 1
                    costs = (left_cost, right_cost, shift_cost, 1)

                    bestValid = max(
                        (s for s in chain(*scores) if costs[s[1]] == 0 and (
                            s[1] == 2 or s[0] == stack.roots[-1].relation)),
                        key=itemgetter(2))
                    bestWrong = max(
                        (s for s in chain(*scores) if costs[s[1]] != 0 or (
                            s[1] != 2 and s[0] != stack.roots[-1].relation)),
                        key=itemgetter(2))
                    best = bestValid if (
                        (not self.oracle) or
                        (bestValid[2] - bestWrong[2] > 1.0) or
                        (bestValid[2] > bestWrong[2]
                         and random.random() > 0.1)) else bestWrong

                    # SHIFT: no relation is produced
                    if best[1] == 2:
                        stack.roots.append(buf.roots[0])
                        del buf.roots[0]

                    # LEFT-ARC: the head is b0
                    elif best[1] == 0:
                        child = stack.roots.pop()
                        # the head is b0
                        parent = buf.roots[0]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 0
                        if self.rlMostFlag:
                            parent.lstms[bestOp +
                                         hoffset] = child.lstms[bestOp +
                                                                hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    # RIGHT-ARC: the head is s0
                    elif best[1] == 1:
                        child = stack.roots.pop()
                        # the head is s0
                        parent = stack.roots[-1]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 1
                        if self.rlMostFlag:
                            parent.lstms[bestOp +
                                         hoffset] = child.lstms[bestOp +
                                                                hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    if bestValid[2] < bestWrong[2] + 1.0:
                        # loss term
                        loss = bestWrong[3] - bestValid[3]
                        mloss += 1.0 + bestWrong[2] - bestValid[2]
                        eloss += 1.0 + bestWrong[2] - bestValid[2]
                        errs.append(loss)

                    if best[1] != 2 and (
                            child.pred_parent_id != child.parent_id
                            or child.pred_relation != child.relation):
                        # wrong head or relation: count a labeled error
                        lerrors += 1
                        if child.pred_parent_id != child.parent_id:
                            # wrong head: count an unlabeled error
                            errors += 1
                            eerrors += 1

                    etotal += 1

                if len(errs) > 50:  # or True:
                    #eerrs = ((esum(errs)) * (1.0/(float(len(errs)))))
                    eerrs = esum(errs)
                    scalar_loss = eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []

                    renew_cg()
                    self.Init()

        if len(errs) > 0:
            eerrs = (esum(errs))  # * (1.0/(float(len(errs))))
            eerrs.scalar_value()
            # backpropagate the loss to get gradients
            eerrs.backward()
            # update the parameters
            self.trainer.update()

            errs = []
            lerrs = []

            renew_cg()

        self.trainer.update()
        print "Loss: ", mloss / iSentence
Example #6
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP, False)):
                self.Init()

                # In Chinese, tokens with POS 'X' are non-lexical and must be filtered out or the program crashes; English has no such tokens.
                # The Chinese Penn Treebank train and dev sets have no such cases, but the test set contains a handful.
                conll_sentence = [
                    entry for entry in sentence if
                    (isinstance(entry, utils.ConllEntry) and entry.pos != 'X')
                ]

                conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
                self.getWordEmbeddings(conll_sentence, False)
                # the stack starts empty
                stack = ParseForest([])
                # the buffer starts with the whole sentence
                buf = ParseForest(conll_sentence)

                for root in conll_sentence:
                    root.lstms = [root.vec for _ in xrange(self.nnvecs)]

                hoffset = 1 if self.headFlag else 0

                # loop until all dependencies in the sentence have been found
                while not (len(buf) == 1 and len(stack) == 0):
                    scores = self.__evaluate(stack, buf, False)
                    best = max(chain(*scores), key=itemgetter(2))

                    if best[1] == 2:
                        stack.roots.append(buf.roots[0])
                        del buf.roots[0]

                    elif best[1] == 0:
                        child = stack.roots.pop()
                        parent = buf.roots[0]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 0
                        if self.rlMostFlag:
                            parent.lstms[bestOp +
                                         hoffset] = child.lstms[bestOp +
                                                                hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    elif best[1] == 1:
                        child = stack.roots.pop()
                        parent = stack.roots[-1]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 1
                        if self.rlMostFlag:
                            parent.lstms[bestOp +
                                         hoffset] = child.lstms[bestOp +
                                                                hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                renew_cg()
                yield sentence
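
The POS filter above is easy to check in isolation; a self-contained sketch (Tok stands in for utils.ConllEntry):

    from collections import namedtuple

    Tok = namedtuple('Tok', 'form pos')
    sentence = [Tok('*root*', 'ROOT-POS'), Tok('word', 'NN'), Tok('x', 'X')]
    conll_sentence = [e for e in sentence if e.pos != 'X']
    print [e.form for e in conll_sentence]  # ['*root*', 'word']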
Example #7
    def Train(self, conll_path, epoch):
        mloss = 0.0
        errors = 0
        batch = 0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ltotal = 0
        ninf = -float('inf')

        hoffset = 1 if self.model.headFlag else 0

        start = time.time()

        fout = open('loss_coco_0001_epoch_%d.log' % epoch, 'w')

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, True))

            random.shuffle(shuffledData)

            errs = []
            eeloss = 0.0

            self.model.Init()
            non_proj = 0

            for iSentence, sentence in enumerate(shuffledData):

                isProj = True
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (
                        float(eerrors)) / etotal, 'Labeled Errors:', (
                            float(lerrors) /
                            etotal), 'Time', time.time() - start
                    #print "check"
                    fout.write(str(eloss / etotal) + '\n')
                    start = time.time()
                    del eerrors, eloss, etotal, lerrors, ltotal
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]
                conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
                if iSentence != 0:
                    del sent_vec, lstms

                sent_vec = self.model.getWordEmbeddings(conll_sentence, True)
                lstms = []

                stack = ParseForest([])
                buf = ParseForest(conll_sentence)

                for i in range(len(sent_vec)):
                    buf.roots[i].lstms = i
                    lstms.append(
                        [sent_vec[i] for _ in xrange(self.model.nnvecs)])

                hoffset = 1 if self.model.headFlag else 0

                while not (len(buf) == 1 and len(stack) == 0):

                    scores = self.model.evaluate(stack, buf, True, lstms)
                    scores.append([(None, self.model.num_transitioins, ninf,
                                    None)])

                    alpha = stack.roots[:-2] if len(stack) > 2 else []
                    s1 = [stack.roots[-2]] if len(stack) > 1 else []
                    s0 = [stack.roots[-1]] if len(stack) > 0 else []
                    b = [buf.roots[0]] if len(buf) > 0 else []
                    beta = buf.roots[1:] if len(buf) > 1 else []

                    left_cost = (
                        len([h
                             for h in s1 + beta if h.id == s0[0].parent_id]) +
                        len([d for d in b + beta if d.parent_id == s0[0].id])
                    ) if len(scores[0]) > 0 else 1
                    right_cost = (
                        len([h for h in b + beta if h.id == s0[0].parent_id]) +
                        len([d for d in b + beta if d.parent_id == s0[0].id])
                    ) if len(scores[1]) > 0 else 1
                    shift_cost = (
                        len([h
                             for h in s1 + alpha if h.id == b[0].parent_id]) +
                        len([
                            d
                            for d in s0 + s1 + alpha if d.parent_id == b[0].id
                        ])) if len(scores[2]) > 0 else 1
                    reduce_cost = (len([
                        h for h in s1 + b + beta if h.id == s0[0].parent_id
                    ]) + len([d for d in b + beta if d.parent_id == s0[0].id])
                                   ) if len(scores[3]) > 0 else 1

                    #print "\nWord in Buff: "
                    #for word in b+beta:
                    #    print "buf parent id:  ", word.parent_id
                    #    print "word id:  ", word.id
                    #    print "word:  ", word.form

                    if len(stack) > 0:
                        #print "parent id:  ",  s0[0].parent_id
                        if s0[0].parent_id == -1 and reduce_cost == 0:
                            left_cost += 1
                            right_cost += 1
                            shift_cost += 1

                    costs = (left_cost, right_cost, shift_cost, reduce_cost, 1)

                    #print "CCCOST   ", costs

                    #costs = (left_cost, right_cost, shift_cost, reduce_cost, isObj_cost, isPred_cost, 1)

                    try:
                        bestValid = max(
                            (s for s in chain(*scores) if costs[s[1]] == 0 and
                             (s[1] == 2 or s[0] == stack.roots[-1].relation
                              or s[0] == None)),
                            key=itemgetter(2))
                    except:
                        print "length of stack: ", len(stack.roots)
                        #for roots in stack.roots:
                        #    print roots.parent
                        #    print roots.children
                        print "This is non projective"
                        exit()
                        non_proj += 1
                        isProj = False
                        break

                    bestWrong = max(
                        (s for s in chain(*scores) if costs[s[1]] != 0 or (
                            s[1] != 2 and s[0] != stack.roots[-1].relation)),
                        key=itemgetter(2))
                    #best = bestValid if ( (not self.model.oracle) or (bestValid[2] - bestWrong[2] > 1.0) or (bestValid[2] > bestWrong[2] and random.random() > 0.1) ) else bestWrong
                    best = bestValid
                    #print "No. %d sentences, best[1] = %d" % (iSentence, best[1])

                    if best[1] == 2:
                        stack.roots.append(buf.roots[0])
                        del buf.roots[0]

                    elif best[1] == 0:
                        child = stack.roots.pop()
                        parent = buf.roots[0]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 0
                        if self.model.rlMostFlag:
                            lstms[parent.lstms][bestOp + hoffset] = lstms[
                                child.lstms][bestOp + hoffset]
                        if self.model.rlFlag:
                            lstms[parent.lstms][bestOp + hoffset] = sent_vec[
                                child.lstms]

                    elif best[1] == 1:
                        child = stack.roots.pop()
                        parent = stack.roots[-1]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 1
                        if self.model.rlMostFlag:
                            lstms[parent.lstms][bestOp + hoffset] = lstms[
                                child.lstms][bestOp + hoffset]
                        if self.model.rlFlag:
                            lstms[parent.lstms][bestOp + hoffset] = sent_vec[
                                child.lstms]

                    elif best[1] == 3:
                        child = stack.roots.pop()
                        child.pred_parent_id = -1
                        child.pred_relation = '_'

                        bestOp = 3

                    if bestValid[2] < bestWrong[2] + 1.0:
                        loss = bestWrong[3] - bestValid[3]
                        mloss += 1.0 + bestWrong[2] - bestValid[2]
                        eloss += 1.0 + bestWrong[2] - bestValid[2]
                        errs.append(loss)

                        del loss

                    if best[1] != 2 and (
                            child.pred_parent_id != child.parent_id
                            or child.pred_relation != child.relation):
                        lerrors += 1
                        if child.pred_parent_id != child.parent_id:
                            errors += 1
                            eerrors += 1

                    etotal += 1
                    del scores

                #print "Finish %d sentences" %(iSentence)
                #print os.system('nvidia-smi')
                #exit()

                if len(errs) > 50:  # or True:
                    eerrs = torch.sum(cat(errs))
                    scalar_loss = get_data(eerrs).numpy()[0]
                    eerrs.backward()
                    self.trainer.step()
                    del eerrs
                    errs = []
                    lerrs = []

                    self.model.Init()

                self.trainer.zero_grad()

        if len(errs) > 0:
            eerrs = torch.sum(cat(errs))  # * (1.0/(float(len(errs))))
            get_data(eerrs).numpy()[0]
            eerrs.backward()
            self.trainer.step()

            del eerrs
            errs = []
            lerrs = []

        self.trainer.zero_grad()
        print "Loss: ", mloss / iSentence
Example #8
    def Train(self, shuffledData):
        mloss = 0.0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ninf = -float('inf')

        start = time.time()

        random.shuffle(shuffledData)
        print "Length of training data: ", len(shuffledData)

        errs = []

        self.Init()

        trainData = shuffledData
        if self.debug:
            trainData = shuffledData[:200]

        for iSentence, sentence in enumerate(trainData):
            if iSentence % 100 == 0 and iSentence != 0:
                loss_message = 'Processing sentence number: %d'%iSentence + \
                ' Loss: %.3f'%(eloss / etotal)+ \
                ' Errors: %.3f'%((float(eerrors)) / etotal)+\
                ' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\
                ' Time: %.2gs'%(time.time()-start)
                print loss_message
                start = time.time()
                eerrors = 0
                eloss = 0.0
                etotal = 0
                lerrors = 0

            conll_sentence = [
                entry for entry in sentence
                if isinstance(entry, utils.ConllEntry)
            ]
            conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
            self.getWordEmbeddings(conll_sentence, True)
            stack = ParseForest([])
            buf = ParseForest(conll_sentence)
            hoffset = 1 if self.headFlag else 0

            for root in conll_sentence:
                root.lstms = [root.vec] if self.headFlag else []
                root.lstms += [
                    self.paddingVec for _ in range(self.nnvecs - hoffset)
                ]
                root.relation = root.relation if root.relation in self.rels else 'runk'

            while not (len(buf) == 1 and len(stack) == 0):
                scores = self.__evaluate(stack, buf, True)

                #to ensure that we have at least one wrong operation
                scores.append([(None, 4, ninf, None)])

                stack_ids = [sitem.id for sitem in stack.roots]

                s1 = [stack.roots[-2]] if len(stack) > 1 else []
                s0 = [stack.roots[-1]] if len(stack) > 0 else []
                b = [buf.roots[0]] if len(buf) > 0 else []
                beta = buf.roots[1:] if len(buf) > 1 else []

                costs, shift_case = self.calculate_cost(
                    scores, s0, s1, b, beta, stack_ids)

                bestValid = list(
                    (s for s in chain(*scores) if costs[s[1]] == 0 and (
                        s[1] == 2 or s[1] == 3 or s[0] == s0[0].relation)))
                if len(bestValid) < 1:
                    print "===============dropping a sentence==============="
                    break

                bestValid = max(bestValid, key=itemgetter(2))
                bestWrong = max(
                    (s for s in chain(*scores) if costs[s[1]] != 0 or (
                        s[1] != 2 and s[1] != 3 and s[0] != s0[0].relation)),
                    key=itemgetter(2))

                #force swap
                if costs[3] == 0:
                    best = bestValid
                else:
                    #select a transition to follow
                    # + aggressive exploration
                    #1: might want to experiment with that parameter
                    if bestWrong[1] == 3:
                        best = bestValid
                    else:
                        best = bestValid if (
                            (not self.oracle) or
                            (bestValid[2] - bestWrong[2] > 1.0) or
                            (bestValid[2] > bestWrong[2]
                             and random.random() > 0.1)) else bestWrong

                if best[1] == 2:
                    #SHIFT
                    if shift_case == 2:
                        if b[0].parent_entry.id in stack_ids[:-1] and b[
                                0].id in b[0].parent_entry.rdeps:
                            b[0].parent_entry.rdeps.remove(b[0].id)
                        blocked_deps = [
                            d for d in b[0].rdeps if d in stack_ids
                        ]
                        for d in blocked_deps:
                            b[0].rdeps.remove(d)
                    stack.roots.append(buf.roots[0])
                    del buf.roots[0]

                elif best[1] == 3:
                    #SWAP
                    child = stack.roots.pop()
                    buf.roots.insert(1, child)

                elif best[1] == 0:
                    #LEFT-ARC
                    s0[0].rdeps = []
                    if s0[0].id in s0[0].parent_entry.rdeps:
                        s0[0].parent_entry.rdeps.remove(s0[0].id)
                    child = stack.roots.pop()
                    parent = buf.roots[0]

                    child.pred_parent_id = parent.id
                    child.pred_relation = best[0]

                elif best[1] == 1:
                    #RIGHT-ARC
                    s0[0].rdeps = []
                    if s0[0].id in s0[0].parent_entry.rdeps:
                        s0[0].parent_entry.rdeps.remove(s0[0].id)
                    child = stack.roots.pop()
                    parent = stack.roots[-1]

                    child.pred_parent_id = parent.id
                    child.pred_relation = best[0]

                #update the representation of head for attaching transitions
                if best[1] == 0 or best[1] == 1:
                    #linear order
                    if self.rlMostFlag:
                        parent.lstms[best[1] + hoffset] = child.lstms[best[1] +
                                                                      hoffset]
                    #actual children
                    if self.rlFlag:
                        parent.lstms[best[1] + hoffset] = child.vec

                if bestValid[2] < bestWrong[2] + 1.0:
                    loss = bestWrong[3] - bestValid[3]
                    mloss += 1.0 + bestWrong[2] - bestValid[2]
                    eloss += 1.0 + bestWrong[2] - bestValid[2]
                    errs.append(loss)

                #labeled errors
                if best[1] != 2 and best[1] != 3 and (
                        child.pred_parent_id != child.parent_id
                        or child.pred_relation != child.relation):
                    lerrors += 1
                    #attachment error
                    if child.pred_parent_id != child.parent_id:
                        eerrors += 1

                if best[1] == 0 or best[1] == 2:
                    etotal += 1

            #footnote 8 in Eli's original paper
            if len(errs) > 50:  # or True:
                eerrs = dy.esum(errs)
                scalar_loss = eerrs.scalar_value()  #forward
                eerrs.backward()
                self.trainer.update()
                errs = []
                lerrs = []

                dy.renew_cg()
                self.Init()

        if len(errs) > 0:
            eerrs = (dy.esum(errs))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []

            dy.renew_cg()

        self.trainer.update()
        print "Loss: ", mloss / iSentence
Example #9
    def forward(self, sentences, errs):
        tmp = time.time()
        self.getWordEmbeddings(sentences, True)
        self.ebd += time.time() - tmp

        dloss, deerrors, detotal = 0, 0, 0
        for sentence in sentences:
            stack = ParseForest([])
            buf = ParseForest(sentence)
            for root in sentence:
                root.lstms = [root.vec for _ in range(self.nnvecs)]
            hoffset = 1 if self.headFlag else 0
            while not (len(buf) == 1 and len(stack) == 0):
                tmp = time.time()
                scores = self.__evaluate(stack, buf, True)
                self.evl += time.time() - tmp
                scores.append([(None, 3, -np.inf, None)])
                alpha = stack.roots[:-2] if len(stack) > 2 else []
                s1 = [stack.roots[-2]] if len(stack) > 1 else []
                s0 = [stack.roots[-1]] if len(stack) > 0 else []
                b = [buf.roots[0]] if len(buf) > 0 else []
                beta = buf.roots[1:] if len(buf) > 1 else []
                left_cost  = (len([h for h in s1 + beta if h.id == s0[0].parent_id]) +
                              len([d for d in b + beta if d.parent_id == s0[0].id])) if len(scores[0]) > 0 else 1
                right_cost = (len([h for h in b + beta if h.id == s0[0].parent_id]) +
                              len([d for d in b + beta if d.parent_id == s0[0].id])) if len(scores[1]) > 0 else 1
                shift_cost = (len([h for h in s1 + alpha if h.id == b[0].parent_id]) +
                              len([d for d in s0 + s1 + alpha if d.parent_id == b[0].id])) if len(scores[2]) > 0 else 1
                costs = (left_cost, right_cost, shift_cost, 1)
                bestValid = max((s for s in chain(*scores) if costs[s[1]] == 0 and ( s[1] == 2 or  s[0] == stack.roots[-1].relation)), key=itemgetter(2))
                bestWrong = max((s for s in chain(*scores) if costs[s[1]] != 0 or  ( s[1] != 2 and s[0] != stack.roots[-1].relation)), key=itemgetter(2))
                best = bestValid if ((not self.oracle) or (bestValid[2] - bestWrong[2] > 1.0) or (bestValid[2] > bestWrong[2] and random.random() > 0.1)) else bestWrong
                if best[1] == 2:
                    stack.roots.append(buf.roots[0])
                    del buf.roots[0]
                elif best[1] == 0:
                    child = stack.roots.pop()
                    parent = buf.roots[0]
                    child.pred_parent_id = parent.id
                    child.pred_relation = best[0]
                    bestOp = 0
                    if self.rlMostFlag:
                        parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                    if self.rlFlag:
                        parent.lstms[bestOp + hoffset] = child.vec
                elif best[1] == 1:
                    child = stack.roots.pop()
                    parent = stack.roots[-1]
                    child.pred_parent_id = parent.id
                    child.pred_relation = best[0]
                    bestOp = 1
                    if self.rlMostFlag:
                        parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                    if self.rlFlag:
                        parent.lstms[bestOp + hoffset] = child.vec
                if bestValid[2] < bestWrong[2] + 1.0:
                    loss = bestWrong[3] - bestValid[3]
                    dloss += 1.0 + bestWrong[2] - bestValid[2]
                    errs.append(loss)
                if best[1] != 2 and (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation):
                    if child.pred_parent_id != child.parent_id:
                        deerrors += 1
                detotal += 1
        return dloss, deerrors, detotal
Example #10
    def Train(self, conll_path, options):
        mloss = 0.0
        errors = 0
        batch = 0
        eloss = 0.0
        #eerrors = 0
        #lerrors = 0
        etotal = 0
        #ltotal = 0
        max_quotient = float("-inf")
        min_quotient = float("inf")
        NUM_SAMPLES = options.num_samples  #default 10

        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, True))
            random.shuffle(shuffledData)

            errs = []
            #eeloss = 0.0
            batch_errs = []

            self.Init()

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Time', time.time() - start
                    #print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal) , 'Time', time.time()-start
                    start = time.time()
                    #eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    #lerrors = 0
                    #ltotal = 0
                sample_errs = []
                sample_quotients = []
                #print('Sentence: {}'.format(sentence))
                DEBUG = random.random() < 0.0001
                if DEBUG:
                    print("Train sentence: {}".format(
                        [e.form for e in sentence]))
                for _ in xrange(NUM_SAMPLES):

                    forest = ParseForest(sentence)
                    self.getWordEmbeddings(forest, True)

                    for root in forest.roots:
                        root.lstms = [
                            self.builders[0].initial_state().add_input(
                                root.vec),
                            self.builders[1].initial_state().add_input(
                                root.vec)
                        ]

                    unassigned = {
                        entry.id: sum([
                            1 for pentry in sentence
                            if pentry.parent_id == entry.id
                        ])
                        for entry in sentence
                    }

                    #loss = 0
                    log_q_total = 0.0
                    log_p_total = 0.0
                    while len(forest.roots) > 1:
                        self.__evaluate(
                            forest, True)  #NOTE(prkriley): this updates scores
                        roots = forest.roots

                        rootsIds = set([root.id for root in roots])

                        def _isValid(i):
                            return (unassigned[roots[i].id] == 0) and (
                                (i > 0
                                 and roots[i].parent_id == roots[i - 1].id) or
                                (i < len(roots) - 1
                                 and roots[i].parent_id == roots[i + 1].id))

                        valid_zs = [
                            j for j in xrange(1, len(roots)) if _isValid(j)
                        ]

                        z_scores = concatenate([r.zexpr for r in roots[1:]])
                        valid_z_scores = concatenate(
                            [roots[j].zexpr for j in valid_zs])
                        p_zs = softmax(z_scores)
                        #print("P(z): {}".format(p_zs.npvalue()))
                        q_temperature = 16.0
                        q_zs = softmax(valid_z_scores * 1.0 / q_temperature)
                        q_zs_numpy = q_zs.npvalue()
                        q_zs_numpy /= np.sum(q_zs_numpy)
                        if DEBUG:
                            print("Valid z indices: {}".format(valid_zs))
                            print("Q(z): {}".format(q_zs_numpy))

                        valid_i = np.random.choice(len(valid_zs), p=q_zs_numpy)
                        q_z = pick(q_zs, valid_i)
                        i = valid_zs[valid_i]
                        log_q_total += log(q_z).scalar_value()
                        p_z = pick(p_zs, i - 1)
                        log_p_total += log(p_z).scalar_value()

                        irel = list(self.irels).index(roots[i].relation)
                        op = 0 if roots[i].parent_id == roots[i - 1].id else 1
                        #TODO(prkriley): verify correctness of this index math
                        presoftmax_p_y = [
                            val for tup in roots[i].exprs for val in tup
                        ]
                        if i < len(roots) - 1:
                            neglog_p_y = pickneglogsoftmax(
                                concatenate(presoftmax_p_y), irel * 2 + op)
                        else:
                            assert op == 0
                            presoftmax_p_y = presoftmax_p_y[::2]
                            neglog_p_y = pickneglogsoftmax(
                                concatenate(presoftmax_p_y), irel)

                        neglog_p_z = pickneglogsoftmax(z_scores, i - 1)
                        errs.append(neglog_p_y + neglog_p_z)
                        log_p_total -= neglog_p_y.scalar_value()
                        mloss += neglog_p_y.scalar_value()
                        mloss += neglog_p_z.scalar_value()

                        etotal += 1

                        selectedChild = i
                        selectedIndex = i
                        selectedOp = op
                        selectedParent = i + [-1, 1][op]
                        selectedIRel = irel

                        for j in xrange(
                                max(0, selectedIndex - self.k - 2),
                                min(len(forest.roots),
                                    selectedIndex + self.k + 2)):
                            roots[j].scores = None

                        #NOTE(prkriley): counts number of real children that are still gettable
                        unassigned[roots[selectedChild].parent_id] -= 1

                        #NOTE(prkriley): I think lstms[0] is the right one, [1] is the left...
                        roots[selectedParent].lstms[selectedOp] = roots[
                            selectedParent].lstms[selectedOp].add_input(
                                self.activation(self.lstm2lstm * noise(
                                    concatenate([
                                        roots[selectedChild].lstms[0].output(),
                                        lookup(self.model["rels-lookup"],
                                               selectedIRel),
                                        roots[selectedChild].lstms[1].output()
                                    ]), 0.0) + self.lstm2lstmbias))

                        forest.Attach(selectedParent, selectedChild)

                    #END OF SINGLE SAMPLE
                    #TODO(prkriley): finalize loss, do update, etc
                    eerrs = (
                        (esum(errs)) * (1.0 / (float(len(errs))))
                    )  #TODO(prkriley): consider removing this division
                    #eerrs = esum(errs)
                    #TODO(prkriley): scale by p/q which is exp(logp-logq)
                    #print("logp: {}; logq: {}".format(log_p_total, log_q_total))
                    pq_quotient = np.exp(log_p_total - log_q_total)
                    scaled_pq_quotient = pq_quotient * 1e3
                    #scaled_pq_quotient = min(scaled_pq_quotient, 1.5e-5)
                    #scaled_pq_quotient = max(scaled_pq_quotient, 1.5e-8)
                    #eerrs *= scaled_pq_quotient
                    #print("P/Q: {}".format(pq_quotient))
                    max_quotient = max(scaled_pq_quotient, max_quotient)
                    min_quotient = min(scaled_pq_quotient, min_quotient)
                    eloss += eerrs.scalar_value()
                    sample_errs.append(eerrs)
                    sample_quotients.append(scaled_pq_quotient)
                    errs = []

                    DEBUG = False
                #END OF SAMPLING
                #upper_clip = 5e-6
                #lower_clip = 2e-8

                #scale = 1.0
                #if max_quotient < lower_clip:
                #    scale = lower_clip / max_quotient
                ###
                #SCALING QUOTIENTS

                #max_sample_quotient = max(sample_quotients)
                #if max_sample_quotient > upper_clip:
                #    scale = upper_clip / max_sample_quotient
                sum_quotients = sum(sample_quotients)
                PQ_NORMALIZE_SUM = options.pq_norm
                scale = PQ_NORMALIZE_SUM / sum_quotients
                sample_quotients = [e * scale for e in sample_quotients]

                #for q in sample_quotients:
                #    assert q <= upper_clip * 1.1, "Large quotient: {}".format(q)
                ###
                if options.use_pq:
                    sample_errs = [
                        e * q for (e, q) in zip(sample_errs, sample_quotients)
                    ]

                final_error = esum(sample_errs)
                if not options.use_pq:
                    assert len(sample_errs) == NUM_SAMPLES
                    final_error *= (1.0 / (float(len(sample_errs))))

                #TODO(prkriley): put final_error somewhere and update once we have N of them
                batch_errs.append(final_error)
                if len(batch_errs) >= options.batch_size:
                    total_error = esum(batch_errs)
                    total_error.backward()
                    self.trainer.update()
                    batch_errs = []

                    renew_cg()
                    self.Init()

                #final_error.backward()
                #self.trainer.update()

                #renew_cg()
                #self.Init()
            #END OF EPOCH
        #FILE CLOSE

        if options.use_pq:
            print("Max Quotient: {}; Min Quotient: {}".format(
                max_quotient, min_quotient))
        #self.trainer.update_epoch() #TODO(prkriley): verify that AdamTrainer handles everything this did before
        print "Loss: ", mloss / (iSentence * NUM_SAMPLES)
Example #11
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP, False)):
                print("Sentence: {}".format([e.form for e in sentence]))
                self.Init()
                forest = ParseForest(sentence)
                self.getWordEmbeddings(forest, False)

                for root in forest.roots:
                    root.lstms = [
                        self.builders[0].initial_state().add_input(root.vec),
                        self.builders[1].initial_state().add_input(root.vec)
                    ]

                ###
                #NOTE(prkriley): looking at truth here, but ONLY for reporting
                unassigned = {
                    entry.id: sum([
                        1 for pentry in sentence
                        if pentry.parent_id == entry.id
                    ])
                    for entry in sentence
                }
                ###
                while len(forest.roots) > 1:

                    self.__evaluate(forest, False)
                    #bestParent, bestChild, bestScore = None, None, float("-inf")
                    #bestIndex, bestOp = None, None
                    roots = forest.roots

                    ###
                    z_scores = concatenate([r.zexpr for r in roots[1:]])
                    p_z = softmax(z_scores).npvalue()
                    bestIndex = np.argmax(p_z) + 1
                    print('P(z): {}'.format(p_z))
                    print('Best index: {} ({})'.format(bestIndex,
                                                       roots[bestIndex].form))
                    valid_exprs = [
                        val for tup in roots[bestIndex].exprs for val in tup
                    ]
                    if bestIndex == len(roots) - 1:
                        valid_exprs = valid_exprs[::2]
                    p_y = softmax(concatenate(valid_exprs))
                    max_y_index = np.argmax(
                        p_y.npvalue()
                    )  #NOTE(prkriley): don't need to actually do softmax just to pick max

                    if bestIndex < len(roots) - 1:
                        bestOp = max_y_index % 2
                        bestIRelation = (max_y_index - bestOp) // 2
                    else:
                        bestOp = 0
                        bestIRelation = max_y_index
                    #TODO(prkriley): make sure op is valid
                    bestChild = bestIndex
                    bestParent = bestIndex + [-1, 1][bestOp]
                    bestRelation = self.irels[bestIRelation]

                    ###
                    ###
                    #NOTE(prkriley): again, using truth but only for reporting
                    def _isValid(i):
                        return (unassigned[roots[i].id] == 0) and (
                            (i > 0 and roots[i].parent_id == roots[i - 1].id)
                            or (i < len(roots) - 1
                                and roots[i].parent_id == roots[i + 1].id))

                    valid_zs = [
                        j for j in xrange(1, len(roots)) if _isValid(j)
                    ]
                    valid_probs = [p_z[j - 1] for j in valid_zs]
                    invalid_probs = [
                        p_z[j - 1] for j in xrange(1, len(roots))
                        if j not in valid_zs
                    ]
                    avg_valid_prob = sum(valid_probs) * 1.0 / len(
                        valid_probs) if valid_probs else -1
                    avg_invalid_prob = sum(invalid_probs) * 1.0 / len(
                        invalid_probs) if invalid_probs else -1
                    print("Avg valid prob: {}/{} = {}".format(
                        sum(valid_probs), len(valid_probs), avg_valid_prob))
                    print("Avg invalid prob: {}/{} = {}".format(
                        sum(invalid_probs), len(invalid_probs),
                        avg_invalid_prob))
                    ###

                    #for j in xrange(max(0, bestIndex - self.k - 1), min(len(forest.roots), bestIndex + self.k + 2)):
                    for j in xrange(
                            max(0, bestIndex - self.k - 2),
                            min(len(forest.roots), bestIndex + self.k + 2)):
                        roots[j].scores = None

                    roots[bestChild].pred_parent_id = forest.roots[
                        bestParent].id
                    roots[bestChild].pred_relation = bestRelation

                    roots[bestParent].lstms[bestOp] = roots[bestParent].lstms[
                        bestOp].add_input((self.activation(
                            self.lstm2lstmbias + self.lstm2lstm * concatenate([
                                roots[bestChild].lstms[0].output(),
                                lookup(self.model["rels-lookup"], bestIRelation
                                       ), roots[bestChild].lstms[1].output()
                            ]))))

                    unassigned[roots[bestChild].parent_id] -= 1
                    forest.Attach(bestParent, bestChild)

                renew_cg()
                yield sentence
Beispiel #12
0
    def Predict(self, data, prefix=None):

        reached_max_swap = 0
        get_vectors = False

        if prefix:
            pref_idx = 0
            if isinstance(prefix, list):
                pf = prefix[pref_idx]
            else:
                pf = prefix

            fcemb = codecs.open(pf + '-char-emb.vec', 'w', encoding='utf-8')
            fwemb = codecs.open(pf + '-word-emb.vec', 'w', encoding='utf-8')
            fenc = codecs.open(pf + '-encoder.vec', 'w', encoding='utf-8')
            get_vectors = True
            lang_name = ''

        for iSentence, osentence in enumerate(data, 1):
            if isinstance(prefix, list):
                if iSentence == 1:
                    lang_name = osentence[0].language_id
                    print 'Extract feature:', pf, lang_name
                else:
                    if lang_name != osentence[0].language_id:
                        fcemb.close()
                        fwemb.close()
                        fenc.close()
                        pref_idx += 1
                        pf = prefix[pref_idx]
                        fcemb = codecs.open(pf + '-char-emb.vec',
                                            'w',
                                            encoding='utf-8')
                        fwemb = codecs.open(pf + '-word-emb.vec',
                                            'w',
                                            encoding='utf-8')
                        fenc = codecs.open(pf + '-encoder.vec',
                                           'w',
                                           encoding='utf-8')
                        lang_name = osentence[0].language_id
                        print 'Extract feature:', pf, lang_name

            sentence = deepcopy(osentence)
            reached_swap_for_i_sentence = False
            max_swap = 2 * len(sentence)
            iSwap = 0

            self.feature_extractor.Init()
            conll_sentence = [
                entry for entry in sentence
                if isinstance(entry, utils.ConllEntry)
            ]
            conll_sentence = conll_sentence[1:] + [conll_sentence[0]]

            data_vec = self.feature_extractor.getWordEmbeddings(
                conll_sentence, False, get_vectors=get_vectors)

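            # When extracting features, dump each token's char-embedding,
            # word-embedding, and encoder vectors (fields 4-6 of data_vec)
            # as comma-separated rows.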
            if get_vectors:
                for dat in data_vec:
                    # flabel.write(str(dat[0]) + '\t' + dat[1] + '\t' + dat[2] + '\t' + dat[3] + '\n')
                    fcemb.write(','.join([str(x) for x in dat[4]]) + '\n')
                    fwemb.write(','.join([str(x) for x in dat[5]]) + '\n')
                    fenc.write(','.join([str(x) for x in dat[6]]) + '\n')

            stack = ParseForest([])
            buf = ParseForest(conll_sentence)

            hoffset = 1 if self.headFlag else 0

            lang = conll_sentence[1].language_id
            for root in conll_sentence:
                root.lstms = [root.vec] if self.headFlag else []
                if not self.feature_extractor.multiling or self.feature_extractor.shareWordLookup:
                    root.lstms += [
                        self.feature_extractor.paddingVec
                        for _ in range(self.nnvecs - hoffset)
                    ]
                else:
                    root.lstms += [
                        self.feature_extractor.paddingVecs[lang]
                        for _ in range(self.nnvecs - hoffset)
                    ]

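            # Decode greedily; cap SWAP transitions at 2 * len(sentence) to
            # guarantee termination (once the cap is hit, SWAP is excluded
            # from the candidate scores).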
            while not (len(buf) == 1 and len(stack) == 0):
                scores = self.__evaluate(stack, buf, False)
                best = max(
                    chain(*(scores if iSwap < max_swap else scores[:3])),
                    key=itemgetter(2))
                if iSwap == max_swap and not reached_swap_for_i_sentence:
                    reached_max_swap += 1
                    reached_swap_for_i_sentence = True
                    print "reached max swap in %d out of %d sentences" % (
                        reached_max_swap, iSentence)
                self.apply_transition(best, stack, buf, hoffset)
                if best[1] == 3:
                    iSwap += 1

            dy.renew_cg()

            #keep in memory the information we need, not all the vectors
            oconll_sentence = [
                entry for entry in osentence
                if isinstance(entry, utils.ConllEntry)
            ]
            oconll_sentence = oconll_sentence[1:] + [oconll_sentence[0]]
            for tok_o, tok in zip(oconll_sentence, conll_sentence):
                tok_o.pred_relation = tok.pred_relation
                tok_o.pred_parent_id = tok.pred_parent_id
            yield osentence

        if prefix:
            fcemb.close()
            fwemb.close()
            fenc.close()
Beispiel #13
0
    def Train(self, conll_path, epoch):
        mloss = 0.0
        errors = 0
        batch = 0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ltotal = 0
        ninf = -float('inf')

        hoffset = 1 if self.headFlag else 0

        start1 = start = time.time()
        onlyNonProjectives = True
        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, onlyNonProjectives))
            random.shuffle(shuffledData)

            errs = []
            eeloss = 0.0

            self.Init()

            numOfSent = len(shuffledData)
            displayFreq = 500
            if numOfSent < 2000: displayFreq = 200
            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % displayFreq == 0 and iSentence != 0:
                    timeSpent = time.time() - start
                    totalTimeSpent = time.time() - start1
                    timeToGo = totalTimeSpent * (numOfSent - iSentence) / iSentence
                    print 'Epoch: %2d sentence number: %6d/%d Loss: %.5f Errors: %.5f Labeled Errors: %.5f Time: %.1f s, total: %.1f s ETA: %.1f s' \
                        % (epoch + 1,
                           iSentence,
                           numOfSent,
                           (eloss / etotal),
                           (float(eerrors) / etotal),
                           (float(lerrors) / etotal),
                           timeSpent, totalTimeSpent,
                           timeToGo)
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                if self.use_root:
                    # keep the "*root*" node added by read_conll (may allow multiple roots)
                    sentence = sentence[1:] + [sentence[0]]
                else:
                    sentence = sentence[1:]

                self.getWordEmbeddings(sentence, True)
                stack = ParseForest([])
                buf = ParseForest(sentence)

                for root in sentence:
                    root.lstms = [root.vec for _ in xrange(self.nnvecs)]

                hoffset = 1 if self.headFlag else 0

                while len(buf) > 0 or len(stack) > 1:
                    scores = self.__evaluate(stack, buf, True)
                    scores.append([(None, 3, ninf, None)])

                    alpha = stack.roots[:-2] if len(stack) > 2 else []
                    s1 = [stack.roots[-2]] if len(stack) > 1 else []
                    s0 = [stack.roots[-1]] if len(stack) > 0 else []
                    b = [buf.roots[0]] if len(buf) > 0 else []
                    beta = buf.roots[1:] if len(buf) > 1 else []

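                    # Dynamic-oracle costs: for each transition, count the
                    # gold arcs it would make unreachable (0 means the
                    # transition is optimal).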
                    left_cost  = ( len([h for h in s1 + beta if h.id == s0[0].parent_id]) + 
                                   len([d for d in b + beta if d.parent_id == s0[0].id]) )  if len(scores[0]) > 0 else 1

                    right_cost = ( len([h for h in b + beta if h.id == s0[0].parent_id]) +
                                   len([d for d in b + beta if d.parent_id == s0[0].id]) )  if len(scores[1]) > 0 else 1

                    shift_cost = ( len([h for h in s1 + alpha if h.id == b[0].parent_id]) +
                                   len([d for d in s0 + s1 + alpha if d.parent_id == b[0].id]) )  if len(scores[2]) > 0 else 1

                    costs = (left_cost, right_cost, shift_cost, 1)
                    bestOK = True
                    try:
                        bestValid = max((s for s in chain(*scores)
                                         if costs[s[1]] == 0 and (s[1] == 2 or s[0] == stack.roots[-1].relation)),
                                        key=itemgetter(2))
                        #print "best", bestValid
                    except ValueError:
                        bestOK = False

                    try:
                        bestWrong = max((s for s in chain(*scores)
                                         if costs[s[1]] != 0 or (s[1] != 2 and s[0] != stack.roots[-1].relation)),
                                        key=itemgetter(2))
                    except ValueError:
                        bestOK = False

                    # bestValid or bestWrong may fail when chain(*scores) yields an empty list;
                    # in this (rare) case we keep the previous best.
                    # This will still crash if the very first word has an empty list.
                    if bestOK:
                        best = bestValid if ((not self.oracle) or (bestValid[2] - bestWrong[2] > 1.0)
                                             or (bestValid[2] > bestWrong[2] and random.random() > 0.1)) else bestWrong

                    if best[1] == 2:
                        # we learned a SHIFT
                        stack.roots.append(buf.roots[0])
                        del buf.roots[0]

                    elif best[1] == 0:
                        # we learned a LEFT-ARC
                        child = stack.roots.pop()
                        parent = buf.roots[0]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 0
                        if self.rlMostFlag:
                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    elif best[1] == 1:
                        # we learned a RIGHT-ARC
                        child = stack.roots.pop()
                        parent = stack.roots[-1]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 1
                        if self.rlMostFlag:
                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    if bestOK and bestValid[2] < bestWrong[2] + 1.0:
                        loss = bestWrong[3] - bestValid[3]
                        mloss += 1.0 + bestWrong[2] - bestValid[2]
                        eloss += 1.0 + bestWrong[2] - bestValid[2]
                        errs.append(loss)

                    if best[1] != 2 and (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation):
                        lerrors += 1
                        if child.pred_parent_id != child.parent_id:
                            errors += 1
                            eerrors += 1

                    etotal += 1

                if len(errs) > 50: # or True:
                    #print "too many errors"
                    #eerrs = ((esum(errs)) * (1.0/(float(len(errs)))))
                    eerrs = esum(errs)
                    scalar_loss = eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []

                    renew_cg()
                    self.Init()

        if len(errs) > 0:
            eerrs = (esum(errs)) # * (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []

            renew_cg()

        self.trainer.update_epoch()
        print "Loss: %.4f time spent in epoch %.1f min" % (mloss/iSentence, (time.time()-start1)/60)
Beispiel #14
0
    def Predict(self, conll_path, is_string=False):
        conllFP = None
        #OLD=False
        #self.use_root=False
        if is_string:
            conllFP = StringIO.StringIO(conll_path)
        else:
            conllFP = open(conll_path, 'r')
        #with open(conll_path, 'r') as conllFP:

        if conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP, False)):
                self.Init()

                if self.use_root:
                    # keep the "*root*" node (may allow multiple roots)
                    sentence = sentence[1:] + [sentence[0]]
                else:
                    sentence = sentence[1:]
                #print "aaaa ", sentence[0], type(sentence)
                self.getWordEmbeddings(sentence, False)
                stack = ParseForest([])
                buf = ParseForest(sentence)

                for root in sentence:
                    root.lstms = [root.vec for _ in xrange(self.nnvecs)]

                hoffset = 1 if self.headFlag else 0

                cttrans = 0
                #print "\n====="
                while len(buf) > 0 or len(stack) > 1:
                    scores = self.__evaluate(stack, buf, False)
                    best = max(chain(*scores), key=itemgetter(2))

                    #print "\nBUFFER: ", buf
                    #print "STACK:  ", stack
                    #print scores

                    cttrans += 1
                    # transitions
                    if best[1] == 2:
                        # SHIFT
                        stack.roots.append(buf.roots[0])
                        del buf.roots[0]
                        #print cttrans, "SHIFT"

                    elif best[1] == 0:
                        # LEFT-ARC
                        #print cttrans, "LEFT"
                        child = stack.roots.pop()
                        parent = buf.roots[0]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        #print child, child.form, child.pred_parent_id

                        bestOp = 0
                        if self.rlMostFlag:
                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    elif best[1] == 1:
                        # RIGHT-ARC
                        #print cttrans, "RIGHT"
                        child = stack.roots.pop()
                        parent = stack.roots[-1]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 1
                        if self.rlMostFlag:
                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                renew_cg()
                if self.use_root:
                    yield [sentence[-1]] + sentence[:-1]
                else:
                    # write_conll cuts off the first word, so something must be put here
                    yield [sentence[-1]] + sentence
Beispiel #15
0
    def Train(self, conll_path):
        mloss = 0.0
        errors = 0
        batch = 0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ltotal = 0

        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, True))
            random.shuffle(shuffledData)

            errs = []
            eeloss = 0.0

            self.Init()

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (
                        float(eerrors)) / etotal, 'Labeled Errors:', (
                            float(lerrors) /
                            etotal), 'Time', time.time() - start
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                forest = ParseForest(sentence)
                self.getWordEmbeddings(forest, True)

                for root in forest.roots:
                    root.lstms = [
                        self.builders[0].initial_state().add_input(root.vec),
                        self.builders[1].initial_state().add_input(root.vec)
                    ]

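                # unassigned[id]: number of gold dependents of the entry with
                # that id that have not yet been attached.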
                unassigned = {
                    entry.id: sum([
                        1 for pentry in sentence
                        if pentry.parent_id == entry.id
                    ])
                    for entry in sentence
                }

                while len(forest.roots) > 1:
                    self.__evaluate(forest, True)
                    bestValidOp, bestValidScore = None, float("-inf")
                    bestWrongOp, bestWrongScore = None, float("-inf")

                    bestValidParent, bestValidChild = None, None
                    bestValidIndex, bestWrongIndex = None, None
                    roots = forest.roots

                    rootsIds = set([root.id for root in roots])

                    for i in xrange(len(forest.roots) - 1):
                        for irel, rel in enumerate(self.irels):
                            for op in xrange(2):
                                child = i + (1 - op)
                                parent = i + op

                                oracleCost = unassigned[roots[child].id] + (
                                    0 if roots[child].parent_id not in rootsIds
                                    or roots[child].parent_id
                                    == roots[parent].id else 1)

                                if oracleCost == 0 and (
                                        roots[child].parent_id !=
                                        roots[parent].id
                                        or roots[child].relation == rel):
                                    if bestValidScore < forest.roots[i].scores[
                                            irel][op]:
                                        bestValidScore = forest.roots[
                                            i].scores[irel][op]
                                        bestValidOp = op
                                        bestValidParent, bestValidChild = parent, child
                                        bestValidIndex = i
                                        bestValidIRel, bestValidRel = irel, rel
                                        bestValidExpr = roots[
                                            bestValidIndex].exprs[
                                                bestValidIRel][bestValidOp]
                                elif bestWrongScore < forest.roots[i].scores[
                                        irel][op]:
                                    bestWrongScore = forest.roots[i].scores[
                                        irel][op]
                                    bestWrongParent, bestWrongChild = parent, child
                                    bestWrongOp = op
                                    bestWrongIndex = i
                                    bestWrongIRel, bestWrongRel = irel, rel
                                    bestWrongExpr = roots[
                                        bestWrongIndex].exprs[bestWrongIRel][
                                            bestWrongOp]

                    if bestValidScore < bestWrongScore + 1.0:
                        loss = bestWrongExpr - bestValidExpr
                        mloss += 1.0 + bestWrongScore - bestValidScore
                        eloss += 1.0 + bestWrongScore - bestValidScore
                        errs.append(loss)

                    if not self.oracle or bestValidScore - bestWrongScore > 1.0 or (
                            bestValidScore > bestWrongScore
                            and random.random() > 0.1):
                        selectedOp = bestValidOp
                        selectedParent = bestValidParent
                        selectedChild = bestValidChild
                        selectedIndex = bestValidIndex
                        selectedIRel, selectedRel = bestValidIRel, bestValidRel
                    else:
                        selectedOp = bestWrongOp
                        selectedParent = bestWrongParent
                        selectedChild = bestWrongChild
                        selectedIndex = bestWrongIndex
                        selectedIRel, selectedRel = bestWrongIRel, bestWrongRel

                    if roots[selectedChild].parent_id != roots[
                            selectedParent].id or selectedRel != roots[
                                selectedChild].relation:
                        lerrors += 1
                        if roots[selectedChild].parent_id != roots[
                                selectedParent].id:
                            errors += 1
                            eerrors += 1

                    etotal += 1

                    for j in xrange(
                            max(0, selectedIndex - self.k - 1),
                            min(len(forest.roots),
                                selectedIndex + self.k + 2)):
                        roots[j].scores = None

                    unassigned[roots[selectedChild].parent_id] -= 1

                    roots[selectedParent].lstms[selectedOp] = roots[
                        selectedParent].lstms[selectedOp].add_input(
                            self.activation(self.lstm2lstm * noise(
                                concatenate([
                                    roots[selectedChild].lstms[0].output(),
                                    lookup(self.model["rels-lookup"],
                                           selectedIRel),
                                    roots[selectedChild].lstms[1].output()
                                ]), 0.0) + self.lstm2lstmbias))

                    forest.Attach(selectedParent, selectedChild)

                if len(errs) > 50:
                    eerrs = ((esum(errs)) * (1.0 / (float(len(errs)))))
                    scalar_loss = eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []

                    renew_cg()
                    self.Init()

        if len(errs) > 0:
            eerrs = (esum(errs)) * (1.0 / (float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []

            renew_cg()

        self.trainer.update_epoch()
        print "Loss: ", mloss / iSentence
Beispiel #16
0
    def Train(self, trainData):
        mloss = 0.0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ninf = -float('inf')

        beg = time.time()
        start = time.time()

        random.shuffle(
            trainData
        )  # in certain cases the data will already have been shuffled after being read from file or while creating dev data
        print "Length of training data: ", len(trainData)

        errs = []

        self.feature_extractor.Init()

        for iSentence, sentence in enumerate(trainData, 1):
            if iSentence % 100 == 0:
                loss_message = 'Processing sentence number: %d'%iSentence + \
                ' Loss: %.3f'%(eloss / etotal)+ \
                ' Errors: %.3f'%((float(eerrors)) / etotal)+\
                ' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\
                ' Time: %.2gs'%(time.time()-start)
                print loss_message
                start = time.time()
                eerrors = 0
                eloss = 0.0
                etotal = 0
                lerrors = 0

            sentence = deepcopy(
                sentence
            )  # ensures we are working with a clean copy of sentence and allows memory to be recycled each time round the loop

            conll_sentence = [
                entry for entry in sentence
                if isinstance(entry, utils.ConllEntry)
            ]
            conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
            self.feature_extractor.getWordEmbeddings(conll_sentence, True)
            stack = ParseForest([])
            buf = ParseForest(conll_sentence)
            hoffset = 1 if self.headFlag else 0
            lang = conll_sentence[1].language_id

            for root in conll_sentence:
                root.lstms = [root.vec] if self.headFlag else []
                if not self.feature_extractor.multiling or self.feature_extractor.shareWordLookup:
                    root.lstms += [
                        self.feature_extractor.paddingVec
                        for _ in range(self.nnvecs - hoffset)
                    ]
                else:
                    root.lstms += [
                        self.feature_extractor.paddingVecs[lang]
                        for _ in range(self.nnvecs - hoffset)
                    ]

            while not (len(buf) == 1 and len(stack) == 0):
                scores = self.__evaluate(stack, buf, True)

                #to ensure that we have at least one wrong operation
                scores.append([(None, 4, ninf, None)])

                stack_ids = [sitem.id for sitem in stack.roots]

                s1 = [stack.roots[-2]] if len(stack) > 1 else []
                s0 = [stack.roots[-1]] if len(stack) > 0 else []
                b = [buf.roots[0]] if len(buf) > 0 else []
                beta = buf.roots[1:] if len(buf) > 1 else []

                costs, shift_case = self.calculate_cost(
                    scores, s0, s1, b, beta, stack_ids)

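                # A transition is "valid" if its oracle cost is zero and, for
                # arc transitions, it predicts the gold relation; if no valid
                # transition exists, the sentence is dropped below.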
                bestValid = list(
                    (s for s in chain(*scores) if costs[s[1]] == 0 and (
                        s[1] == 2 or s[1] == 3 or s[0] == s0[0].relation)))
                if len(bestValid) < 1:
                    print "===============dropping a sentence==============="
                    break

                bestValid = max(bestValid, key=itemgetter(2))
                bestWrong = max(
                    (s for s in chain(*scores) if costs[s[1]] != 0 or (
                        s[1] != 2 and s[1] != 3 and s[0] != s0[0].relation)),
                    key=itemgetter(2))

                #force swap
                if costs[3] == 0:
                    best = bestValid
                else:
                    #select a transition to follow
                    # + aggressive exploration
                    if bestWrong[1] == 3:
                        best = bestValid
                    else:
                        best = bestValid if (
                            (not self.oracle) or
                            (bestValid[2] - bestWrong[2] > 1.0) or
                            (bestValid[2] > bestWrong[2]
                             and random.random() > 0.1)) else bestWrong

                #updates for the dynamic oracle
                if best[1] == 2:
                    #SHIFT
                    if shift_case == 2:
                        if b[0].parent_entry.id in stack_ids[:-1] and b[
                                0].id in b[0].parent_entry.rdeps:
                            b[0].parent_entry.rdeps.remove(b[0].id)
                        blocked_deps = [
                            d for d in b[0].rdeps if d in stack_ids
                        ]
                        for d in blocked_deps:
                            b[0].rdeps.remove(d)

                elif best[1] == 0 or best[1] == 1:
                    #LA or RA
                    child = s0[0]
                    s0[0].rdeps = []
                    if s0[0].id in s0[0].parent_entry.rdeps:
                        s0[0].parent_entry.rdeps.remove(s0[0].id)

                self.apply_transition(best, stack, buf, hoffset)

                if bestValid[2] < bestWrong[2] + 1.0:
                    loss = bestWrong[3] - bestValid[3]
                    mloss += 1.0 + bestWrong[2] - bestValid[2]
                    eloss += 1.0 + bestWrong[2] - bestValid[2]
                    errs.append(loss)

                #labeled errors
                if best[1] != 2 and best[1] != 3 and (
                        child.pred_parent_id != child.parent_id
                        or child.pred_relation != child.relation):
                    lerrors += 1
                    #attachment error
                    if child.pred_parent_id != child.parent_id:
                        eerrors += 1

                if best[1] == 0 or best[1] == 2:
                    etotal += 1

            #footnote 8 in Eli's original paper
            if len(errs) > 50:  # or True:
                eerrs = dy.esum(errs)
                scalar_loss = eerrs.scalar_value()  #forward
                eerrs.backward()
                self.trainer.update()
                errs = []
                lerrs = []

                dy.renew_cg()
                self.feature_extractor.Init()

        if len(errs) > 0:
            eerrs = (dy.esum(errs))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []

            dy.renew_cg()

        self.trainer.update()
        print "Loss: ", mloss / iSentence
        print "Total Training Time: %.2gs" % (time.time() - beg)
Beispiel #17
0
    def Predict(self, treebanks, datasplit, options):
        reached_max_swap = 0
        char_map = {}
        if options.char_map_file:
            char_map_fh = open(options.char_map_file, encoding='utf-8')
            char_map = json.loads(char_map_fh.read())
        # should probably use a namedtuple in get_vocab to make this prettier
        _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(
            treebanks, datasplit, char_map)

        # get external embeddings for the set of words and chars in the
        # test vocab but not in the training vocab
        test_embeddings = defaultdict(lambda: {})
        if options.word_emb_size > 0 and options.ext_word_emb_file:
            new_test_words = \
                set(test_words) - self.feature_extractor.words.keys()

            print("Number of OOV word types at test time: %i (out of %i)" %
                  (len(new_test_words), len(test_words)))

            if len(new_test_words) > 0:
                # no point loading embeddings if there are no words to look for
                for lang in test_langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_word_emb_file,
                        lang=lang,
                        words=new_test_words)
                    test_embeddings["words"].update(embeddings)
                if len(test_langs) > 1 and test_embeddings["words"]:
                    print("External embeddings found for %i words "\
                          "(out of %i)" % \
                          (len(test_embeddings["words"]), len(new_test_words)))

        if options.char_emb_size > 0:
            new_test_chars = \
                set(test_chars) - self.feature_extractor.chars.keys()
            print("Number of OOV char types at test time: %i (out of %i)" %
                  (len(new_test_chars), len(test_chars)))

            if len(new_test_chars) > 0:
                for lang in test_langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_char_emb_file,
                        lang=lang,
                        words=new_test_chars,
                        chars=True)
                    test_embeddings["chars"].update(embeddings)
                if len(test_langs) > 1 and test_embeddings["chars"]:
                    print("External embeddings found for %i chars "\
                          "(out of %i)" % \
                          (len(test_embeddings["chars"]), len(new_test_chars)))

        data = utils.read_conll_dir(treebanks, datasplit, char_map=char_map)
        for iSentence, osentence in enumerate(data, 1):
            sentence = deepcopy(osentence)
            reached_swap_for_i_sentence = False
            max_swap = 2 * len(sentence)
            iSwap = 0
            self.feature_extractor.Init(options)
            conll_sentence = [
                entry for entry in sentence
                if isinstance(entry, utils.ConllEntry)
            ]
            conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
            self.feature_extractor.getWordEmbeddings(conll_sentence, False,
                                                     options, test_embeddings)
            stack = ParseForest([])
            buf = ParseForest(conll_sentence)

            hoffset = 1 if self.headFlag else 0

            for root in conll_sentence:
                #empty = dy.zeros(2*options.lstm_output_size)
                root.lstms = [root.vec] if self.headFlag else []
                root.lstms += [root.vec for _ in range(self.nnvecs - hoffset)]
                root.relation = root.relation if root.relation in self.irels else 'runk'

            while not (len(buf) == 1 and len(stack) == 0):
                scores = self.__evaluate(stack, buf, False)
                best = max(
                    chain(*(scores if iSwap < max_swap else scores[:3])),
                    key=itemgetter(2))
                if iSwap == max_swap and not reached_swap_for_i_sentence:
                    reached_max_swap += 1
                    reached_swap_for_i_sentence = True
                    print("reached max swap in %d out of %d sentences" %
                          (reached_max_swap, iSentence))
                self.apply_transition(best, stack, buf, hoffset)
                if best[1] == SWAP:
                    iSwap += 1

            dy.renew_cg()

            #keep in memory the information we need, not all the vectors
            oconll_sentence = [
                entry for entry in osentence
                if isinstance(entry, utils.ConllEntry)
            ]
            oconll_sentence = oconll_sentence[1:] + [oconll_sentence[0]]
            for tok_o, tok in zip(oconll_sentence, conll_sentence):
                tok_o.pred_relation = tok.pred_relation
                tok_o.pred_parent_id = tok.pred_parent_id
            yield osentence
Beispiel #18
0
    def Train(self, conll_path):
        mloss = 0.0
        errors = 0
        batch = 0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ltotal = 0
        ninf = -float('inf')

        hoffset = 1 if self.headFlag else 0

        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, True))
            random.shuffle(shuffledData)

            errs = []
            eeloss = 0.0

            self.Init()

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (
                        float(eerrors)) / etotal, 'Labeled Errors:', (
                            float(lerrors) /
                            etotal), 'Time', time.time() - start
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
                self.getWordEmbeddings(conll_sentence, True)
                stack = ParseForest([])
                buf = ParseForest(conll_sentence)

                for root in conll_sentence:
                    root.lstms = [root.vec for _ in xrange(self.nnvecs)]

                hoffset = 1 if self.headFlag else 0

                while not (len(buf) == 1 and len(stack) == 0):
                    scores = self.__evaluate(stack, buf, True)
                    scores.append([(None, 3, ninf, None)])

                    alpha = stack.roots[:-2] if len(stack) > 2 else []
                    s1 = [stack.roots[-2]] if len(stack) > 1 else []
                    s0 = [stack.roots[-1]] if len(stack) > 0 else []
                    b = [buf.roots[0]] if len(buf) > 0 else []
                    beta = buf.roots[1:] if len(buf) > 1 else []

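                    # Dynamic-oracle costs: each transition's cost is the
                    # number of gold arcs it would make unreachable.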
                    left_cost = (
                        len([h
                             for h in s1 + beta if h.id == s0[0].parent_id]) +
                        len([d for d in b + beta if d.parent_id == s0[0].id])
                    ) if len(scores[0]) > 0 else 1
                    right_cost = (
                        len([h for h in b + beta if h.id == s0[0].parent_id]) +
                        len([d for d in b + beta if d.parent_id == s0[0].id])
                    ) if len(scores[1]) > 0 else 1
                    shift_cost = (
                        len([h
                             for h in s1 + alpha if h.id == b[0].parent_id]) +
                        len([
                            d
                            for d in s0 + s1 + alpha if d.parent_id == b[0].id
                        ])) if len(scores[2]) > 0 else 1
                    costs = (left_cost, right_cost, shift_cost, 1)

                    bestValid = max(
                        (s for s in chain(*scores) if costs[s[1]] == 0 and (
                            s[1] == 2 or s[0] == stack.roots[-1].relation)),
                        key=itemgetter(2))
                    bestWrong = max(
                        (s for s in chain(*scores) if costs[s[1]] != 0 or (
                            s[1] != 2 and s[0] != stack.roots[-1].relation)),
                        key=itemgetter(2))
                    best = bestValid if (
                        (not self.oracle) or
                        (bestValid[2] - bestWrong[2] > 1.0) or
                        (bestValid[2] > bestWrong[2]
                         and random.random() > 0.1)) else bestWrong

                    if best[1] == 2:
                        stack.roots.append(buf.roots[0])
                        del buf.roots[0]

                    elif best[1] == 0:
                        child = stack.roots.pop()
                        parent = buf.roots[0]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 0
                        if self.rlMostFlag:
                            parent.lstms[bestOp +
                                         hoffset] = child.lstms[bestOp +
                                                                hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    elif best[1] == 1:
                        child = stack.roots.pop()
                        parent = stack.roots[-1]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 1
                        if self.rlMostFlag:
                            parent.lstms[bestOp +
                                         hoffset] = child.lstms[bestOp +
                                                                hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    if bestValid[2] < bestWrong[2] + 1.0:
                        loss = bestWrong[3] - bestValid[3]
                        mloss += 1.0 + bestWrong[2] - bestValid[2]
                        eloss += 1.0 + bestWrong[2] - bestValid[2]
                        errs.append(loss)

                    if best[1] != 2 and (
                            child.pred_parent_id != child.parent_id
                            or child.pred_relation != child.relation):
                        lerrors += 1
                        if child.pred_parent_id != child.parent_id:
                            errors += 1
                            eerrors += 1

                    etotal += 1

                if len(errs) > 50:  # or True:
                    #eerrs = ((esum(errs)) * (1.0/(float(len(errs)))))
                    eerrs = esum(errs)
                    scalar_loss = eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []

                    renew_cg()
                    self.Init()

        if len(errs) > 0:
            eerrs = (esum(errs))  # * (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []

            renew_cg()

        # self.trainer.update_epoch()  # hanwj 6.20 . there is no any decay, so just remove it.
        # self.trainer.learning_rate /= (1 - rate_decay)
        print "Loss: ", mloss / iSentence
Beispiel #19
0
    def Predict(self, data):
        reached_max_swap = 0
        for iSentence, sentence in data:
            reached_swap_for_i_sentence = False
            max_swap = 2 * len(sentence)
            iSwap = 0
            self.Init()
            conll_sentence = [
                entry for entry in sentence
                if isinstance(entry, utils.ConllEntry)
            ]
            conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
            self.getWordEmbeddings(conll_sentence, False)
            stack = ParseForest([])
            buf = ParseForest(conll_sentence)

            hoffset = 1 if self.headFlag else 0

            for root in conll_sentence:
                root.lstms = [root.vec] if self.headFlag else []
                root.lstms += [
                    self.paddingVec for _ in range(self.nnvecs - hoffset)
                ]
                root.relation = root.relation if root.relation in self.rels else 'runk'

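            # Greedy decoding: always take the highest-scoring transition;
            # disallow SWAP once the per-sentence swap budget is spent.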
            while not (len(buf) == 1 and len(stack) == 0):
                scores = self.__evaluate(stack, buf, False)
                best = max(
                    chain(*(scores if iSwap < max_swap else scores[:3])),
                    key=itemgetter(2))
                if iSwap == max_swap and not reached_swap_for_i_sentence:
                    reached_max_swap += 1
                    reached_swap_for_i_sentence = True
                    print "reached max swap in %d out of %d sentences" % (
                        reached_max_swap, iSentence)

                if best[1] == 2:
                    #SHIFT
                    stack.roots.append(buf.roots[0])
                    del buf.roots[0]

                elif best[1] == 3:
                    #SWAP
                    iSwap += 1
                    child = stack.roots.pop()
                    buf.roots.insert(1, child)

                elif best[1] == 0:
                    #LEFT-ARC
                    child = stack.roots.pop()
                    parent = buf.roots[0]

                    #predict rel and label
                    child.pred_parent_id = parent.id
                    child.pred_relation = best[0]

                elif best[1] == 1:
                    #RIGHT-ARC
                    child = stack.roots.pop()
                    parent = stack.roots[-1]

                    child.pred_parent_id = parent.id
                    child.pred_relation = best[0]

                #update the representation of head for attaching transitions
                if best[1] == 0 or best[1] == 1:
                    #linear order
                    if self.rlMostFlag:
                        parent.lstms[best[1] + hoffset] = child.lstms[best[1] +
                                                                      hoffset]
                    #actual children
                    if self.rlFlag:
                        parent.lstms[best[1] + hoffset] = child.vec

            dy.renew_cg()
            yield sentence