def createDebugData(self,treebank,options):
     ext = '.conllu' if options.conllu else '.conll'
     print('Creating smaller data sets for debugging')
     if not options.predict:
         train_data = list(utils.read_conll(treebank.trainfile,maxSize=options.debug_train_sents,hard_lim=True))
         train_file = os.path.join(treebank.outdir,'train-debug' + ext) # location for the new train file
         utils.write_conll(train_file,train_data) # write the new train data to file
         treebank.trainfile = train_file
         if treebank.devfile and os.path.exists(treebank.devfile) and options.pred_dev:
             dev_data = list(utils.read_conll(treebank.devfile,maxSize=options.debug_dev_sents,hard_lim=True))
             dev_file = os.path.join(treebank.outdir,'dev-debug' + ext) # location for the new dev file
             utils.write_conll(dev_file,dev_data) # write the new dev data to file
             # have to create a separate debug gold file if not the same as input file
             if treebank.dev_gold != treebank.devfile:
                 dev_gold_data = list(utils.read_conll(treebank.dev_gold,maxSize=options.debug_dev_sents,hard_lim=True))
                 dev_gold_file = os.path.join(treebank.outdir,'dev-gold-debug' + ext) # location for the new dev gold file
                 utils.write_conll(dev_gold_file,dev_gold_data) # write the new dev gold data to file
                 treebank.dev_gold = dev_gold_file
             else:
                 treebank.dev_gold = dev_file
             treebank.devfile = dev_file # important to do this last
     else:
         test_data = list(utils.read_conll(treebank.testfile,maxSize=options.debug_test_sents,hard_lim=True))
         test_file = os.path.join(treebank.outdir,'test-debug' + ext) # location for the new test file
         utils.write_conll(test_file,test_data) # write the new test data to file
         if treebank.test_gold != treebank.testfile:
             test_gold_data = list(utils.read_conll(treebank.test_gold,maxSize=options.debug_test_sents,hard_lim=True))
             test_gold_file = os.path.join(treebank.outdir,'test-gold-debug' + ext) # location for the new test gold file
             utils.write_conll(test_gold_file,test_gold_data) # write the new test gold data to file
             treebank.test_gold = test_gold_file
         else:
             treebank.test_gold = test_file
         treebank.testfile = test_file
def readdata(src_corpus_name, tgt_corpus_name, shuffle=True, seed=34):
    """ Read in src and tgt data, and only shuffle the target data.

    """
    src_reader = utils.read_conll(src_corpus_name)
    tgt_reader = utils.read_conll(tgt_corpus_name)
    src_data, tgt_data = list(src_reader), list(tgt_reader)
    if shuffle:  # Only shuffle tgt.
        tgt_data = shuffle_data(tgt_data, seed)

    return src_data, tgt_data
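# A minimal sketch of the shuffle_data helper used above (hypothetical; the
# real implementation is not shown): a seeded shuffle that leaves the caller's
# list untouched.
import random

def shuffle_data(data, seed):
    shuffled = list(data)
    random.Random(seed).shuffle(shuffled)
    return shuffled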
def write_new_split(corpus_name, test_size, filedir, filename,
                    seed = 42, max_count = 2):
    """ Do stratified random sampling for a given corpus given by corpus_name,
    and save the results in file filename in directory filedir. Parameter
    test_size indicates the number of sentences to be used in the test set.

    For information on the parameter max_count, see the documentation for
    function stratified_split.

    For now, this only supports stratified_split at the sentence level.

    >>> TRAIN, TEST = write_new_split('CADEC', 1000, filedir, 'cadec', max_count = 2)
    >>> TRAIN, TEST = write_new_split('re3d', 200, filedir, 're3d', max_count = 2)
    >>> TRAIN, TEST = write_new_split('GUM', 1000, filedir, 'gum', max_count = 2)
    >>> TRAIN, TEST = write_new_split('MUC6', 1000, filedir, 'muc6', max_count = 2)
    >>> TRAIN, TEST = write_new_split('NIST_IEER99', 690, filedir, 'nist', max_count = 2)
    >>> TRAIN, TEST = write_new_split('BBN', 10000, filedir, 'bbn', max_count = 2)
    >>> TRAIN, TEST = write_new_split('GMB1', 1000, filedir, 'gmb1', max_count = 2)

    """
    r = utils.read_conll(corpus_name)
    sentences = list(r)
    train_data, test_data = stratified_split(sentences,
                                             test_size,
                                             seed = seed,
                                             max_count = max_count)

    writefile(train_data, os.path.join(filedir,'train'), filename+'-train.conll')
    writefile(test_data, os.path.join(filedir,'test'), filename+'-test.conll')

    return train_data, test_data
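# A minimal sketch of the writefile helper used above (hypothetical): create
# the target directory if needed, then delegate to utils.write_conll.
import os

def writefile(sentences, filedir, filename):
    if not os.path.exists(filedir):
        os.makedirs(filedir)
    utils.write_conll(os.path.join(filedir, filename), sentences)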
Example #4
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP, False)):
                self.Init()

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
                self.getWordEmbeddings(conll_sentence, False)
                stack = ParseForest([])
                buf = ParseForest(conll_sentence)

                for root in conll_sentence:
                    root.lstms = [root.vec for _ in xrange(self.nnvecs)]

                hoffset = 1 if self.headFlag else 0

                while not (len(buf) == 1 and len(stack) == 0):
                    scores = self.__evaluate(stack, buf, False)
                    best = max(chain(*scores), key=itemgetter(2))
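                    # best is a (relation, op, score) tuple; the op codes used
                    # in the branches below are 0 = left-arc, 1 = right-arc, 2 = shift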

                    if best[1] == 2:
                        stack.roots.append(buf.roots[0])
                        del buf.roots[0]

                    elif best[1] == 0:
                        child = stack.roots.pop()
                        parent = buf.roots[0]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 0
                        if self.rlMostFlag:
                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    elif best[1] == 1:
                        child = stack.roots.pop()
                        parent = stack.roots[-1]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 1
                        if self.rlMostFlag:
                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                renew_cg()
                yield sentence
Example #5
def train(args):

    with tf.Graph().as_default(), create_session(args.use_xla) as session:
        vocab, tags, relations = utils.extract_vocab(args.train_file)

        with open(args.train_file) as f:
            sentences, trees = utils.read_conll(f, vocab, tags, relations,
                                                True)

        with tf.device(device_placement(args)):
            m = model.Model(args.embedding_size,
                            args.hidden_layer_size,
                            vocab,
                            tags,
                            relations,
                            session,
                            activation=args.activation,
                            l2_weight=args.l2,
                            learning_rate=args.learning_rate)

            init = tf.global_variables_initializer()

        session.run(init)

        m.train(trees,
                batch_size=args.batch_size,
                epochs=args.epochs,
                dropout_keep_prob=args.dropout_keep_prob)

        m.save_to(args.save_to)
Example #6
 def prepareDev(self,treebank,options):
     treebank.pred_dev = options.pred_dev # may be reset to False below if no dev data is available
     if not treebank.devfile or not os.path.exists(treebank.devfile):
         if options.create_dev: # create some dev data from the training data
             train_data = list(utils.read_conll(treebank.trainfile))
             tot_sen = len(train_data)
             if tot_sen > options.min_train_sents: # need to have at least min_train_sents to move forward
                 dev_file = os.path.join(treebank.outdir,'dev-split' + '.conllu') # location for the new dev file
                 train_file = os.path.join(treebank.outdir,'train-split' + '.conllu') # location for the new train file
                 dev_len = int(0.01*options.dev_percent*tot_sen)
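                 # e.g. --dev-percent 5 with 12,000 training sentences gives int(0.01 * 5 * 12000) = 600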
                 print ("Taking " + str(dev_len) + " of " + str(tot_sen)
                         + " sentences from training data as new dev data for " + treebank.name)
                 random.shuffle(train_data)
                 dev_data = train_data[:dev_len]
                 utils.write_conll(dev_file,dev_data) # write the new dev data to file
                 train_data = train_data[dev_len:] # put the rest of the training data in a new file too
                 utils.write_conll(train_file,train_data)
                 # update some variables with the new file locations
                 treebank.dev_gold = dev_file
                 treebank.devfile = dev_file
                 treebank.trainfile = train_file
             else: # not enough sentences
                 print ("Warning: not enough sentences in training data to create dev set for "
                     + treebank.name + " (minimum required --min-train-size: " + str(options.min_train_sents) + ")")
                 treebank.pred_dev = False
         else: # option --create-dev not set
             print ("Warning: No dev data for " + treebank.name
                     + ", consider adding option --create-dev to create dev data from training set")
             treebank.pred_dev = False
     if options.model_selection and not treebank.pred_dev:
         print "Warning: can't do model selection for " + treebank.name + " as prediction on dev data is off"
Example #7
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP)):
                conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

                for entry in conll_sentence:
                    wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
                    posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
                    evec = self.elookup[int(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)))] if self.external_embedding is not None else None
                    entry.vec = concatenate(filter(None, [wordvec, posvec, evec]))

                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence, True)
                heads = decoder.parse_proj(scores)

                for entry, head in zip(conll_sentence, heads):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'

                dump = False

                if self.labelsFlag:
                    for modifier, head in enumerate(heads[1:]):
                        scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier+1)
                        conll_sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]]

                renew_cg()
                if not dump:
                    yield sentence
Example #8
def ensemble(files, outfile):
    """
    Takes conllu files as input
    """
    conllu_files = []
    for f in files:
        cf = utils.read_conll(f)
        conllu_files.append(cf)
    zipped_sentences = zip(*conllu_files)
    decoder = DependencyDecoder()
    sentences_out = []
    for zipped_sentence in zipped_sentences:
        conll_sentence = [
            entry for entry in zipped_sentence[0]
            if isinstance(entry, utils.ConllEntry)
        ]
        n_words = len(conll_sentence)
        m = np.zeros((n_words, n_words))
        for i_sentence in zipped_sentence:
            conll_sen = [
                entry for entry in i_sentence
                if isinstance(entry, utils.ConllEntry)
            ]
            for item in conll_sen:
                head = item.parent_id
                dep = item.id
                m[head, dep] += 1

        #NOTE: this takes the label of the first!
        heads = decoder.parse_nonproj(m)
        for entry in zipped_sentence[0]:
            if isinstance(entry, utils.ConllEntry):
                entry.pred_parent_id = heads[entry.id]
        sentences_out.append(zipped_sentence[0])
    utils.write_conll(outfile, sentences_out)
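# Hypothetical usage: vote over the trees predicted by three different parsers
# (file names are placeholders).
ensemble(['model_a.conllu', 'model_b.conllu', 'model_c.conllu'], 'ensembled.conllu')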
Example #9
 def build_dataset(self, filename, skip_mwt):
     """Reads an input CoNLL-U file and returns a list of ConlluToken objects for each token in a sentence."""
     print("Building dataset using {}".format(filename))
     print("Skipping MWTs: {}".format(skip_mwt))
     annotated_sentences, comment_lines = read_conll(filename, skip_mwt)
     vocab = buildVocab(annotated_sentences, cutoff=1)
     return annotated_sentences, vocab, comment_lines
Example #10
def get_embeddings():
    """ Obtain and trim the word embeddings in both the source and target
    datasets.

    This will try to use the vocabularies of: 'GUM', 're3d', 'BBN', 'i2b2-14',
    'i2b2-06', 'CADEC', 'TwitterRitter', 'MITRestaurantCorpus',
    'MITMovieCorpus-trivia10k13', 'MUC6', 'NIST_IEER99', 'GMB1', as well as
    CONLL 2003.

    If a dataset is not found, it is skipped over.

    """
    embeddingsPath = 'word_embeddings/glove.6B.' + WVDIM + 'd.txt.gz'

    print("Getting vocab from various datasets...")

    dnames = [
        'GUM', 're3d', 'BBN', 'i2b2-14', 'i2b2-06', 'CADEC', 'TwitterRitter',
        'MITRestaurantCorpus', 'MITMovieCorpus-trivia10k13', 'MUC6',
        'NIST_IEER99', 'GMB1'
    ]

    try:
        conll03 = list(utils.read_conll('CONLL03'))
    except Exception:
        raise ValueError("Could not find CONLL 2003 dataset.")

    aggregation = []
    for dname in dnames:
        try:
            dataset = list(utils.read_conll(dname))
            aggregation.extend(dataset)
        except Exception:
            print(dname + " could not be found.")

    aggregation = conll03 + aggregation

    words = lc.get_word2idx2(aggregation)
    max_len = lc.get_maxlen(aggregation)

    # NOTE: max_len was 253 for our experiments.

    print("Getting word embeddings...")
    we, w2i = embedding_utils.get_word_embeddings(embeddingsPath, words)

    return max_len, we, w2i, words
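# A hedged sketch of what embedding_utils.get_word_embeddings is assumed to do
# here (the real helper is not shown): stream the gzipped GloVe file and keep
# only the vectors for words that actually occur in the aggregated corpora.
# The special tokens and the uniform init for unknowns are assumptions.
import gzip

import numpy as np

def trim_glove_embeddings(path, words, dim=100):
    w2i = {'<PAD>': 0, '<UNK>': 1}  # hypothetical special tokens
    vectors = [np.zeros(dim), np.random.uniform(-0.25, 0.25, dim)]
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if parts[0] in words and parts[0] not in w2i:
                w2i[parts[0]] = len(vectors)
                vectors.append(np.asarray(parts[1:], dtype='float32'))
    return np.stack(vectors), w2i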
Example #11
    def train(self, conll_path):
        print('pytorch version:', torch.__version__)
        batch = 1
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        iSentence = 0
        start = time.time()
        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP))
            random.shuffle(shuffledData)
            for iSentence, sentence in enumerate(shuffledData):
                # print("Initializing hidden and cell states values to 0")
                self.model.hid_for_1, self.model.hid_back_1, self.model.hid_for_2, self.model.hid_back_2 = [
                    self.model.init_hidden(self.model.ldims) for _ in range(4)
                ]
                # if iSentence == 0:
                #     print('hidLayerFOM values on first iteration within an epoch')
                #     print(self.model.hidLayerFOM)
                if iSentence % 100 == 0 and iSentence != 0:
                    print('Processing sentence number:', iSentence, 'eloss:',
                          eloss, 'etotal:',
                          etotal, 'Loss:', eloss / etotal, 'eerrors:',
                          float(eerrors), 'Errors:', (float(eerrors)) / etotal,
                          'Time',
                          time.time() - start)
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    # print('hidLayerFOM values:')
                    # print(self.model.hidLayerFOM)

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                e_output, errs, lerrs = self.model.forward(conll_sentence)
                eerrors += e_output
                eloss += e_output
                mloss += e_output
                etotal += len(sentence)
                if iSentence % batch == 0 or len(errs) > 0 or len(lerrs) > 0:
                    if len(errs) > 0 or len(lerrs) > 0:
                        reshaped_lerrs = [item.reshape(1) for item in lerrs]
                        l_variable = errs + reshaped_lerrs
                        # sum the head and label losses into a 1-d tensor
                        eerrs_sum = torch.sum(concatenate_tensors(l_variable))
                        eerrs_sum.backward()  # backpropagate the summed loss
                        # self.print_model_parameters()
                        self.trainer.step()  # update the weights (uncomment print_model_parameters to inspect)
                        # self.print_model_parameters()
                self.trainer.zero_grad()
        print("Loss: ", mloss / iSentence)
def main():
    MSTParserLSTM.rnn_mlp = rnn_mlp
    MSTParserLSTM.bilinear = bilinear
    MSTParserLSTM.build_graph = build_graph

    parser = OptionParser()
    parser.add_option("--extrn",
                      dest="external_embedding",
                      help="External embeddings",
                      metavar="FILE",
                      default="../data/sskip.100.vectors.gz")
    parser.add_option("--params",
                      dest="params",
                      help="Parameters file",
                      metavar="FILE",
                      default="../models/params.pickle")
    parser.add_option("--model",
                      dest="model",
                      help="Load/Save model file",
                      metavar="FILE",
                      default="../models/model-135")
    parser.add_option("--test",
                      dest="conll_test",
                      help="Annotated CONLL test file",
                      metavar="FILE",
                      default=None)

    parser.add_option("--repl-words",
                      dest="repl_words",
                      help="Words at probing position",
                      metavar="FILE",
                      default="../probe/repl.words")
    parser.add_option("--output-dir",
                      dest="output_dir",
                      help="Output directory",
                      default="../probe/1")
    parser.add_option("--probe-index", type="int", dest="probe_idx", default=0)

    (options, args) = parser.parse_args()

    with open(options.params, 'rb') as paramsfp:
        w2i, pos, rels, chars, stored_opt = pkl.load(paramsfp)
    stored_opt.external_embedding = options.external_embedding
    mstParser = MSTParserLSTM(pos, rels, w2i, chars, stored_opt)
    mstParser.Load(options.model)

    probe_buckets = [list()]
    with open(options.conll_test, 'r') as f:
        probe_data = list(utils.read_conll(f))
    for d in probe_data:
        probe_buckets[0].append(d)

    probe_result = probe(mstParser, probe_buckets, options.probe_idx)

    with open(options.repl_words, 'r') as f:
        repl_words = [word.strip() for word in f if word.strip() != '']
    write_probes(repl_words, probe_result, options.output_dir)
Example #13
 def predict(self, conll_path):
     with open(conll_path, 'r') as conllFP:
         for iSentence, sentence in enumerate(read_conll(conllFP)):
             conll_sentence = [
                 entry for entry in sentence
                 if isinstance(entry, utils.ConllEntry)
             ]
             self.model.predict(conll_sentence)
             yield conll_sentence
Example #14
 def predict(self, conll_path):
     self.model.init()
     with open(conll_path, 'r', encoding='UTF-8') as conllFP:
         for iSentence, sentence in enumerate(read_conll(conllFP, proj=False)):
             self.model.hid_for_1, self.model.hid_back_1, self.model.hid_for_2, self.model.hid_back_2 = [self.model.init_hidden(self.model.ldims) for _ in range(4)]
             conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
             conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
             self.model.predict(conll_sentence)
             self.trainer.zero_grad()
             yield sentence
Example #15
def main():
    config = utils.Config()

    filenames = os.listdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                     "preprocessed"))
    filenames = [n for n in filenames if n.endswith(".paragraph.boundaries")]
    filenames = [
        n.replace(".paragraph.boundaries", ".edus") for n in filenames
    ]
    filenames.sort()

    for filename in filenames:
        # Path
        path_edus = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                 "tmp.preprocessing", filename + ".tokenized")
        path_conll = os.path.join(
            config.getpath("data"), "ptbwsj_wo_rstdt", "tmp.preprocessing",
            filename.replace(".edus", ".sentences.conll"))
        path_out = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "preprocessed", filename + ".postags")

        # Read
        edus = utils.read_lines(
            path_edus,
            process=lambda line: line.split())  # list of list of str
        tokens_e = utils.flatten_lists(edus)  # list of str

        sentences = utils.read_conll(
            path_conll,
            keys=["ID", "FORM", "LEMMA", "POSTAG", "_1", "HEAD",
                  "DEPREL"])  # list of list of {str: str}
        conll_lines = utils.flatten_lists(sentences)  # list of {str: str}
        tokens_s = [conll_line["FORM"]
                    for conll_line in conll_lines]  # list of str
        postags_s = [conll_line["POSTAG"]
                     for conll_line in conll_lines]  # list of str

        # Check that the EDU tokens align with the CoNLL tokens
        for token_e, token_s in zip(tokens_e, tokens_s):
            if token_e != token_s:
                raise ValueError("Error! %s != %s" % (token_e, token_s))

        # Create the POSTAG-version of EDUs
        postag_i = 0
        edus_postag = []
        for edu in edus:
            edu_postag = [postags_s[postag_i + i] for i in range(len(edu))]
            edus_postag.append(edu_postag)
            postag_i += len(edu)
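        # e.g. with edus = [["John", "runs"], ["fast"]] the three POS tags are
        # consumed in order, yielding [["NNP", "VBZ"], ["RB"]] (tags illustrative)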

        # Write
        with open(path_out, "w") as f:
            for edu_postag in edus_postag:
                f.write("%s\n" % " ".join(edu_postag))
Example #16
 def createDebugData(self,treebank,options):
     ext = '.conllu' if self.conllu else '.conll'
     print('Creating smaller data sets for debugging')
     if not options.predict:
         traindata = list(utils.read_conll(treebank.trainfile,treebank.iso_id,maxSize=options.debug_train_sents,hard_lim=True))
         train_file = os.path.join(treebank.outdir,'train-debug' + ext) # location for the new train file
         utils.write_conll(train_file,traindata) # write the new train data to file
         treebank.trainfile = train_file
         if treebank.devfile and os.path.exists(treebank.devfile) and options.pred_dev:
             devdata = list(utils.read_conll(treebank.devfile,treebank.iso_id,maxSize=options.debug_dev_sents,hard_lim=True))
             dev_file = os.path.join(treebank.outdir,'dev-debug' + ext) # location for the new dev file
             utils.write_conll(dev_file,devdata) # write the new dev data to file
             treebank.dev_gold = dev_file
             treebank.devfile = dev_file
     else:
         testdata = list(utils.read_conll(treebank.testfile,treebank.iso_id,maxSize=options.debug_test_sents,hard_lim=True))
         test_file = os.path.join(treebank.outdir,'test-debug' + ext) # location for the new test file
         utils.write_conll(test_file,testdata) # write the new test data to file
         treebank.test_gold = test_file
         treebank.testfile = test_file
Example #17
def test(args):

    with tf.Graph().as_default(), create_session(args.use_xla) as session:
        with tf.device(device_placement(args)):
            m = model.Model.load_from(args.model, session)

        with open(args.test_file) as f:
            sentences, trees = utils.read_conll(f, m.vocab, m.tags,
                                                m.relations)

        m.parse(sentences, args.output, print_progress=args.progress)
Example #18
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP, False)):
                self.Init()
                forest = ParseForest(sentence)
                self.getWordEmbeddings(forest, False)

                for root in forest.roots:
                    root.lstms = [
                        self.builders[0].initial_state().add_input(root.vec),
                        self.builders[1].initial_state().add_input(root.vec)
                    ]

                while len(forest.roots) > 1:

                    self.__evaluate(forest, False)
                    bestParent, bestChild, bestScore = None, None, float("-inf")
                    bestIndex, bestOp = None, None
                    roots = forest.roots
                    roots = forest.roots

                    for i in xrange(len(forest.roots) - 1):
                        for irel, rel in enumerate(self.irels):
                            for op in xrange(2):
                                if bestScore < roots[i].scores[irel][op] and (i + (1 - op)) > 0:
                                    bestParent, bestChild = i + op, i + (1 - op)
                                    bestScore = roots[i].scores[irel][op]
                                    bestIndex, bestOp = i, op
                                    bestRelation, bestIRelation = rel, irel

                    for j in xrange(max(0, bestIndex - self.k - 1),
                                    min(len(forest.roots), bestIndex + self.k + 2)):
                        roots[j].scores = None

                    roots[bestChild].pred_parent_id = forest.roots[bestParent].id
                    roots[bestChild].pred_relation = bestRelation

                    roots[bestParent].lstms[bestOp] = roots[bestParent].lstms[bestOp].add_input(
                        self.activation(self.lstm2lstmbias + self.lstm2lstm * concatenate([
                            roots[bestChild].lstms[0].output(),
                            lookup(self.model["rels-lookup"], bestIRelation),
                            roots[bestChild].lstms[1].output()])))

                    forest.Attach(bestParent, bestChild)

                renew_cg()
                yield sentence
Example #19
 def predict(self, conll_path):
     with open(conll_path, 'r', encoding='utf-8') as conllFP:
         for iSentence, sentence in enumerate(read_conll(conllFP)):
             self.model.hid_for_1, self.model.hid_back_1, self.model.hid_for_2, self.model.hid_back_2 = [
                 self.model.init_hidden(self.model.ldims) for _ in range(4)
             ]
             conll_sentence = [
                 entry for entry in sentence
                 if isinstance(entry, utils.ConllEntry)
             ]
             self.model.predict(conll_sentence)
             yield conll_sentence
Example #20
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP, False)):
                self.Init()

                conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

                conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
                self.getWordEmbeddings(conll_sentence, False)
                stack = ParseForest([])
                buf = ParseForest(conll_sentence)

                for root in conll_sentence:
                    root.lstms = [root.vec for _ in xrange(self.nnvecs)]

                hoffset = 1 if self.headFlag else 0

                while not (len(buf) == 1 and len(stack) == 0):
                    scores = self.__evaluate(stack, buf, False)
                    best = max(chain(*scores), key=itemgetter(2))

                    if best[1] == 2:
                        stack.roots.append(buf.roots[0])
                        del buf.roots[0]

                    elif best[1] == 0:
                        child = stack.roots.pop()
                        parent = buf.roots[0]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 0
                        if self.rlMostFlag:
                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    elif best[1] == 1:
                        child = stack.roots.pop()
                        parent = stack.roots[-1]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 1
                        if self.rlMostFlag:
                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                renew_cg()
                yield sentence
Example #21
    def train(self, conll_path):
        print('pytorch version:', torch.__version__)
        batch = 1
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        iSentence = 0
        start = time.time()
        with open(conll_path, 'r', encoding='utf-8') as conllFP:
            shuffledData = list(read_conll(conllFP))
            random.shuffle(shuffledData)
            errs = []
            lerrs = []
            for iSentence, sentence in enumerate(shuffledData):
                self.model.hid_for_1, self.model.hid_back_1, self.model.hid_for_2, self.model.hid_back_2 = [
                    self.model.init_hidden(self.model.ldims) for _ in range(4)
                ]
                if iSentence % 100 == 0 and iSentence != 0:
                    print('Processing sentence number:', iSentence, 'Loss:',
                          eloss / etotal, 'Errors:', (float(eerrors)) / etotal,
                          'Time',
                          time.time() - start)
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]
                e = self.model.forward(conll_sentence, errs, lerrs)
                eerrors += e
                eloss += e
                mloss += e
                etotal += len(sentence)
                if iSentence % batch == 0 or len(errs) > 0 or len(lerrs) > 0:
                    if len(errs) > 0 or len(lerrs) > 0:
                        eerrs = torch.sum(cat(errs + lerrs))
                        eerrs.backward()
                        self.trainer.step()
                        errs = []
                        lerrs = []
                self.trainer.zero_grad()
        if len(errs) > 0 or len(lerrs) > 0:
            eerrs = torch.sum(cat(errs + lerrs))
            eerrs.backward()
            self.trainer.step()
        self.trainer.zero_grad()
        print("Loss: ", mloss / iSentence)
Example #22
    def train(self, conll_path):
        mloss = 0.0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        hoffset = 1 if self.headFlag else 0
        start = time.time()

        with open(conll_path, 'r', encoding='UTF-8') as conllFP:
            shuffledData = list(read_conll(conllFP, proj=True))
            random.shuffle(shuffledData)
            errs = []
            self.model.init()
            for iSentence, sentence in enumerate(shuffledData):
                self.model.hid_for_1, self.model.hid_back_1, self.model.hid_for_2, self.model.hid_back_2 = [self.model.init_hidden(self.model.ldims) for _ in range(4)]
                if iSentence % 100 == 0 and iSentence != 0:
                    print('Processing sentence number:', iSentence,
                          'Loss:', eloss / etotal,
                          'Errors:', (float(eerrors)) / etotal,
                          'Labeled Errors:', (float(lerrors) / etotal),
                          'Time', time.time()-start)
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
                conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
                dloss, deerrors, dlerrors, detotal = self.model.train(conll_sentence, errs)
                eloss += dloss
                mloss += dloss
                eerrors += deerrors
                lerrors += dlerrors
                etotal += detotal
                if len(errs) > 0: # or True:
                    eerrs = torch.sum(cat(errs))
                    eerrs.backward()
                    self.trainer.step()
                    errs = []
                    self.trainer.zero_grad()
                    self.model.init()
        if len(errs) > 0:
            eerrs = torch.sum(cat(errs)) # * (1.0/(float(len(errs))))
            eerrs.backward()
            self.trainer.step()
            errs = []
            self.trainer.zero_grad()
        self.trainer.step()
        print("Loss: ", mloss/iSentence)
Example #23
 def test_predict(self, path, epoch):
     test = open(path, 'r')
     testData = list(read_conll(test))
     test.close()
     data_list = utils.construct_parsing_data_list(testData, self.word_dict,
                                                   self.pos_dict)
     batch_test_data = utils.construct_sorted_batch_data(
         data_list, self.test_batch_size)
     tot_batch = len(batch_test_data)
     for batch_id, one_batch in tqdm(enumerate(batch_test_data),
                                     mininterval=2,
                                     desc=' -Tot it %d (epoch %d)' %
                                     (tot_batch, epoch),
                                     leave=False,
                                     file=sys.stdout):
         self.model.predict(one_batch)
Example #24
 def Predict(self, conll_path):
     dev_buckets = [list()]
     dev_data = list(read_conll(conll_path))
     for d in dev_data:
         dev_buckets[0].append(d)
     minibatches = get_batches(dev_buckets, self, False)
     outputs = self.decode(minibatches)
     results = [self.iroles[np.argmax(outputs[i])] for i in range(len(outputs))]
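     # outputs are flat over (sentence, predicate, argument) triples; the loop
     # below walks them with a running offset in the same order they were batched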
     offset = 0
     for iSentence, sentence in enumerate(dev_data):
         for p in xrange(len(sentence.predicates)):
             for arg_index in xrange(len(sentence.entries)):
                 sentence.entries[arg_index].predicateList[p] = results[offset]
                 offset+=1
         yield sentence
Example #25
    def train(self, conll_path):
        print(torch.__version__)
        batch = 1
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        iSentence = 0
        start = time.time()
        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP))
            random.shuffle(shuffledData)
            errs = []
            lerrs = []
            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 100 == 0 and iSentence != 0:
                    print('Processing sentence number:', iSentence,
                          'Loss:', eloss / etotal,
                          'Errors:', (float(eerrors)) / etotal,
                          'Time', time.time() - start)
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]
                e = self.model.forward(conll_sentence, errs, lerrs)
                eerrors += e
                eloss += e
                mloss += e
                etotal += len(sentence)
                if iSentence % batch == 0 or len(errs) > 0 or len(lerrs) > 0:
                    if len(errs) > 0 or len(lerrs) > 0:
                        eerrs = torch.sum(cat(errs + lerrs))
                        eerrs.backward()
                        self.trainer.step()
                        errs = []
                        lerrs = []
                self.trainer.zero_grad()
        if len(errs) > 0 or len(lerrs) > 0:
            eerrs = torch.sum(cat(errs + lerrs))
            eerrs.backward()
            self.trainer.step()
        self.trainer.zero_grad()
        print "Loss: ", mloss / iSentence
Example #26
    def predict(self, conll_path):
        self.transitionModel.init()
        num_g, num_t = 0, 0
        sentences_g = []
        sentences_t = []
        with open(conll_path, "r", encoding='UTF-8') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP, False)):
                # self.graphModel.hid1, self.graphModel.hid2 = [
                #     self.graphModel.init_hidden(self.graphModel.ldims) for _ in range(2)]
                # self.transitionModel.hid1, self.transitionModel.hid2 = [
                #     self.transitionModel.init_hidden(self.transitionModel.ldims) for _ in range(2)]

                sentence_g = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]
                sentence_t = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]
                sentence_t = sentence_t[1:] + [sentence_t[0]]
                sentences_g.append(sentence_g)
                sentences_t.append(sentence_t)

                # conll_sentence00 = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

                # conll_sentence0 = sentence.copy()
                # conll_sentence1 = sentence.copy()
                # conll_sentence1 = conll_sentence1[1:] + [conll_sentence1[0]]

                self.graphModel.predict(sentences_g)
                self.transitionModel.predict(sentences_t)

                sentence_t = [sentence_t[-1]] + sentence_t[:-1]
                # conll_sentence = [conll_sentence0, conll_sentence1]
                # rank = random.randint(0, 1)
                # input = torch.cat((self.graphModel.vec, self.transitionModel.vec), 1)
                # output = self.model.classifier(Variable(input))
                # _, rank = torch.max(torch.abs(output.data), 1)
                # rank = rank[0]
                # num_g += 1 - rank
                # num_t += rank
                sentences_g = []
                sentences_t = []
                yield sentence_t

        print("Graph-based:", num_g, "\nTransition-based:", num_t)
Example #27
    def Train(self, conll_path):
        mloss = 0.0
        errors = 0
        batch = 0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ltotal = 0
        ninf = -float('inf')

        hoffset = 1 if self.headFlag else 0

        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, True))
            random.shuffle(shuffledData)

            errs = []
            eeloss = 0.0

            self.Init()

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal), 'Time', time.time() - start
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
                self.getWordEmbeddings(conll_sentence, True)
                stack = ParseForest([])
                buf = ParseForest(conll_sentence)

                for root in conll_sentence:
                    root.lstms = [root.vec for _ in xrange(self.nnvecs)]

                hoffset = 1 if self.headFlag else 0

                while not (len(buf) == 1 and len(stack) == 0):
                    scores = self.__evaluate(stack, buf, True)
                    scores.append([(None, 3, ninf, None)])
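                    # pad the score lists with a dummy 4th transition at -inf so
                    # the cost tuple below lines up as (left, right, shift, pad)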

                    alpha = stack.roots[:-2] if len(stack) > 2 else []
                    s1 = [stack.roots[-2]] if len(stack) > 1 else []
                    s0 = [stack.roots[-1]] if len(stack) > 0 else []
                    b = [buf.roots[0]] if len(buf) > 0 else []
                    beta = buf.roots[1:] if len(buf) > 1 else []

                    left_cost = (len([h for h in s1 + beta if h.id == s0[0].parent_id]) +
                                 len([d for d in b + beta if d.parent_id == s0[0].id])) if len(scores[0]) > 0 else 1
                    right_cost = (len([h for h in b + beta if h.id == s0[0].parent_id]) +
                                  len([d for d in b + beta if d.parent_id == s0[0].id])) if len(scores[1]) > 0 else 1
                    shift_cost = (len([h for h in s1 + alpha if h.id == b[0].parent_id]) +
                                  len([d for d in s0 + s1 + alpha if d.parent_id == b[0].id])) if len(scores[2]) > 0 else 1
                    costs = (left_cost, right_cost, shift_cost, 1)

                    bestValid = max((s for s in chain(*scores)
                                     if costs[s[1]] == 0 and (s[1] == 2 or s[0] == stack.roots[-1].relation)),
                                    key=itemgetter(2))
                    bestWrong = max((s for s in chain(*scores)
                                     if costs[s[1]] != 0 or (s[1] != 2 and s[0] != stack.roots[-1].relation)),
                                    key=itemgetter(2))
                    best = bestValid if ((not self.oracle) or
                                         (bestValid[2] - bestWrong[2] > 1.0) or
                                         (bestValid[2] > bestWrong[2] and random.random() > 0.1)) else bestWrong

                    if best[1] == 2:
                        stack.roots.append(buf.roots[0])
                        del buf.roots[0]

                    elif best[1] == 0:
                        child = stack.roots.pop()
                        parent = buf.roots[0]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 0
                        if self.rlMostFlag:
                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    elif best[1] == 1:
                        child = stack.roots.pop()
                        parent = stack.roots[-1]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 1
                        if self.rlMostFlag:
                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    if bestValid[2] < bestWrong[2] + 1.0:
                        loss = bestWrong[3] - bestValid[3]
                        mloss += 1.0 + bestWrong[2] - bestValid[2]
                        eloss += 1.0 + bestWrong[2] - bestValid[2]
                        errs.append(loss)

                    if best[1] != 2 and (
                            child.pred_parent_id != child.parent_id
                            or child.pred_relation != child.relation):
                        lerrors += 1
                        if child.pred_parent_id != child.parent_id:
                            errors += 1
                            eerrors += 1

                    etotal += 1

                if len(errs) > 50:  # or True:
                    #eerrs = ((esum(errs)) * (1.0/(float(len(errs)))))
                    eerrs = esum(errs)
                    scalar_loss = eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []

                    renew_cg()
                    self.Init()

        if len(errs) > 0:
            eerrs = (esum(errs))  # * (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []

            renew_cg()

        # self.trainer.update_epoch()  # hanwj 6.20 . there is no any decay, so just remove it.
        # self.trainer.learning_rate /= (1 - rate_decay)
        print "Loss: ", mloss / iSentence
Example #28
# Assumed imports for this standalone snippet (not shown in the source):
import sys

import utils
from utils import read_conll


def non_proj_sent(sentence):
    id_head = {}
    for entry in sentence:
        id_head[entry.id] = entry.parent_id
    for k, v in id_head.items():
        if k < v:
            spann = range(k, v + 1)
            nodes = range(k + 1, v)
        else:
            spann = range(v, k + 1)
            nodes = range(v + 1, k)
        for node in nodes:
            if id_head[node] not in spann:
                return True
    return False
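# Illustration: with id_head = {1: 3, 2: 4, 3: 0, 4: 3} the sentence is
# non-projective, since node 2 lies inside the span of arc 1->3 but attaches
# outside it (to 4), so the two arcs cross.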


count = 0
count_nproj = 0
with open(sys.argv[1], 'r') as conllFP:
    Data = list(read_conll(conllFP, []))
    for sent in Data:
        count += 1
        conll_sent = [
            entry for entry in sent if isinstance(entry, utils.ConllEntry)
        ]
        if (non_proj_sent(conll_sent)):
            count_nproj += 1

print(format(float(count_nproj) / count * 100, '.2f') + "% of total sentences are non-projective")
Example #29
    def Train(self, conll_path):
        mloss = 0.0
        errors = 0
        batch = 0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ltotal = 0
        ninf = -float('inf')

        hoffset = 1 if self.headFlag else 0

        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, True))
            random.shuffle(shuffledData)

            errs = []
            eeloss = 0.0

            self.Init()

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal) , 'Time', time.time()-start
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                sentence = sentence[1:] + [sentence[0]]
                self.getWordEmbeddings(sentence, True)
                stack = ParseForest([])
                buf = ParseForest(sentence)

                for root in sentence:
                    root.lstms = [root.vec for _ in xrange(self.nnvecs)]

                hoffset = 1 if self.headFlag else 0

                while len(buf) > 0 or len(stack) > 1:
                    scores = self.__evaluate(stack, buf, True)
                    scores.append([(None, 3, ninf, None)])

                    alpha = stack.roots[:-2] if len(stack) > 2 else []
                    s1 = [stack.roots[-2]] if len(stack) > 1 else []
                    s0 = [stack.roots[-1]] if len(stack) > 0 else []
                    b = [buf.roots[0]] if len(buf) > 0 else []
                    beta = buf.roots[1:] if len(buf) > 1 else []

                    left_cost  = ( len([h for h in s1 + beta if h.id == s0[0].parent_id]) + 
                                   len([d for d in b + beta if d.parent_id == s0[0].id]) )  if len(scores[0]) > 0 else 1
                    right_cost = ( len([h for h in b + beta if h.id == s0[0].parent_id]) +
                                   len([d for d in b + beta if d.parent_id == s0[0].id]) )  if len(scores[1]) > 0 else 1
                    shift_cost = ( len([h for h in s1 + alpha if h.id == b[0].parent_id]) +
                                   len([d for d in s0 + s1 + alpha if d.parent_id == b[0].id]) )  if len(scores[2]) > 0 else 1
                    costs = (left_cost, right_cost, shift_cost, 1)

                    bestValid = max(( s for s in chain(*scores) if costs[s[1]] == 0 and ( s[1] == 2 or  s[0] == stack.roots[-1].relation ) ), key=itemgetter(2))
                    bestWrong = max(( s for s in chain(*scores) if costs[s[1]] != 0 or  ( s[1] != 2 and s[0] != stack.roots[-1].relation ) ), key=itemgetter(2))
                    best = bestValid if ( (not self.oracle) or (bestValid[2] - bestWrong[2] > 1.0) or (bestValid[2] > bestWrong[2] and random.random() > 0.1) ) else bestWrong

                    if best[1] == 2:
                        stack.roots.append(buf.roots[0])
                        del buf.roots[0]

                    elif best[1] == 0:
                        child = stack.roots.pop()
                        parent = buf.roots[0]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 0
                        if self.rlMostFlag:
                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    elif best[1] == 1:
                        child = stack.roots.pop()
                        parent = stack.roots[-1]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 1
                        if self.rlMostFlag:
                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    if bestValid[2] < bestWrong[2] + 1.0:
                        loss = bestWrong[3] - bestValid[3]
                        mloss += 1.0 + bestWrong[2] - bestValid[2]
                        eloss += 1.0 + bestWrong[2] - bestValid[2]
                        errs.append(loss)

                    if best[1] != 2 and (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation):
                        lerrors += 1
                        if child.pred_parent_id != child.parent_id:
                            errors += 1
                            eerrors += 1

                    etotal += 1

                if len(errs) > 50: # or True:
                    #eerrs = ((esum(errs)) * (1.0/(float(len(errs)))))
                    eerrs = esum(errs)
                    scalar_loss = eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []

                    renew_cg()
                    self.Init()

        if len(errs) > 0:
            eerrs = (esum(errs)) # * (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []

            renew_cg()

        self.trainer.update_epoch()
        print "Loss: ", mloss/iSentence
Example #30
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP)):
                self.hid2Layer = parameter(self.model["hidden2-layer"])
                self.hid2Bias = parameter(self.model["hidden2-bias"])

                self.hidLayerFOM = parameter(self.model["hidden-layer-fom"])
                self.hidLayerFOH = parameter(self.model["hidden-layer-foh"])
                self.hidBias = parameter(self.model["hidden-bias"])

                self.outLayer = parameter(self.model["output-layer"])

                if self.labelsFlag:
                    self.rhid2Layer = parameter(self.model["rhidden2-layer"])
                    self.rhid2Bias = parameter(self.model["rhidden2-bias"])

                    self.rhidLayerFOM = parameter(self.model["rhidden-layer-fom"])
                    self.rhidLayerFOH = parameter(self.model["rhidden-layer-foh"])
                    self.rhidBias = parameter(self.model["rhidden-bias"])

                    self.routLayer = parameter(self.model["routput-layer"])
                    self.routBias = parameter(self.model["routput-bias"])


                for entry in sentence:
                    wordvec = lookup(self.model["word-lookup"], int(self.vocab.get(entry.norm, 0))) if self.wdims > 0 else None
                    posvec = lookup(self.model["pos-lookup"], int(self.pos[entry.pos])) if self.pdims > 0 else None
                    evec = lookup(self.model["extrn-lookup"], int(self.vocab.get(entry.norm, 0))) if self.external_embedding is not None else None
                    entry.vec = concatenate(filter(None, [wordvec, posvec, evec]))

                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(sentence, reversed(sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(sentence, reversed(sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(sentence, True)
                heads = decoder.parse_proj(scores) 

                for entry, head in zip(sentence, heads):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'

                dump = False

                if self.labelsFlag:
                    for modifier, head in enumerate(heads[1:]):
                        scores, exprs = self.__evaluateLabel(sentence, head, modifier+1)
                        sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]]

                renew_cg()
                if not dump:
                    yield sentence
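# A tiny sketch of the zip(sentence, reversed(sentence)) idiom in the BiLSTM
# pass above: it walks the sequence forward and backward in one loop, so the
# i-th and (len-1-i)-th entries feed the two LSTMs in the same iteration.
words = ['a', 'b', 'c']
for fwd, bwd in zip(words, reversed(words)):
    print(fwd + ' ' + bwd)  # prints: a c / b b / c a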
Example #31
    def Train(self, conll_path):
        errors = 0
        batch = 0
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP))
            random.shuffle(shuffledData)

            errs = []
            lerrs = []
            eeloss = 0.0

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Time', time.time()-start
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]

                for entry in conll_sentence:
                    c = float(self.wordsCount.get(entry.norm, 0))
                    dropFlag = (random.random() < (c/(0.25+c)))
                    wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None
                    posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
                    evec = None

                    if self.external_embedding is not None:
                        evec = self.elookup[self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)) if (dropFlag or (random.random() < 0.5)) else 0]
                    entry.vec = concatenate(filter(None, [wordvec, posvec, evec]))

                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence, True)
                gold = [entry.parent_id for entry in conll_sentence]
                heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)

                if self.labelsFlag:
                    for modifier, head in enumerate(gold[1:]):
                        rscores, rexprs = self.__evaluateLabel(conll_sentence, head, modifier+1)
                        goldLabelInd = self.rels[conll_sentence[modifier+1].relation]
                        wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0]
                        if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                            lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd])

                e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
                eerrors += e
                if e > 0:
                    loss = [(exprs[h][i] - exprs[g][i]) for i, (h,g) in enumerate(zip(heads, gold)) if h != g] # * (1.0/float(e))
                    eloss += (e)
                    mloss += (e)
                    errs.extend(loss)

                etotal += len(conll_sentence)

                if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:  # batch size 1: fires after every sentence
                    eeloss = 0.0

                    if len(errs) > 0 or len(lerrs) > 0:
                        eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs))))
                        eerrs.scalar_value()
                        eerrs.backward()
                        self.trainer.update()
                        errs = []
                        lerrs = []

                    renew_cg()

        if len(errs) > 0:
            eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []
            eeloss = 0.0

            renew_cg()

        self.trainer.update_epoch()
        print "Loss: ", mloss/iSentence
Example #32
    def Train(self, conll_path):
        mloss = 0.0
        errors = 0
        batch = 0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ltotal = 0
        ninf = -float('inf')

        hoffset = 1 if self.headFlag else 0

        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, True))
            random.shuffle(shuffledData)

            errs = []
            eeloss = 0.0

            self.Init()

            for iSentence, sentence in enumerate(shuffledData):
                # Print progress information every 100 sentences
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (
                        float(eerrors)) / etotal, 'Labeled Errors:', (
                            float(lerrors) / etotal), 'Time', time.time() - start
                    # logger.debug('Sentence %s, Loss: %s, Errors: %s, Labeled Errors: %s, Time: %s', iSentence, eloss / etotal, float(eerrors) / etotal, float(lerrors) / etotal, time.time() - start)
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
                self.getWordEmbeddings(conll_sentence, True)
                # initialize the stack as empty
                stack = ParseForest([])
                # put the whole sentence into the buffer
                buf = ParseForest(conll_sentence)

                for root in conll_sentence:
                    # each word's LSTM input is self.nnvecs input vectors concatenated
                    root.lstms = [root.vec for _ in xrange(self.nnvecs)]

                hoffset = 1 if self.headFlag else 0

                while not (len(buf) == 1 and len(stack) == 0):
                    scores = self.__evaluate(stack, buf, True)
                    scores.append([(None, 3, ninf, None)])

                    # alpha: the rest of the stack below the top two elements
                    alpha = stack.roots[:-2] if len(stack) > 2 else []
                    # s1: the second element from the top of the stack
                    s1 = [stack.roots[-2]] if len(stack) > 1 else []
                    # s0: the element on top of the stack
                    s0 = [stack.roots[-1]] if len(stack) > 0 else []
                    # b: the first element in the buffer
                    b = [buf.roots[0]] if len(buf) > 0 else []
                    # beta: the rest of the buffer
                    beta = buf.roots[1:] if len(buf) > 1 else []

                    left_cost = (
                        len([h
                             for h in s1 + beta if h.id == s0[0].parent_id]) +
                        len([d for d in b + beta if d.parent_id == s0[0].id])
                    ) if len(scores[0]) > 0 else 1
                    right_cost = (
                        len([h for h in b + beta if h.id == s0[0].parent_id]) +
                        len([d for d in b + beta if d.parent_id == s0[0].id])
                    ) if len(scores[1]) > 0 else 1
                    shift_cost = (
                        len([h
                             for h in s1 + alpha if h.id == b[0].parent_id]) +
                        len([
                            d
                            for d in s0 + s1 + alpha if d.parent_id == b[0].id
                        ])) if len(scores[2]) > 0 else 1
                    costs = (left_cost, right_cost, shift_cost, 1)

                    bestValid = max(
                        (s for s in chain(*scores) if costs[s[1]] == 0 and (
                            s[1] == 2 or s[0] == stack.roots[-1].relation)),
                        key=itemgetter(2))
                    bestWrong = max(
                        (s for s in chain(*scores) if costs[s[1]] != 0 or (
                            s[1] != 2 and s[0] != stack.roots[-1].relation)),
                        key=itemgetter(2))
                    best = bestValid if (
                        (not self.oracle) or
                        (bestValid[2] - bestWrong[2] > 1.0) or
                        (bestValid[2] > bestWrong[2]
                         and random.random() > 0.1)) else bestWrong

                    # SHIFT: no relation is assigned
                    if best[1] == 2:
                        stack.roots.append(buf.roots[0])
                        del buf.roots[0]

                    # LEFT-ARC: the head is b0
                    elif best[1] == 0:
                        child = stack.roots.pop()
                        # the head word is b0
                        parent = buf.roots[0]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 0
                        if self.rlMostFlag:
                            parent.lstms[bestOp +
                                         hoffset] = child.lstms[bestOp +
                                                                hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    # RIGHT-ARC: the head is s0
                    elif best[1] == 1:
                        child = stack.roots.pop()
                        # the head word is s0
                        parent = stack.roots[-1]

                        child.pred_parent_id = parent.id
                        child.pred_relation = best[0]

                        bestOp = 1
                        if self.rlMostFlag:
                            parent.lstms[bestOp +
                                         hoffset] = child.lstms[bestOp +
                                                                hoffset]
                        if self.rlFlag:
                            parent.lstms[bestOp + hoffset] = child.vec

                    if bestValid[2] < bestWrong[2] + 1.0:
                        # loss term for this margin violation
                        loss = bestWrong[3] - bestValid[3]
                        mloss += 1.0 + bestWrong[2] - bestValid[2]
                        eloss += 1.0 + bestWrong[2] - bestValid[2]
                        errs.append(loss)

                    if best[1] != 2 and (
                            child.pred_parent_id != child.parent_id
                            or child.pred_relation != child.relation):
                        # head id or relation predicted wrongly: labeled error
                        lerrors += 1
                        if child.pred_parent_id != child.parent_id:
                            # head id predicted wrongly: unlabeled error
                            errors += 1
                            eerrors += 1

                    etotal += 1

                if len(errs) > 50:  # or True:
                    #eerrs = ((esum(errs)) * (1.0/(float(len(errs)))))
                    eerrs = esum(errs)
                    scalar_loss = eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []

                    renew_cg()
                    self.Init()

        if len(errs) > 0:
            eerrs = (esum(errs))  # * (1.0/(float(len(errs))))
            eerrs.scalar_value()
            # backpropagate gradients from the loss
            eerrs.backward()
            # update the parameters
            self.trainer.update()

            errs = []
            lerrs = []

            renew_cg()

        self.trainer.update_epoch()
        print "Loss: ", mloss / iSentence
Example #33
    def Predict(self, conll_path):
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence in enumerate(read_conll(conllFP)):
                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                for entry in conll_sentence:
                    wordvec = self.wlookup[int(self.vocab.get(
                        entry.norm, 0))] if self.wdims > 0 else None
                    posvec = self.plookup[int(
                        self.pos[entry.pos])] if self.pdims > 0 else None
                    evec = self.elookup[int(
                        self.extrnd.get(entry.form,
                                        self.extrnd.get(entry.norm, 0))
                    )] if self.external_embedding is not None else None
                    entry.vec = concatenate(
                        filter(None, [wordvec, posvec, evec]))

                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence,
                                             reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(
                                rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence, True)
                heads = decoder.parse_proj(scores)

                for entry, head in zip(conll_sentence, heads):
                    entry.pred_parent_id = head
                    entry.pred_relation = '_'

                dump = False

                if self.labelsFlag:
                    for modifier, head in enumerate(heads[1:]):
                        scores, exprs = self.__evaluateLabel(
                            conll_sentence, head, modifier + 1)
                        conll_sentence[modifier +
                                       1].pred_relation = self.irels[max(
                                           enumerate(scores),
                                           key=itemgetter(1))[0]]

                renew_cg()
                if not dump:
                    yield sentence
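# A tiny sketch of the argmax idiom used above to pick the dependency label:
# max over (index, score) pairs keyed on the score, keeping the index.
from operator import itemgetter

label_scores = [0.1, 0.7, 0.2]
print(max(enumerate(label_scores), key=itemgetter(1))[0])  # 1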
Example #34
    def Train(self, conll_path):
        mloss = 0.0
        errors = 0
        batch = 0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ltotal = 0

        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, True))
            random.shuffle(shuffledData)

            errs = []
            eeloss = 0.0

            self.Init()

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (
                        float(eerrors)) / etotal, 'Labeled Errors:', (
                            float(lerrors) /
                            etotal), 'Time', time.time() - start
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                forest = ParseForest(sentence)
                self.getWordEmbeddings(forest, True)

                for root in forest.roots:
                    root.lstms = [
                        self.builders[0].initial_state().add_input(root.vec),
                        self.builders[1].initial_state().add_input(root.vec)
                    ]

                unassigned = {
                    entry.id: sum([
                        1 for pentry in sentence
                        if pentry.parent_id == entry.id
                    ])
                    for entry in sentence
                }

                while len(forest.roots) > 1:
                    self.__evaluate(forest, True)
                    bestValidOp, bestValidScore = None, float("-inf")
                    bestWrongOp, bestWrongScore = None, float("-inf")

                    bestValidParent, bestValidChild = None, None
                    bestValidIndex, bestWrongIndex = None, None
                    roots = forest.roots

                    rootsIds = set([root.id for root in roots])

                    for i in xrange(len(forest.roots) - 1):
                        for irel, rel in enumerate(self.irels):
                            for op in xrange(2):
                                child = i + (1 - op)
                                parent = i + op

                                oracleCost = unassigned[roots[child].id] + (
                                    0 if roots[child].parent_id not in rootsIds
                                    or roots[child].parent_id
                                    == roots[parent].id else 1)

                                if oracleCost == 0 and (
                                        roots[child].parent_id !=
                                        roots[parent].id
                                        or roots[child].relation == rel):
                                    if bestValidScore < forest.roots[i].scores[
                                            irel][op]:
                                        bestValidScore = forest.roots[
                                            i].scores[irel][op]
                                        bestValidOp = op
                                        bestValidParent, bestValidChild = parent, child
                                        bestValidIndex = i
                                        bestValidIRel, bestValidRel = irel, rel
                                        bestValidExpr = roots[
                                            bestValidIndex].exprs[
                                                bestValidIRel][bestValidOp]
                                elif bestWrongScore < forest.roots[i].scores[
                                        irel][op]:
                                    bestWrongScore = forest.roots[i].scores[
                                        irel][op]
                                    bestWrongParent, bestWrongChild = parent, child
                                    bestWrongOp = op
                                    bestWrongIndex = i
                                    bestWrongIRel, bestWrongRel = irel, rel
                                    bestWrongExpr = roots[
                                        bestWrongIndex].exprs[bestWrongIRel][
                                            bestWrongOp]

                    if bestValidScore < bestWrongScore + 1.0:
                        loss = bestWrongExpr - bestValidExpr
                        mloss += 1.0 + bestWrongScore - bestValidScore
                        eloss += 1.0 + bestWrongScore - bestValidScore
                        errs.append(loss)

                    if not self.oracle or bestValidScore - bestWrongScore > 1.0 or (
                            bestValidScore > bestWrongScore
                            and random.random() > 0.1):
                        selectedOp = bestValidOp
                        selectedParent = bestValidParent
                        selectedChild = bestValidChild
                        selectedIndex = bestValidIndex
                        selectedIRel, selectedRel = bestValidIRel, bestValidRel
                    else:
                        selectedOp = bestWrongOp
                        selectedParent = bestWrongParent
                        selectedChild = bestWrongChild
                        selectedIndex = bestWrongIndex
                        selectedIRel, selectedRel = bestWrongIRel, bestWrongRel

                    if roots[selectedChild].parent_id != roots[
                            selectedParent].id or selectedRel != roots[
                                selectedChild].relation:
                        lerrors += 1
                        if roots[selectedChild].parent_id != roots[
                                selectedParent].id:
                            errors += 1
                            eerrors += 1

                    etotal += 1

                    for j in xrange(
                            max(0, selectedIndex - self.k - 1),
                            min(len(forest.roots),
                                selectedIndex + self.k + 2)):
                        roots[j].scores = None

                    unassigned[roots[selectedChild].parent_id] -= 1

                    roots[selectedParent].lstms[selectedOp] = roots[
                        selectedParent].lstms[selectedOp].add_input(
                            self.activation(self.lstm2lstm * noise(
                                concatenate([
                                    roots[selectedChild].lstms[0].output(),
                                    lookup(self.model["rels-lookup"],
                                           selectedIRel),
                                    roots[selectedChild].lstms[1].output()
                                ]), 0.0) + self.lstm2lstmbias))

                    forest.Attach(selectedParent, selectedChild)

                if len(errs) > 50:
                    eerrs = ((esum(errs)) * (1.0 / (float(len(errs)))))
                    scalar_loss = eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []

                    renew_cg()
                    self.Init()

        if len(errs) > 0:
            eerrs = (esum(errs)) * (1.0 / (float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []

            renew_cg()

        self.trainer.update_epoch()
        print "Loss: ", mloss / iSentence
Example #35
    def Train(self, conll_path):
        errors = 0
        batch = 0
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP))
            random.shuffle(shuffledData)

            errs = []
            lerrs = []
            eeloss = 0.0

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (
                        float(eerrors)) / etotal, 'Time', time.time() - start
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                conll_sentence = [
                    entry for entry in sentence
                    if isinstance(entry, utils.ConllEntry)
                ]

                for entry in conll_sentence:
                    c = float(self.wordsCount.get(entry.norm, 0))
                    dropFlag = (random.random() < (c / (0.25 + c)))
                    wordvec = self.wlookup[
                        int(self.vocab.get(entry.norm, 0)
                            ) if dropFlag else 0] if self.wdims > 0 else None
                    posvec = self.plookup[int(
                        self.pos[entry.pos])] if self.pdims > 0 else None
                    evec = None

                    if self.external_embedding is not None:
                        evec = self.elookup[self.extrnd.get(
                            entry.form, self.extrnd.get(entry.norm, 0)) if
                                            (dropFlag or
                                             (random.random() < 0.5)) else 0]
                    entry.vec = concatenate(
                        filter(None, [wordvec, posvec, evec]))

                    entry.lstms = [entry.vec, entry.vec]
                    entry.headfov = None
                    entry.modfov = None

                    entry.rheadfov = None
                    entry.rmodfov = None

                if self.blstmFlag:
                    lstm_forward = self.builders[0].initial_state()
                    lstm_backward = self.builders[1].initial_state()

                    for entry, rentry in zip(conll_sentence,
                                             reversed(conll_sentence)):
                        lstm_forward = lstm_forward.add_input(entry.vec)
                        lstm_backward = lstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = lstm_forward.output()
                        rentry.lstms[0] = lstm_backward.output()

                    if self.bibiFlag:
                        for entry in conll_sentence:
                            entry.vec = concatenate(entry.lstms)

                        blstm_forward = self.bbuilders[0].initial_state()
                        blstm_backward = self.bbuilders[1].initial_state()

                        for entry, rentry in zip(conll_sentence,
                                                 reversed(conll_sentence)):
                            blstm_forward = blstm_forward.add_input(entry.vec)
                            blstm_backward = blstm_backward.add_input(
                                rentry.vec)

                            entry.lstms[1] = blstm_forward.output()
                            rentry.lstms[0] = blstm_backward.output()

                scores, exprs = self.__evaluate(conll_sentence, True)
                gold = [entry.parent_id for entry in conll_sentence]
                heads = decoder.parse_proj(scores,
                                           gold if self.costaugFlag else None)

                if self.labelsFlag:
                    for modifier, head in enumerate(gold[1:]):
                        rscores, rexprs = self.__evaluateLabel(
                            conll_sentence, head, modifier + 1)
                        goldLabelInd = self.rels[conll_sentence[modifier +
                                                                1].relation]
                        wrongLabelInd = max(((l, scr)
                                             for l, scr in enumerate(rscores)
                                             if l != goldLabelInd),
                                            key=itemgetter(1))[0]
                        if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                            lerrs.append(rexprs[wrongLabelInd] -
                                         rexprs[goldLabelInd])

                e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
                eerrors += e
                if e > 0:
                    loss = [(exprs[h][i] - exprs[g][i])
                            for i, (h, g) in enumerate(zip(heads, gold))
                            if h != g]  # * (1.0/float(e))
                    eloss += (e)
                    mloss += (e)
                    errs.extend(loss)

                etotal += len(conll_sentence)

                if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:  # batch size 1: fires after every sentence
                    eeloss = 0.0

                    if len(errs) > 0 or len(lerrs) > 0:
                        eerrs = (esum(errs + lerrs)
                                 )  #* (1.0/(float(len(errs))))
                        eerrs.scalar_value()
                        eerrs.backward()
                        self.trainer.update()
                        errs = []
                        lerrs = []

                    renew_cg()

        if len(errs) > 0:
            eerrs = (esum(errs + lerrs))  #* (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []
            eeloss = 0.0

            renew_cg()

        self.trainer.update_epoch()
        print "Loss: ", mloss / iSentence