def createDebugData(self,treebank,options):
    ext = '.conllu' if options.conllu else '.conll'
    print 'Creating smaller data sets for debugging'
    if not options.predict:
        train_data = list(utils.read_conll(treebank.trainfile, maxSize=options.debug_train_sents, hard_lim=True))
        train_file = os.path.join(treebank.outdir, 'train-debug' + ext)  # location for the new train file
        utils.write_conll(train_file, train_data)  # write the new train data to file
        treebank.trainfile = train_file
        if treebank.devfile and os.path.exists(treebank.devfile) and options.pred_dev:
            dev_data = list(utils.read_conll(treebank.devfile, maxSize=options.debug_dev_sents, hard_lim=True))
            dev_file = os.path.join(treebank.outdir, 'dev-debug' + ext)  # location for the new dev file
            utils.write_conll(dev_file, dev_data)  # write the new dev data to file
            # have to create a separate debug gold file if not the same as input file
            if treebank.dev_gold != treebank.devfile:
                dev_gold_data = list(utils.read_conll(treebank.dev_gold, maxSize=options.debug_dev_sents, hard_lim=True))
                dev_gold_file = os.path.join(treebank.outdir, 'dev-gold-debug' + ext)  # location for the new dev gold file
                utils.write_conll(dev_gold_file, dev_gold_data)  # write the new dev gold data to file
                treebank.dev_gold = dev_gold_file
            else:
                treebank.dev_gold = dev_file
            treebank.devfile = dev_file  # important to do this last
    else:
        test_data = list(utils.read_conll(treebank.testfile, maxSize=options.debug_test_sents, hard_lim=True))
        test_file = os.path.join(treebank.outdir, 'test-debug' + ext)  # location for the new test file
        utils.write_conll(test_file, test_data)  # write the new test data to file
        if treebank.test_gold != treebank.testfile:
            test_gold_data = list(utils.read_conll(treebank.test_gold, maxSize=options.debug_test_sents, hard_lim=True))
            test_gold_file = os.path.join(treebank.outdir, 'test-gold-debug' + ext)  # location for the new test gold file
            utils.write_conll(test_gold_file, test_gold_data)  # write the new test gold data to file
            treebank.test_gold = test_gold_file
        else:
            treebank.test_gold = test_file
        treebank.testfile = test_file
def readdata(src_corpus_name, tgt_corpus_name, shuffle=True, seed=34):
    """
    Read in src and tgt data, and only shuffle the target data.
    """
    src_reader = utils.read_conll(src_corpus_name)
    tgt_reader = utils.read_conll(tgt_corpus_name)
    src_data, tgt_data = list(src_reader), list(tgt_reader)
    if shuffle:
        # Only shuffle tgt.
        tgt_data = shuffle_data(tgt_data, seed)
    return src_data, tgt_data
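# `shuffle_data` is called by readdata above but not shown in this snippet. A minimal
# sketch of the behaviour it is assumed to have (a seeded shuffle that does not mutate
# the input list) could look like this -- an assumption, not the original implementation:
import random

def shuffle_data(data, seed):
    """Return a shuffled copy of `data`, deterministic for a given seed."""
    rng = random.Random(seed)
    shuffled = list(data)
    rng.shuffle(shuffled)
    return shuffled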
def write_new_split(corpus_name, test_size, filedir, filename, seed = 42, max_count = 2):
    """
    Do stratified random sampling for the corpus given by corpus_name, and save the
    resulting train/test split under directory filedir using the prefix filename.
    Parameter test_size is the number of sentences to put in the test set. For the
    parameter max_count, see the documentation of function stratified_split. For now,
    this only supports stratified_split at the sentence level.

    >>> TRAIN, TEST = write_new_split('CADEC', 1000, filedir, 'cadec', max_count = 2)
    >>> TRAIN, TEST = write_new_split('re3d', 200, filedir, 're3d', max_count = 2)
    >>> TRAIN, TEST = write_new_split('GUM', 1000, filedir, 'gum', max_count = 2)
    >>> TRAIN, TEST = write_new_split('MUC6', 1000, filedir, 'muc6', max_count = 2)
    >>> TRAIN, TEST = write_new_split('NIST_IEER99', 690, filedir, 'nist', max_count = 2)
    >>> TRAIN, TEST = write_new_split('BBN', 10000, filedir, 'bbn', max_count = 2)
    >>> TRAIN, TEST = write_new_split('GMB1', 1000, filedir, 'gmb1', max_count = 2)
    """
    r = utils.read_conll(corpus_name)
    sentences = list(r)
    train_data, test_data = stratified_split(sentences, test_size, seed = seed, max_count = max_count)
    writefile(train_data, os.path.join(filedir, 'train'), filename + '-train.conll')
    writefile(test_data, os.path.join(filedir, 'test'), filename + '-test.conll')
    return train_data, test_data
def Predict(self, conll_path): with open(conll_path, 'r') as conllFP: for iSentence, sentence in enumerate(read_conll(conllFP, False)): self.Init() conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] conll_sentence = conll_sentence[1:] + [conll_sentence[0]] self.getWordEmbeddings(conll_sentence, False) stack = ParseForest([]) buf = ParseForest(conll_sentence) for root in conll_sentence: root.lstms = [root.vec for _ in xrange(self.nnvecs)] hoffset = 1 if self.headFlag else 0 while not (len(buf) == 1 and len(stack) == 0): scores = self.__evaluate(stack, buf, False) best = max(chain(*scores), key=itemgetter(2)) if best[1] == 2: stack.roots.append(buf.roots[0]) del buf.roots[0] elif best[1] == 0: child = stack.roots.pop() parent = buf.roots[0] child.pred_parent_id = parent.id child.pred_relation = best[0] bestOp = 0 if self.rlMostFlag: parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset] if self.rlFlag: parent.lstms[bestOp + hoffset] = child.vec elif best[1] == 1: child = stack.roots.pop() parent = stack.roots[-1] child.pred_parent_id = parent.id child.pred_relation = best[0] bestOp = 1 if self.rlMostFlag: parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset] if self.rlFlag: parent.lstms[bestOp + hoffset] = child.vec renew_cg() yield sentence
def train(args):
    with tf.Graph().as_default(), create_session(args.use_xla) as session:
        vocab, tags, relations = utils.extract_vocab(args.train_file)
        with open(args.train_file) as f:
            sentences, trees = utils.read_conll(f, vocab, tags, relations, True)
        with tf.device(device_placement(args)):
            m = model.Model(args.embedding_size, args.hidden_layer_size, vocab, tags, relations,
                            session, activation=args.activation, l2_weight=args.l2,
                            learning_rate=args.learning_rate)
            init = tf.global_variables_initializer()
            session.run(init)
            m.train(trees, batch_size=args.batch_size, epochs=args.epochs,
                    dropout_keep_prob=args.dropout_keep_prob)
            m.save_to(args.save_to)
def prepareDev(self,treebank,options):
    treebank.pred_dev = options.pred_dev  # even if options.pred_dev is True, might change treebank.pred_dev to False later if no dev data available
    if not treebank.devfile or not os.path.exists(treebank.devfile):
        if options.create_dev:  # create some dev data from the training data
            train_data = list(utils.read_conll(treebank.trainfile))
            tot_sen = len(train_data)
            if tot_sen > options.min_train_sents:  # need to have at least min_train_sents to move forward
                dev_file = os.path.join(treebank.outdir, 'dev-split' + '.conllu')  # location for the new dev file
                train_file = os.path.join(treebank.outdir, 'train-split' + '.conllu')  # location for the new train file
                dev_len = int(0.01 * options.dev_percent * tot_sen)
                print ("Taking " + str(dev_len) + " of " + str(tot_sen)
                       + " sentences from training data as new dev data for " + treebank.name)
                random.shuffle(train_data)
                dev_data = train_data[:dev_len]
                utils.write_conll(dev_file, dev_data)  # write the new dev data to file
                train_data = train_data[dev_len:]  # put the rest of the training data in a new file too
                utils.write_conll(train_file, train_data)
                # update some variables with the new file locations
                treebank.dev_gold = dev_file
                treebank.devfile = dev_file
                treebank.trainfile = train_file
            else:  # not enough sentences
                print ("Warning: not enough sentences in training data to create dev set for "
                       + treebank.name + " (minimum required --min-train-size: " + str(options.min_train_sents) + ")")
                treebank.pred_dev = False
        else:  # option --create-dev not set
            print ("Warning: No dev data for " + treebank.name
                   + ", consider adding option --create-dev to create dev data from training set")
            treebank.pred_dev = False
    if options.model_selection and not treebank.pred_dev:
        print ("Warning: can't do model selection for " + treebank.name + " as prediction on dev data is off")
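# Worked instance of the split arithmetic in prepareDev above (the 5% figure is only an
# assumed value of --dev-percent): with 10,000 training sentences,
#   dev_len = int(0.01 * 5 * 10000) = 500
# so 500 shuffled sentences go to dev-split.conllu and the remaining 9,500 to train-split.conllu.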
def Predict(self, conll_path): with open(conll_path, 'r') as conllFP: for iSentence, sentence in enumerate(read_conll(conllFP)): conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)] for entry in conll_sentence: wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None evec = self.elookup[int(self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)))] if self.external_embedding is not None else None entry.vec = concatenate(filter(None, [wordvec, posvec, evec])) entry.lstms = [entry.vec, entry.vec] entry.headfov = None entry.modfov = None entry.rheadfov = None entry.rmodfov = None if self.blstmFlag: lstm_forward = self.builders[0].initial_state() lstm_backward = self.builders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): lstm_forward = lstm_forward.add_input(entry.vec) lstm_backward = lstm_backward.add_input(rentry.vec) entry.lstms[1] = lstm_forward.output() rentry.lstms[0] = lstm_backward.output() if self.bibiFlag: for entry in conll_sentence: entry.vec = concatenate(entry.lstms) blstm_forward = self.bbuilders[0].initial_state() blstm_backward = self.bbuilders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): blstm_forward = blstm_forward.add_input(entry.vec) blstm_backward = blstm_backward.add_input(rentry.vec) entry.lstms[1] = blstm_forward.output() rentry.lstms[0] = blstm_backward.output() scores, exprs = self.__evaluate(conll_sentence, True) heads = decoder.parse_proj(scores) for entry, head in zip(conll_sentence, heads): entry.pred_parent_id = head entry.pred_relation = '_' dump = False if self.labelsFlag: for modifier, head in enumerate(heads[1:]): scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier+1) conll_sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]] renew_cg() if not dump: yield sentence
def ensemble(files, outfile):
    """
    Take a list of CoNLL-U prediction files (one per parser), vote over head attachments,
    and write the ensemble parse to outfile using a non-projective decoder.
    """
    conllu_files = []
    for f in files:
        cf = utils.read_conll(f)
        conllu_files.append(cf)

    zipped_sentences = zip(*conllu_files)
    decoder = DependencyDecoder()
    sentences_out = []
    for zipped_sentence in zipped_sentences:
        conll_sentence = [entry for entry in zipped_sentence[0] if isinstance(entry, utils.ConllEntry)]
        n_words = len(conll_sentence)
        m = np.zeros((n_words, n_words))
        for i_sentence in zipped_sentence:
            conll_sen = [entry for entry in i_sentence if isinstance(entry, utils.ConllEntry)]
            for item in conll_sen:
                head = item.parent_id
                dep = item.id
                m[head, dep] += 1

        # NOTE: this takes the label of the first!
        heads = decoder.parse_nonproj(m)
        for entry in zipped_sentence[0]:
            if isinstance(entry, utils.ConllEntry):
                entry.pred_parent_id = heads[entry.id]
        sentences_out.append(zipped_sentence[0])

    utils.write_conll(outfile, sentences_out)
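# Minimal usage sketch for ensemble(); the file names below are hypothetical placeholders
# for the per-parser CoNLL-U outputs being combined:
predictions = ['model1.conllu', 'model2.conllu', 'model3.conllu']
ensemble(predictions, 'ensemble.conllu')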
def build_dataset(self, filename, skip_mwt):
    """Read an input CoNLL-U file and return the annotated sentences (lists of ConlluToken
    objects, one per token), the vocabulary built from them, and the file's comment lines."""
    print("Building dataset using {}".format(filename))
    print("Skipping MWTs: {}".format(skip_mwt))
    annotated_sentences, comment_lines = read_conll(filename, skip_mwt)
    vocab = buildVocab(annotated_sentences, cutoff=1)
    return annotated_sentences, vocab, comment_lines
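# Hypothetical call sketch: `loader` stands in for an instance of the surrounding class,
# and the file name is only illustrative.
sentences, vocab, comments = loader.build_dataset('en_ewt-ud-train.conllu', skip_mwt=True)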
def get_embeddings():
    """
    Obtain and trim the word embeddings in both the source and target datasets.
    This will try to use the vocabularies of: 'GUM', 're3d', 'BBN', 'i2b2-14', 'i2b2-06',
    'CADEC', 'TwitterRitter', 'MITRestaurantCorpus', 'MITMovieCorpus-trivia10k13', 'MUC6',
    'NIST_IEER99', 'GMB1', as well as CONLL 2003. If a dataset is not found, it is skipped over.
    """
    embeddingsPath = 'word_embeddings/glove.6B.' + WVDIM + 'd.txt.gz'
    print("Getting vocab from various datasets...")
    dnames = ['GUM', 're3d', 'BBN', 'i2b2-14', 'i2b2-06', 'CADEC', 'TwitterRitter',
              'MITRestaurantCorpus', 'MITMovieCorpus-trivia10k13', 'MUC6', 'NIST_IEER99', 'GMB1']
    try:
        conll03 = list(utils.read_conll('CONLL03'))
    except:
        raise ValueError("Could not find CONLL 2003 dataset.")
    aggregation = []
    for dname in dnames:
        try:
            dataset = list(utils.read_conll(dname))
            aggregation.extend(dataset)
        except:
            print(dname + " could not be found.")
    aggregation = conll03 + aggregation
    words = lc.get_word2idx2(aggregation)
    max_len = lc.get_maxlen(aggregation)
    # NOTE: max_len was 253 for our experiments.
    print("Getting word embeddings...")
    we, w2i = embedding_utils.get_word_embeddings(embeddingsPath, words)
    return max_len, we, w2i, words
def train(self, conll_path): print('pytorch version:', torch.__version__) batch = 1 eloss = 0.0 mloss = 0.0 eerrors = 0 etotal = 0 iSentence = 0 start = time.time() with open(conll_path, 'r') as conllFP: shuffledData = list(read_conll(conllFP)) random.shuffle(shuffledData) for iSentence, sentence in enumerate(shuffledData): # print("Initializing hidden and cell states values to 0") self.model.hid_for_1, self.model.hid_back_1, self.model.hid_for_2, self.model.hid_back_2 = [ self.model.init_hidden(self.model.ldims) for _ in range(4) ] # if iSentence == 0: # print('hidLayerFOM values on first iteration within an epoch') # print(self.model.hidLayerFOM) if iSentence % 100 == 0 and iSentence != 0: print('Processing sentence number:', iSentence, 'eloss:', eloss, 'etotal:', etotal, 'Loss:', eloss / etotal, 'eerrors:', float(eerrors), 'Errors:', (float(eerrors)) / etotal, 'Time', time.time() - start) start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 # print('hidLayerFOM values:') # print(self.model.hidLayerFOM) conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] e_output, errs, lerrs = self.model.forward(conll_sentence) eerrors += e_output eloss += e_output mloss += e_output etotal += len(sentence) if iSentence % batch == 0 or len(errs) > 0 or len(lerrs) > 0: if len(errs) > 0 or len(lerrs) > 0: reshaped_lerrs = [item.reshape(1) for item in lerrs] l_variable = errs + reshaped_lerrs eerrs_sum = torch.sum(concatenate_tensors( l_variable)) # This result is a 1d-tensor eerrs_sum.backward( ) # automatically calculates gradient (backpropagation) # self.print_model_parameters() self.trainer.step( ) # optimizer.step to update weights(to see uncomment print_model_parameters) # self.print_model_parameters() self.trainer.zero_grad() print("Loss: ", mloss / iSentence)
def main():
    MSTParserLSTM.rnn_mlp = rnn_mlp
    MSTParserLSTM.bilinear = bilinear
    MSTParserLSTM.build_graph = build_graph

    parser = OptionParser()
    parser.add_option("--extrn", dest="external_embedding", help="External embeddings", metavar="FILE",
                      default="../data/sskip.100.vectors.gz")
    parser.add_option("--params", dest="params", help="Parameters file", metavar="FILE",
                      default="../models/params.pickle")
    parser.add_option("--model", dest="model", help="Load/Save model file", metavar="FILE",
                      default="../models/model-135")
    parser.add_option("--test", dest="conll_test", help="Annotated CONLL test file", metavar="FILE", default=None)
    parser.add_option("--repl-words", dest="repl_words", help="Words at probing position", metavar="FILE",
                      default="../probe/repl.words")
    parser.add_option("--output-dir", dest="output_dir", help="Output directory", default="../probe/1")
    parser.add_option("--probe-index", type="int", dest="probe_idx", default=0)
    (options, args) = parser.parse_args()

    with open(options.params, 'r') as paramsfp:
        w2i, pos, rels, chars, stored_opt = pkl.load(paramsfp)
    stored_opt.external_embedding = options.external_embedding

    mstParser = MSTParserLSTM(pos, rels, w2i, chars, stored_opt)
    mstParser.Load(options.model)

    probe_buckets = [list()]
    probe_data = list(utils.read_conll(open(options.conll_test, 'r')))
    for d in probe_data:
        probe_buckets[0].append(d)

    probe_result = probe(mstParser, probe_buckets, options.probe_idx)
    repl_words = [word.strip() for word in open(options.repl_words, 'r').readlines() if word.strip() != '']
    write_probes(repl_words, probe_result, options.output_dir)
def predict(self, conll_path):
    with open(conll_path, 'r') as conllFP:
        for iSentence, sentence in enumerate(read_conll(conllFP)):
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            self.model.predict(conll_sentence)
            yield conll_sentence
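# Since predict() is a generator, a caller has to drain it. A minimal driver sketch
# (the `parser` instance, file names, and the utils.write_conll call mirror other
# snippets in this collection and are assumptions, not part of the original code):
predicted = list(parser.predict('dev.conll'))
utils.write_conll('dev-predicted.conll', predicted)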
def predict(self, conll_path):
    self.model.init()
    with open(conll_path, 'r', encoding='UTF-8') as conllFP:
        for iSentence, sentence in enumerate(read_conll(conllFP, proj=False)):
            self.model.hid_for_1, self.model.hid_back_1, self.model.hid_for_2, self.model.hid_back_2 = \
                [self.model.init_hidden(self.model.ldims) for _ in range(4)]
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
            self.model.predict(conll_sentence)
            self.trainer.zero_grad()
            yield sentence
def main(): config = utils.Config() filenames = os.listdir( os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt", "preprocessed")) filenames = [n for n in filenames if n.endswith(".paragraph.boundaries")] filenames = [ n.replace(".paragraph.boundaries", ".edus") for n in filenames ] filenames.sort() for filename in filenames: # Path path_edus = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt", "tmp.preprocessing", filename + ".tokenized") path_conll = os.path.join( config.getpath("data"), "ptbwsj_wo_rstdt", "tmp.preprocessing", filename.replace(".edus", ".sentences.conll")) path_out = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt", "preprocessed", filename + ".postags") # Read edus = utils.read_lines( path_edus, process=lambda line: line.split()) # list of list of str tokens_e = utils.flatten_lists(edus) # list of str sentences = utils.read_conll( path_conll, keys=["ID", "FORM", "LEMMA", "POSTAG", "_1", "HEAD", "DEPREL"]) # list of list of {str: str} conll_lines = utils.flatten_lists(sentences) # list of {str: str} tokens_s = [conll_line["FORM"] for conll_line in conll_lines] # list of str postags_s = [conll_line["POSTAG"] for conll_line in conll_lines] # list of str # Check whether the number of tokens and that of postags are equivalent for token_e, token_s, postag_s in zip(tokens_e, tokens_s, postags_s): if token_e != token_s: raise ValueError("Error! %s != %s" % (token_e, token_s)) # Create the POSTAG-version of EDUs postag_i = 0 edus_postag = [] for edu in edus: edu_postag = [postags_s[postag_i + i] for i in range(len(edu))] edus_postag.append(edu_postag) postag_i += len(edu) # Write with open(path_out, "w") as f: for edu_postag in edus_postag: f.write("%s\n" % " ".join(edu_postag))
def createDebugData(self,treebank,options):
    ext = '.conllu' if self.conllu else '.conll'
    print 'Creating smaller data sets for debugging'
    if not options.predict:
        traindata = list(utils.read_conll(treebank.trainfile, treebank.iso_id, maxSize=options.debug_train_sents, hard_lim=True))
        train_file = os.path.join(treebank.outdir, 'train-debug' + ext)  # location for the new train file
        utils.write_conll(train_file, traindata)  # write the new train data to file
        treebank.trainfile = train_file
        if treebank.devfile and os.path.exists(treebank.devfile) and options.pred_dev:
            devdata = list(utils.read_conll(treebank.devfile, treebank.iso_id, maxSize=options.debug_dev_sents, hard_lim=True))
            dev_file = os.path.join(treebank.outdir, 'dev-debug' + ext)  # location for the new dev file
            utils.write_conll(dev_file, devdata)  # write the new dev data to file
            treebank.dev_gold = dev_file
            treebank.devfile = dev_file
    else:
        testdata = list(utils.read_conll(treebank.testfile, treebank.iso_id, maxSize=options.debug_test_sents, hard_lim=True))
        test_file = os.path.join(treebank.outdir, 'test-debug' + ext)  # location for the new test file
        utils.write_conll(test_file, testdata)  # write the new test data to file
        treebank.test_gold = test_file
        treebank.testfile = test_file
def test(args):
    with tf.Graph().as_default(), create_session(args.use_xla) as session:
        with tf.device(device_placement(args)):
            m = model.Model.load_from(args.model, session)
            with open(args.test_file) as f:
                sentences, trees = utils.read_conll(f, m.vocab, m.tags, m.relations)
            m.parse(sentences, args.output, print_progress=args.progress)
def Predict(self, conll_path): with open(conll_path, 'r') as conllFP: for iSentence, sentence in enumerate(read_conll(conllFP, False)): self.Init() forest = ParseForest(sentence) self.getWordEmbeddings(forest, False) for root in forest.roots: root.lstms = [ self.builders[0].initial_state().add_input(root.vec), self.builders[1].initial_state().add_input(root.vec) ] while len(forest.roots) > 1: self.__evaluate(forest, False) bestParent, bestChild, bestScore = None, None, float( "-inf") bestIndex, bestOp = None, None roots = forest.roots for i in xrange(len(forest.roots) - 1): for irel, rel in enumerate(self.irels): for op in xrange(2): if bestScore < roots[i].scores[irel][op] and ( i + (1 - op)) > 0: bestParent, bestChild = i + op, i + (1 - op) bestScore = roots[i].scores[irel][op] bestIndex, bestOp = i, op bestRelation, bestIRelation = rel, irel for j in xrange( max(0, bestIndex - self.k - 1), min(len(forest.roots), bestIndex + self.k + 2)): roots[j].scores = None roots[bestChild].pred_parent_id = forest.roots[ bestParent].id roots[bestChild].pred_relation = bestRelation roots[bestParent].lstms[bestOp] = roots[bestParent].lstms[ bestOp].add_input((self.activation( self.lstm2lstmbias + self.lstm2lstm * concatenate([ roots[bestChild].lstms[0].output(), lookup(self.model["rels-lookup"], bestIRelation ), roots[bestChild].lstms[1].output() ])))) forest.Attach(bestParent, bestChild) renew_cg() yield sentence
def predict(self, conll_path):
    with open(conll_path, 'r', encoding='utf-8') as conllFP:
        for iSentence, sentence in enumerate(read_conll(conllFP)):
            self.model.hid_for_1, self.model.hid_back_1, self.model.hid_for_2, self.model.hid_back_2 = \
                [self.model.init_hidden(self.model.ldims) for _ in range(4)]
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            self.model.predict(conll_sentence)
            yield conll_sentence
def Predict(self, conll_path): with open(conll_path, 'r') as conllFP: for iSentence, sentence in enumerate(read_conll(conllFP, False)): self.Init() conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)] conll_sentence = conll_sentence[1:] + [conll_sentence[0]] self.getWordEmbeddings(conll_sentence, False) stack = ParseForest([]) buf = ParseForest(conll_sentence) for root in conll_sentence: root.lstms = [root.vec for _ in xrange(self.nnvecs)] hoffset = 1 if self.headFlag else 0 while not (len(buf) == 1 and len(stack) == 0): scores = self.__evaluate(stack, buf, False) best = max(chain(*scores), key = itemgetter(2) ) if best[1] == 2: stack.roots.append(buf.roots[0]) del buf.roots[0] elif best[1] == 0: child = stack.roots.pop() parent = buf.roots[0] child.pred_parent_id = parent.id child.pred_relation = best[0] bestOp = 0 if self.rlMostFlag: parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset] if self.rlFlag: parent.lstms[bestOp + hoffset] = child.vec elif best[1] == 1: child = stack.roots.pop() parent = stack.roots[-1] child.pred_parent_id = parent.id child.pred_relation = best[0] bestOp = 1 if self.rlMostFlag: parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset] if self.rlFlag: parent.lstms[bestOp + hoffset] = child.vec renew_cg() yield sentence
def train(self, conll_path): print('pytorch version:', torch.__version__) batch = 1 eloss = 0.0 mloss = 0.0 eerrors = 0 etotal = 0 iSentence = 0 start = time.time() with open(conll_path, 'r', encoding='utf-8') as conllFP: shuffledData = list(read_conll(conllFP)) random.shuffle(shuffledData) errs = [] lerrs = [] for iSentence, sentence in enumerate(shuffledData): self.model.hid_for_1, self.model.hid_back_1, self.model.hid_for_2, self.model.hid_back_2 = [ self.model.init_hidden(self.model.ldims) for _ in range(4) ] if iSentence % 100 == 0 and iSentence != 0: print('Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Time', time.time() - start) start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] e = self.model.forward(conll_sentence, errs, lerrs) eerrors += e eloss += e mloss += e etotal += len(sentence) if iSentence % batch == 0 or len(errs) > 0 or len(lerrs) > 0: if len(errs) > 0 or len(lerrs) > 0: eerrs = torch.sum(cat(errs + lerrs)) eerrs.backward() self.trainer.step() errs = [] lerrs = [] self.trainer.zero_grad() if len(errs) > 0: eerrs = (torch.sum(errs + lerrs)) eerrs.backward() self.trainer.step() self.trainer.zero_grad() print("Loss: ", mloss / iSentence)
def train(self, conll_path): mloss = 0.0 eloss = 0.0 eerrors = 0 lerrors = 0 etotal = 0 hoffset = 1 if self.headFlag else 0 start = time.time() with open(conll_path, 'r', encoding='UTF-8') as conllFP: shuffledData = list(read_conll(conllFP, proj=True)) random.shuffle(shuffledData) errs = [] self.model.init() for iSentence, sentence in enumerate(shuffledData): self.model.hid_for_1, self.model.hid_back_1, self.model.hid_for_2, self.model.hid_back_2 = [self.model.init_hidden(self.model.ldims) for _ in range(4)] if iSentence % 100 == 0 and iSentence != 0: print('Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal) , 'Time', time.time()-start) start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 lerrors = 0 conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)] conll_sentence = conll_sentence[1:] + [conll_sentence[0]] dloss, deerrors, dlerrors, detotal = self.model.train(conll_sentence, errs) eloss += dloss mloss += dloss eerrors += deerrors lerrors += dlerrors etotal += detotal if len(errs) > 0: # or True: eerrs = torch.sum(cat(errs)) eerrs.backward() self.trainer.step() errs = [] self.trainer.zero_grad() self.model.init() if len(errs) > 0: eerrs = torch.sum(cat(errs)) # * (1.0/(float(len(errs)))) eerrs.backward() self.trainer.step() errs = [] self.trainer.zero_grad() self.trainer.step() print("Loss: ", mloss/iSentence)
def test_predict(self, path, epoch):
    test = open(path, 'r')
    testData = list(read_conll(test))
    data_list = utils.construct_parsing_data_list(testData, self.word_dict, self.pos_dict)
    batch_test_data = utils.construct_sorted_batch_data(data_list, self.test_batch_size)
    tot_batch = len(batch_test_data)
    for batch_id, one_batch in tqdm(enumerate(batch_test_data), mininterval=2,
                                    desc=' -Tot it %d (epoch %d)' % (tot_batch, 0),
                                    leave=False, file=sys.stdout):
        self.model.predict(one_batch)
def Predict(self, conll_path):
    dev_buckets = [list()]
    dev_data = list(read_conll(conll_path))
    for d in dev_data:
        dev_buckets[0].append(d)
    minibatches = get_batches(dev_buckets, self, False)
    outputs = self.decode(minibatches)
    results = [self.iroles[np.argmax(outputs[i])] for i in range(len(outputs))]
    offset = 0
    for iSentence, sentence in enumerate(dev_data):
        for p in xrange(len(sentence.predicates)):
            for arg_index in xrange(len(sentence.entries)):
                sentence.entries[arg_index].predicateList[p] = results[offset]
                offset += 1
        yield sentence
def train(self, conll_path): print torch.__version__ batch = 1 eloss = 0.0 mloss = 0.0 eerrors = 0 etotal = 0 iSentence = 0 start = time.time() with open(conll_path, 'r') as conllFP: shuffledData = list(read_conll(conllFP)) random.shuffle(shuffledData) errs = [] lerrs = [] for iSentence, sentence in enumerate(shuffledData): if iSentence % 100 == 0 and iSentence != 0: print 'Processing sentence number:', iSentence, \ 'Loss:', eloss / etotal, \ 'Errors:', (float(eerrors)) / etotal, \ 'Time', time.time() - start start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] e = self.model.forward(conll_sentence, errs, lerrs) eerrors += e eloss += e mloss += e etotal += len(sentence) if iSentence % batch == 0 or len(errs) > 0 or len(lerrs) > 0: if len(errs) > 0 or len(lerrs) > 0: eerrs = torch.sum(cat(errs + lerrs)) eerrs.backward() self.trainer.step() errs = [] lerrs = [] self.trainer.zero_grad() if len(errs) > 0: eerrs = (torch.sum(errs + lerrs)) eerrs.backward() self.trainer.step() self.trainer.zero_grad() print "Loss: ", mloss / iSentence
def predict(self, conll_path): self.transitionModel.init() num_g, num_t = 0, 0 sentences_g = [] sentences_t = [] with open(conll_path, "r", encoding='UTF-8') as conllFP: for iSentence, sentence in enumerate(read_conll(conllFP, False)): # self.graphModel.hid1, self.graphModel.hid2 = [ # self.graphModel.init_hidden(self.graphModel.ldims) for _ in range(2)] # self.transitionModel.hid1, self.transitionModel.hid2 = [ # self.transitionModel.init_hidden(self.transitionModel.ldims) for _ in range(2)] sentence_g = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] sentence_t = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] sentence_t = sentence_t[1:] + [sentence_t[0]] sentences_g.append(sentence_g) sentences_t.append(sentence_t) # conll_sentence00 = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)] # conll_sentence0 = sentence.copy() # conll_sentence1 = sentence.copy() # conll_sentence1 = conll_sentence1[1:] + [conll_sentence1[0]] self.graphModel.predict(sentences_g) self.transitionModel.predict(sentences_t) sentence_t = [sentence_t[-1]] + sentence_t[:-1] # conll_sentence = [conll_sentence0, conll_sentence1] # rank = random.randint(0, 1) # input = torch.cat((self.graphModel.vec, self.transitionModel.vec), 1) # output = self.model.classifier(Variable(input)) # _, rank = torch.max(torch.abs(output.data), 1) # rank = rank[0] # num_g += 1 - rank # num_t += rank sentences_g = [] sentences_t = [] yield sentence_t print("Graph-based:", num_g, "\nTransition-based:", num_t)
def Train(self, conll_path): mloss = 0.0 errors = 0 batch = 0 eloss = 0.0 eerrors = 0 lerrors = 0 etotal = 0 ltotal = 0 ninf = -float('inf') hoffset = 1 if self.headFlag else 0 start = time.time() with open(conll_path, 'r') as conllFP: shuffledData = list(read_conll(conllFP, True)) random.shuffle(shuffledData) errs = [] eeloss = 0.0 self.Init() for iSentence, sentence in enumerate(shuffledData): if iSentence % 100 == 0 and iSentence != 0: print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', ( float(eerrors)) / etotal, 'Labeled Errors:', ( float(lerrors) / etotal), 'Time', time.time() - start start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 lerrors = 0 ltotal = 0 conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] conll_sentence = conll_sentence[1:] + [conll_sentence[0]] self.getWordEmbeddings(conll_sentence, True) stack = ParseForest([]) buf = ParseForest(conll_sentence) for root in conll_sentence: root.lstms = [root.vec for _ in xrange(self.nnvecs)] hoffset = 1 if self.headFlag else 0 while not (len(buf) == 1 and len(stack) == 0): scores = self.__evaluate(stack, buf, True) scores.append([(None, 3, ninf, None)]) alpha = stack.roots[:-2] if len(stack) > 2 else [] s1 = [stack.roots[-2]] if len(stack) > 1 else [] s0 = [stack.roots[-1]] if len(stack) > 0 else [] b = [buf.roots[0]] if len(buf) > 0 else [] beta = buf.roots[1:] if len(buf) > 1 else [] left_cost = ( len([h for h in s1 + beta if h.id == s0[0].parent_id]) + len([d for d in b + beta if d.parent_id == s0[0].id]) ) if len(scores[0]) > 0 else 1 right_cost = ( len([h for h in b + beta if h.id == s0[0].parent_id]) + len([d for d in b + beta if d.parent_id == s0[0].id]) ) if len(scores[1]) > 0 else 1 shift_cost = ( len([h for h in s1 + alpha if h.id == b[0].parent_id]) + len([ d for d in s0 + s1 + alpha if d.parent_id == b[0].id ])) if len(scores[2]) > 0 else 1 costs = (left_cost, right_cost, shift_cost, 1) bestValid = max( (s for s in chain(*scores) if costs[s[1]] == 0 and ( s[1] == 2 or s[0] == stack.roots[-1].relation)), key=itemgetter(2)) bestWrong = max( (s for s in chain(*scores) if costs[s[1]] != 0 or ( s[1] != 2 and s[0] != stack.roots[-1].relation)), key=itemgetter(2)) best = bestValid if ( (not self.oracle) or (bestValid[2] - bestWrong[2] > 1.0) or (bestValid[2] > bestWrong[2] and random.random() > 0.1)) else bestWrong if best[1] == 2: stack.roots.append(buf.roots[0]) del buf.roots[0] elif best[1] == 0: child = stack.roots.pop() parent = buf.roots[0] child.pred_parent_id = parent.id child.pred_relation = best[0] bestOp = 0 if self.rlMostFlag: parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset] if self.rlFlag: parent.lstms[bestOp + hoffset] = child.vec elif best[1] == 1: child = stack.roots.pop() parent = stack.roots[-1] child.pred_parent_id = parent.id child.pred_relation = best[0] bestOp = 1 if self.rlMostFlag: parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset] if self.rlFlag: parent.lstms[bestOp + hoffset] = child.vec if bestValid[2] < bestWrong[2] + 1.0: loss = bestWrong[3] - bestValid[3] mloss += 1.0 + bestWrong[2] - bestValid[2] eloss += 1.0 + bestWrong[2] - bestValid[2] errs.append(loss) if best[1] != 2 and ( child.pred_parent_id != child.parent_id or child.pred_relation != child.relation): lerrors += 1 if child.pred_parent_id != child.parent_id: errors += 1 eerrors += 1 etotal += 1 if len(errs) > 50: # or True: #eerrs = ((esum(errs)) * (1.0/(float(len(errs))))) eerrs = esum(errs) scalar_loss = eerrs.scalar_value() 
eerrs.backward() self.trainer.update() errs = [] lerrs = [] renew_cg() self.Init() if len(errs) > 0: eerrs = (esum(errs)) # * (1.0/(float(len(errs)))) eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] renew_cg() # self.trainer.update_epoch() # hanwj 6.20 . there is no any decay, so just remove it. # self.trainer.learning_rate /= (1 - rate_decay) print "Loss: ", mloss / iSentence
def non_proj_sent(sentence):
    id_head = {}
    for entry in sentence:
        id_head[entry.id] = entry.parent_id
    for k, v in id_head.items():
        if (k < v):
            spann = range(k, v + 1)
            nodes = range(k + 1, v)
        else:
            spann = range(v, k + 1)
            nodes = range(v + 1, k)
        for node in nodes:
            if (not (id_head[node] in spann)):
                return True
    return False

count = 0
count_nproj = 0
with open(sys.argv[1], 'r') as conllFP:
    Data = list(read_conll(conllFP, []))
    for sent in Data:
        count += 1
        conll_sent = [entry for entry in sent if isinstance(entry, utils.ConllEntry)]
        if (non_proj_sent(conll_sent)):
            count_nproj += 1
print str(format(float(count_nproj) / count * 100, '.2f')) + "% of total sentences are non-projective"
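# Quick sanity check of the crossing-arc test above, as a self-contained sketch.
# The Entry namedtuple is a hypothetical stand-in for utils.ConllEntry with only the
# fields the check needs.
from collections import namedtuple

Entry = namedtuple('Entry', ['id', 'parent_id'])

# "A hearing is scheduled on the issue today": "on" (5) attaches to "hearing" (2)
# while "is" (3) attaches to the root (0), so the span 2-5 contains a token headed
# outside it and the sentence is flagged as non-projective.
example = [Entry(1, 2), Entry(2, 3), Entry(3, 0), Entry(4, 3),
           Entry(5, 2), Entry(6, 7), Entry(7, 5), Entry(8, 4)]
print(non_proj_sent(example))  # expected: True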
def Train(self, conll_path): mloss = 0.0 errors = 0 batch = 0 eloss = 0.0 eerrors = 0 lerrors = 0 etotal = 0 ltotal = 0 ninf = -float('inf') hoffset = 1 if self.headFlag else 0 start = time.time() with open(conll_path, 'r') as conllFP: shuffledData = list(read_conll(conllFP, True)) random.shuffle(shuffledData) errs = [] eeloss = 0.0 self.Init() for iSentence, sentence in enumerate(shuffledData): if iSentence % 100 == 0 and iSentence != 0: print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal) , 'Time', time.time()-start start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 lerrors = 0 ltotal = 0 sentence = sentence[1:] + [sentence[0]] self.getWordEmbeddings(sentence, True) stack = ParseForest([]) buf = ParseForest(sentence) for root in sentence: root.lstms = [root.vec for _ in xrange(self.nnvecs)] hoffset = 1 if self.headFlag else 0 while len(buf) > 0 or len(stack) > 1 : scores = self.__evaluate(stack, buf, True) scores.append([(None, 3, ninf ,None)]) alpha = stack.roots[:-2] if len(stack) > 2 else [] s1 = [stack.roots[-2]] if len(stack) > 1 else [] s0 = [stack.roots[-1]] if len(stack) > 0 else [] b = [buf.roots[0]] if len(buf) > 0 else [] beta = buf.roots[1:] if len(buf) > 1 else [] left_cost = ( len([h for h in s1 + beta if h.id == s0[0].parent_id]) + len([d for d in b + beta if d.parent_id == s0[0].id]) ) if len(scores[0]) > 0 else 1 right_cost = ( len([h for h in b + beta if h.id == s0[0].parent_id]) + len([d for d in b + beta if d.parent_id == s0[0].id]) ) if len(scores[1]) > 0 else 1 shift_cost = ( len([h for h in s1 + alpha if h.id == b[0].parent_id]) + len([d for d in s0 + s1 + alpha if d.parent_id == b[0].id]) ) if len(scores[2]) > 0 else 1 costs = (left_cost, right_cost, shift_cost, 1) bestValid = max(( s for s in chain(*scores) if costs[s[1]] == 0 and ( s[1] == 2 or s[0] == stack.roots[-1].relation ) ), key=itemgetter(2)) bestWrong = max(( s for s in chain(*scores) if costs[s[1]] != 0 or ( s[1] != 2 and s[0] != stack.roots[-1].relation ) ), key=itemgetter(2)) best = bestValid if ( (not self.oracle) or (bestValid[2] - bestWrong[2] > 1.0) or (bestValid[2] > bestWrong[2] and random.random() > 0.1) ) else bestWrong if best[1] == 2: stack.roots.append(buf.roots[0]) del buf.roots[0] elif best[1] == 0: child = stack.roots.pop() parent = buf.roots[0] child.pred_parent_id = parent.id child.pred_relation = best[0] bestOp = 0 if self.rlMostFlag: parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset] if self.rlFlag: parent.lstms[bestOp + hoffset] = child.vec elif best[1] == 1: child = stack.roots.pop() parent = stack.roots[-1] child.pred_parent_id = parent.id child.pred_relation = best[0] bestOp = 1 if self.rlMostFlag: parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset] if self.rlFlag: parent.lstms[bestOp + hoffset] = child.vec if bestValid[2] < bestWrong[2] + 1.0: loss = bestWrong[3] - bestValid[3] mloss += 1.0 + bestWrong[2] - bestValid[2] eloss += 1.0 + bestWrong[2] - bestValid[2] errs.append(loss) if best[1] != 2 and (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation): lerrors += 1 if child.pred_parent_id != child.parent_id: errors += 1 eerrors += 1 etotal += 1 if len(errs) > 50: # or True: #eerrs = ((esum(errs)) * (1.0/(float(len(errs))))) eerrs = esum(errs) scalar_loss = eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] renew_cg() self.Init() if len(errs) > 0: eerrs = (esum(errs)) # * 
(1.0/(float(len(errs)))) eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] renew_cg() self.trainer.update_epoch() print "Loss: ", mloss/iSentence
def Predict(self, conll_path): with open(conll_path, 'r') as conllFP: for iSentence, sentence in enumerate(read_conll(conllFP)): self.hid2Layer = parameter(self.model["hidden2-layer"]) self.hid2Bias = parameter(self.model["hidden2-bias"]) self.hidLayerFOM = parameter(self.model["hidden-layer-fom"]) self.hidLayerFOH = parameter(self.model["hidden-layer-foh"]) self.hidBias = parameter(self.model["hidden-bias"]) self.outLayer = parameter(self.model["output-layer"]) if self.labelsFlag: self.rhid2Layer = parameter(self.model["rhidden2-layer"]) self.rhid2Bias = parameter(self.model["rhidden2-bias"]) self.rhidLayerFOM = parameter(self.model["rhidden-layer-fom"]) self.rhidLayerFOH = parameter(self.model["rhidden-layer-foh"]) self.rhidBias = parameter(self.model["rhidden-bias"]) self.routLayer = parameter(self.model["routput-layer"]) self.routBias = parameter(self.model["routput-bias"]) for entry in sentence: wordvec = lookup(self.model["word-lookup"], int(self.vocab.get(entry.norm, 0))) if self.wdims > 0 else None posvec = lookup(self.model["pos-lookup"], int(self.pos[entry.pos])) if self.pdims > 0 else None evec = lookup(self.model["extrn-lookup"], int(self.vocab.get(entry.norm, 0))) if self.external_embedding is not None else None entry.vec = concatenate(filter(None, [wordvec, posvec, evec])) entry.lstms = [entry.vec, entry.vec] entry.headfov = None entry.modfov = None entry.rheadfov = None entry.rmodfov = None if self.blstmFlag: lstm_forward = self.builders[0].initial_state() lstm_backward = self.builders[1].initial_state() for entry, rentry in zip(sentence, reversed(sentence)): lstm_forward = lstm_forward.add_input(entry.vec) lstm_backward = lstm_backward.add_input(rentry.vec) entry.lstms[1] = lstm_forward.output() rentry.lstms[0] = lstm_backward.output() if self.bibiFlag: for entry in sentence: entry.vec = concatenate(entry.lstms) blstm_forward = self.bbuilders[0].initial_state() blstm_backward = self.bbuilders[1].initial_state() for entry, rentry in zip(sentence, reversed(sentence)): blstm_forward = blstm_forward.add_input(entry.vec) blstm_backward = blstm_backward.add_input(rentry.vec) entry.lstms[1] = blstm_forward.output() rentry.lstms[0] = blstm_backward.output() scores, exprs = self.__evaluate(sentence, True) heads = decoder.parse_proj(scores) for entry, head in zip(sentence, heads): entry.pred_parent_id = head entry.pred_relation = '_' dump = False if self.labelsFlag: for modifier, head in enumerate(heads[1:]): scores, exprs = self.__evaluateLabel(sentence, head, modifier+1) sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]] renew_cg() if not dump: yield sentence
def Train(self, conll_path): errors = 0 batch = 0 eloss = 0.0 mloss = 0.0 eerrors = 0 etotal = 0 start = time.time() with open(conll_path, 'r') as conllFP: shuffledData = list(read_conll(conllFP)) random.shuffle(shuffledData) errs = [] lerrs = [] eeloss = 0.0 for iSentence, sentence in enumerate(shuffledData): if iSentence % 100 == 0 and iSentence != 0: print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Time', time.time()-start start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 lerrors = 0 ltotal = 0 conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)] for entry in conll_sentence: c = float(self.wordsCount.get(entry.norm, 0)) dropFlag = (random.random() < (c/(0.25+c))) wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None evec = None if self.external_embedding is not None: evec = self.elookup[self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)) if (dropFlag or (random.random() < 0.5)) else 0] entry.vec = concatenate(filter(None, [wordvec, posvec, evec])) entry.lstms = [entry.vec, entry.vec] entry.headfov = None entry.modfov = None entry.rheadfov = None entry.rmodfov = None if self.blstmFlag: lstm_forward = self.builders[0].initial_state() lstm_backward = self.builders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): lstm_forward = lstm_forward.add_input(entry.vec) lstm_backward = lstm_backward.add_input(rentry.vec) entry.lstms[1] = lstm_forward.output() rentry.lstms[0] = lstm_backward.output() if self.bibiFlag: for entry in conll_sentence: entry.vec = concatenate(entry.lstms) blstm_forward = self.bbuilders[0].initial_state() blstm_backward = self.bbuilders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): blstm_forward = blstm_forward.add_input(entry.vec) blstm_backward = blstm_backward.add_input(rentry.vec) entry.lstms[1] = blstm_forward.output() rentry.lstms[0] = blstm_backward.output() scores, exprs = self.__evaluate(conll_sentence, True) gold = [entry.parent_id for entry in conll_sentence] heads = decoder.parse_proj(scores, gold if self.costaugFlag else None) if self.labelsFlag: for modifier, head in enumerate(gold[1:]): rscores, rexprs = self.__evaluateLabel(conll_sentence, head, modifier+1) goldLabelInd = self.rels[conll_sentence[modifier+1].relation] wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0] if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1: lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd]) e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g]) eerrors += e if e > 0: loss = [(exprs[h][i] - exprs[g][i]) for i, (h,g) in enumerate(zip(heads, gold)) if h != g] # * (1.0/float(e)) eloss += (e) mloss += (e) errs.extend(loss) etotal += len(conll_sentence) if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0: eeloss = 0.0 if len(errs) > 0 or len(lerrs) > 0: eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs)))) eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] renew_cg() if len(errs) > 0: eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs)))) eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] eeloss = 0.0 renew_cg() self.trainer.update_epoch() print "Loss: ", mloss/iSentence
def Train(self, conll_path):
    mloss = 0.0
    errors = 0
    batch = 0
    eloss = 0.0
    eerrors = 0
    lerrors = 0
    etotal = 0
    ltotal = 0
    ninf = -float('inf')

    hoffset = 1 if self.headFlag else 0

    start = time.time()

    with open(conll_path, 'r') as conllFP:
        shuffledData = list(read_conll(conllFP, True))
        random.shuffle(shuffledData)

        errs = []
        eeloss = 0.0

        self.Init()

        for iSentence, sentence in enumerate(shuffledData):
            # print progress information every 100 sentences
            if iSentence % 100 == 0 and iSentence != 0:
                print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal), 'Time', time.time() - start
                # logger.debug('Processing sentence %s, Loss: %s, Errors: %s, Labeled Errors: %s, Time: %s', iSentence, eloss / etotal, (float(eerrors)) / etotal, (float(lerrors) / etotal), time.time() - start)
                start = time.time()
                eerrors = 0
                eloss = 0.0
                etotal = 0
                lerrors = 0
                ltotal = 0

            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
            self.getWordEmbeddings(conll_sentence, True)

            # initialize the stack as empty
            stack = ParseForest([])
            # put the sentence into the buffer
            buf = ParseForest(conll_sentence)

            for root in conll_sentence:
                # each word's LSTM input is the concatenation of self.nnvecs vectors
                root.lstms = [root.vec for _ in xrange(self.nnvecs)]

            hoffset = 1 if self.headFlag else 0

            while not (len(buf) == 1 and len(stack) == 0):
                scores = self.__evaluate(stack, buf, True)
                scores.append([(None, 3, ninf, None)])

                # alpha: the remaining elements of the stack
                alpha = stack.roots[:-2] if len(stack) > 2 else []
                # s1: the second element from the top of the stack
                s1 = [stack.roots[-2]] if len(stack) > 1 else []
                # s0: the element on top of the stack
                s0 = [stack.roots[-1]] if len(stack) > 0 else []
                # b: the first element of the buffer
                b = [buf.roots[0]] if len(buf) > 0 else []
                # beta: the remaining elements of the buffer
                beta = buf.roots[1:] if len(buf) > 1 else []

                left_cost = (len([h for h in s1 + beta if h.id == s0[0].parent_id]) +
                             len([d for d in b + beta if d.parent_id == s0[0].id])) if len(scores[0]) > 0 else 1
                right_cost = (len([h for h in b + beta if h.id == s0[0].parent_id]) +
                              len([d for d in b + beta if d.parent_id == s0[0].id])) if len(scores[1]) > 0 else 1
                shift_cost = (len([h for h in s1 + alpha if h.id == b[0].parent_id]) +
                              len([d for d in s0 + s1 + alpha if d.parent_id == b[0].id])) if len(scores[2]) > 0 else 1
                costs = (left_cost, right_cost, shift_cost, 1)

                bestValid = max((s for s in chain(*scores)
                                 if costs[s[1]] == 0 and (s[1] == 2 or s[0] == stack.roots[-1].relation)),
                                key=itemgetter(2))
                bestWrong = max((s for s in chain(*scores)
                                 if costs[s[1]] != 0 or (s[1] != 2 and s[0] != stack.roots[-1].relation)),
                                key=itemgetter(2))
                best = bestValid if ((not self.oracle) or (bestValid[2] - bestWrong[2] > 1.0) or
                                     (bestValid[2] > bestWrong[2] and random.random() > 0.1)) else bestWrong

                # shift: no relation is produced
                if best[1] == 2:
                    stack.roots.append(buf.roots[0])
                    del buf.roots[0]
                # left-arc: the head word is b0
                elif best[1] == 0:
                    child = stack.roots.pop()
                    # the head word is b0
                    parent = buf.roots[0]
                    child.pred_parent_id = parent.id
                    child.pred_relation = best[0]
                    bestOp = 0
                    if self.rlMostFlag:
                        parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                    if self.rlFlag:
                        parent.lstms[bestOp + hoffset] = child.vec
                # right-arc: the head word is s0
                elif best[1] == 1:
                    child = stack.roots.pop()
                    # the head word is s0
                    parent = stack.roots[-1]
                    child.pred_parent_id = parent.id
                    child.pred_relation = best[0]
                    bestOp = 1
                    if self.rlMostFlag:
                        parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
                    if self.rlFlag:
                        parent.lstms[bestOp + hoffset] = child.vec

                if bestValid[2] < bestWrong[2] + 1.0:
                    # hinge loss between the best wrong and the best valid transition
                    loss = bestWrong[3] - bestValid[3]
                    mloss += 1.0 + bestWrong[2] - bestValid[2]
                    eloss += 1.0 + bestWrong[2] - bestValid[2]
                    errs.append(loss)

                if best[1] != 2 and (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation):
                    # the head id or the relation is wrong: count a labeled error
                    lerrors += 1
                    if child.pred_parent_id != child.parent_id:
                        # the head id is wrong: count an unlabeled error
                        errors += 1
                        eerrors += 1

                etotal += 1

                if len(errs) > 50:  # or True:
                    #eerrs = ((esum(errs)) * (1.0/(float(len(errs)))))
                    eerrs = esum(errs)
                    scalar_loss = eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []

                    renew_cg()
                    self.Init()

        if len(errs) > 0:
            eerrs = (esum(errs))  # * (1.0/(float(len(errs))))
            eerrs.scalar_value()
            # backpropagate the loss to compute gradients
            eerrs.backward()
            # update the parameters
            self.trainer.update()

            errs = []
            lerrs = []

            renew_cg()

        self.trainer.update()
        print "Loss: ", mloss / iSentence
def Predict(self, conll_path): with open(conll_path, 'r') as conllFP: for iSentence, sentence in enumerate(read_conll(conllFP)): conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] for entry in conll_sentence: wordvec = self.wlookup[int(self.vocab.get( entry.norm, 0))] if self.wdims > 0 else None posvec = self.plookup[int( self.pos[entry.pos])] if self.pdims > 0 else None evec = self.elookup[int( self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)) )] if self.external_embedding is not None else None entry.vec = concatenate( filter(None, [wordvec, posvec, evec])) entry.lstms = [entry.vec, entry.vec] entry.headfov = None entry.modfov = None entry.rheadfov = None entry.rmodfov = None if self.blstmFlag: lstm_forward = self.builders[0].initial_state() lstm_backward = self.builders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): lstm_forward = lstm_forward.add_input(entry.vec) lstm_backward = lstm_backward.add_input(rentry.vec) entry.lstms[1] = lstm_forward.output() rentry.lstms[0] = lstm_backward.output() if self.bibiFlag: for entry in conll_sentence: entry.vec = concatenate(entry.lstms) blstm_forward = self.bbuilders[0].initial_state() blstm_backward = self.bbuilders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): blstm_forward = blstm_forward.add_input(entry.vec) blstm_backward = blstm_backward.add_input( rentry.vec) entry.lstms[1] = blstm_forward.output() rentry.lstms[0] = blstm_backward.output() scores, exprs = self.__evaluate(conll_sentence, True) heads = decoder.parse_proj(scores) for entry, head in zip(conll_sentence, heads): entry.pred_parent_id = head entry.pred_relation = '_' dump = False if self.labelsFlag: for modifier, head in enumerate(heads[1:]): scores, exprs = self.__evaluateLabel( conll_sentence, head, modifier + 1) conll_sentence[modifier + 1].pred_relation = self.irels[max( enumerate(scores), key=itemgetter(1))[0]] renew_cg() if not dump: yield sentence
def Train(self, conll_path): mloss = 0.0 errors = 0 batch = 0 eloss = 0.0 eerrors = 0 lerrors = 0 etotal = 0 ltotal = 0 start = time.time() with open(conll_path, 'r') as conllFP: shuffledData = list(read_conll(conllFP, True)) random.shuffle(shuffledData) errs = [] eeloss = 0.0 self.Init() for iSentence, sentence in enumerate(shuffledData): if iSentence % 100 == 0 and iSentence != 0: print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', ( float(eerrors)) / etotal, 'Labeled Errors:', ( float(lerrors) / etotal), 'Time', time.time() - start start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 lerrors = 0 ltotal = 0 forest = ParseForest(sentence) self.getWordEmbeddings(forest, True) for root in forest.roots: root.lstms = [ self.builders[0].initial_state().add_input(root.vec), self.builders[1].initial_state().add_input(root.vec) ] unassigned = { entry.id: sum([ 1 for pentry in sentence if pentry.parent_id == entry.id ]) for entry in sentence } while len(forest.roots) > 1: self.__evaluate(forest, True) bestValidOp, bestValidScore = None, float("-inf") bestWrongOp, bestWrongScore = None, float("-inf") bestValidParent, bestValidChild = None, None bestValidIndex, bestWrongIndex = None, None roots = forest.roots rootsIds = set([root.id for root in roots]) for i in xrange(len(forest.roots) - 1): for irel, rel in enumerate(self.irels): for op in xrange(2): child = i + (1 - op) parent = i + op oracleCost = unassigned[roots[child].id] + ( 0 if roots[child].parent_id not in rootsIds or roots[child].parent_id == roots[parent].id else 1) if oracleCost == 0 and ( roots[child].parent_id != roots[parent].id or roots[child].relation == rel): if bestValidScore < forest.roots[i].scores[ irel][op]: bestValidScore = forest.roots[ i].scores[irel][op] bestValidOp = op bestValidParent, bestValidChild = parent, child bestValidIndex = i bestValidIRel, bestValidRel = irel, rel bestValidExpr = roots[ bestValidIndex].exprs[ bestValidIRel][bestValidOp] elif bestWrongScore < forest.roots[i].scores[ irel][op]: bestWrongScore = forest.roots[i].scores[ irel][op] bestWrongParent, bestWrongChild = parent, child bestWrongOp = op bestWrongIndex = i bestWrongIRel, bestWrongRel = irel, rel bestWrongExpr = roots[ bestWrongIndex].exprs[bestWrongIRel][ bestWrongOp] if bestValidScore < bestWrongScore + 1.0: loss = bestWrongExpr - bestValidExpr mloss += 1.0 + bestWrongScore - bestValidScore eloss += 1.0 + bestWrongScore - bestValidScore errs.append(loss) if not self.oracle or bestValidScore - bestWrongScore > 1.0 or ( bestValidScore > bestWrongScore and random.random() > 0.1): selectedOp = bestValidOp selectedParent = bestValidParent selectedChild = bestValidChild selectedIndex = bestValidIndex selectedIRel, selectedRel = bestValidIRel, bestValidRel else: selectedOp = bestWrongOp selectedParent = bestWrongParent selectedChild = bestWrongChild selectedIndex = bestWrongIndex selectedIRel, selectedRel = bestWrongIRel, bestWrongRel if roots[selectedChild].parent_id != roots[ selectedParent].id or selectedRel != roots[ selectedChild].relation: lerrors += 1 if roots[selectedChild].parent_id != roots[ selectedParent].id: errors += 1 eerrors += 1 etotal += 1 for j in xrange( max(0, selectedIndex - self.k - 1), min(len(forest.roots), selectedIndex + self.k + 2)): roots[j].scores = None unassigned[roots[selectedChild].parent_id] -= 1 roots[selectedParent].lstms[selectedOp] = roots[ selectedParent].lstms[selectedOp].add_input( self.activation(self.lstm2lstm * noise( concatenate([ 
roots[selectedChild].lstms[0].output(), lookup(self.model["rels-lookup"], selectedIRel), roots[selectedChild].lstms[1].output() ]), 0.0) + self.lstm2lstmbias)) forest.Attach(selectedParent, selectedChild) if len(errs) > 50.0: eerrs = ((esum(errs)) * (1.0 / (float(len(errs))))) scalar_loss = eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] renew_cg() self.Init() if len(errs) > 0: eerrs = (esum(errs)) * (1.0 / (float(len(errs)))) eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] renew_cg() self.trainer.update_epoch() print "Loss: ", mloss / iSentence
def Train(self, conll_path): errors = 0 batch = 0 eloss = 0.0 mloss = 0.0 eerrors = 0 etotal = 0 start = time.time() with open(conll_path, 'r') as conllFP: shuffledData = list(read_conll(conllFP)) random.shuffle(shuffledData) errs = [] lerrs = [] eeloss = 0.0 for iSentence, sentence in enumerate(shuffledData): if iSentence % 100 == 0 and iSentence != 0: print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', ( float(eerrors)) / etotal, 'Time', time.time() - start start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 lerrors = 0 ltotal = 0 conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] for entry in conll_sentence: c = float(self.wordsCount.get(entry.norm, 0)) dropFlag = (random.random() < (c / (0.25 + c))) wordvec = self.wlookup[ int(self.vocab.get(entry.norm, 0) ) if dropFlag else 0] if self.wdims > 0 else None posvec = self.plookup[int( self.pos[entry.pos])] if self.pdims > 0 else None evec = None if self.external_embedding is not None: evec = self.elookup[self.extrnd.get( entry.form, self.extrnd.get(entry.norm, 0)) if (dropFlag or (random.random() < 0.5)) else 0] entry.vec = concatenate( filter(None, [wordvec, posvec, evec])) entry.lstms = [entry.vec, entry.vec] entry.headfov = None entry.modfov = None entry.rheadfov = None entry.rmodfov = None if self.blstmFlag: lstm_forward = self.builders[0].initial_state() lstm_backward = self.builders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): lstm_forward = lstm_forward.add_input(entry.vec) lstm_backward = lstm_backward.add_input(rentry.vec) entry.lstms[1] = lstm_forward.output() rentry.lstms[0] = lstm_backward.output() if self.bibiFlag: for entry in conll_sentence: entry.vec = concatenate(entry.lstms) blstm_forward = self.bbuilders[0].initial_state() blstm_backward = self.bbuilders[1].initial_state() for entry, rentry in zip(conll_sentence, reversed(conll_sentence)): blstm_forward = blstm_forward.add_input(entry.vec) blstm_backward = blstm_backward.add_input( rentry.vec) entry.lstms[1] = blstm_forward.output() rentry.lstms[0] = blstm_backward.output() scores, exprs = self.__evaluate(conll_sentence, True) gold = [entry.parent_id for entry in conll_sentence] heads = decoder.parse_proj(scores, gold if self.costaugFlag else None) if self.labelsFlag: for modifier, head in enumerate(gold[1:]): rscores, rexprs = self.__evaluateLabel( conll_sentence, head, modifier + 1) goldLabelInd = self.rels[conll_sentence[modifier + 1].relation] wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0] if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1: lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd]) e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g]) eerrors += e if e > 0: loss = [(exprs[h][i] - exprs[g][i]) for i, (h, g) in enumerate(zip(heads, gold)) if h != g] # * (1.0/float(e)) eloss += (e) mloss += (e) errs.extend(loss) etotal += len(conll_sentence) if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0: eeloss = 0.0 if len(errs) > 0 or len(lerrs) > 0: eerrs = (esum(errs + lerrs) ) #* (1.0/(float(len(errs)))) eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] renew_cg() if len(errs) > 0: eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs)))) eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] eeloss = 0.0 renew_cg() self.trainer.update_epoch() print "Loss: ", mloss / iSentence