def Predict(self, conll_path):
    """Greedily parse each sentence of a CoNLL file and yield it annotated.

    For every sentence a ParseForest is built and, while more than one root
    remains, the highest-scoring (position, relation, direction) candidate
    is attached.  The chosen child receives ``pred_parent_id`` and
    ``pred_relation``; the sentence object is yielded once it has been
    reduced to a single root.

    Args:
        conll_path: path of the CoNLL file to read.

    Yields:
        The sentence objects produced by ``read_conll``, after annotation.
    """
    with open(conll_path, 'r') as conllFP:
        for sentence in read_conll(conllFP, False):
            self.Init()
            forest = ParseForest(sentence)
            self.getWordEmbeddings(forest, False)

            # Every root starts with one state per builder, both fed its own vector.
            for node in forest.roots:
                state0 = self.builders[0].initial_state().add_input(node.vec)
                state1 = self.builders[1].initial_state().add_input(node.vec)
                node.lstms = [state0, state1]

            while len(forest.roots) > 1:
                self.__evaluate(forest, False)
                roots = forest.roots

                topScore = float("-inf")
                headPos, depPos = None, None
                pickPos, pickOp = None, None

                # Scan candidates in (position, relation, direction) order; the
                # strict '<' keeps the first candidate among equal scores.
                for pos in xrange(len(forest.roots) - 1):
                    for relIdx, relName in enumerate(self.irels):
                        for direction in xrange(2):
                            if not topScore < roots[pos].scores[relIdx][direction]:
                                continue
                            # The element at position 0 may never become a child.
                            if not (pos + (1 - direction)) > 0:
                                continue
                            headPos = pos + direction
                            depPos = pos + (1 - direction)
                            topScore = roots[pos].scores[relIdx][direction]
                            pickPos, pickOp = pos, direction
                            pickRel, pickRelIdx = relName, relIdx

                # Drop cached scores in the window around the chosen position.
                windowStart = max(0, pickPos - self.k - 1)
                windowEnd = min(len(forest.roots), pickPos + self.k + 2)
                for stale in xrange(windowStart, windowEnd):
                    roots[stale].scores = None

                roots[depPos].pred_parent_id = forest.roots[headPos].id
                roots[depPos].pred_relation = pickRel

                # Feed the child's two LSTM outputs plus the relation embedding
                # into the parent's LSTM selected by the chosen direction.
                childSummary = concatenate([
                    roots[depPos].lstms[0].output(),
                    lookup(self.model["rels-lookup"], pickRelIdx),
                    roots[depPos].lstms[1].output()
                ])
                composed = self.activation(
                    self.lstm2lstmbias + self.lstm2lstm * childSummary)
                roots[headPos].lstms[pickOp] = roots[headPos].lstms[
                    pickOp].add_input(composed)

                forest.Attach(headPos, depPos)

            renew_cg()
            yield sentence
def Train(self, conll_path): mloss = 0.0 errors = 0 batch = 0 eloss = 0.0 eerrors = 0 lerrors = 0 etotal = 0 ltotal = 0 start = time.time() with open(conll_path, 'r') as conllFP: shuffledData = list(read_conll(conllFP, True)) random.shuffle(shuffledData) errs = [] eeloss = 0.0 self.Init() for iSentence, sentence in enumerate(shuffledData): if iSentence % 100 == 0 and iSentence != 0: print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', ( float(eerrors)) / etotal, 'Labeled Errors:', ( float(lerrors) / etotal), 'Time', time.time() - start start = time.time() eerrors = 0 eloss = 0.0 etotal = 0 lerrors = 0 ltotal = 0 forest = ParseForest(sentence) self.getWordEmbeddings(forest, True) for root in forest.roots: root.lstms = [ self.builders[0].initial_state().add_input(root.vec), self.builders[1].initial_state().add_input(root.vec) ] unassigned = { entry.id: sum([ 1 for pentry in sentence if pentry.parent_id == entry.id ]) for entry in sentence } while len(forest.roots) > 1: self.__evaluate(forest, True) bestValidOp, bestValidScore = None, float("-inf") bestWrongOp, bestWrongScore = None, float("-inf") bestValidParent, bestValidChild = None, None bestValidIndex, bestWrongIndex = None, None roots = forest.roots rootsIds = set([root.id for root in roots]) for i in xrange(len(forest.roots) - 1): for irel, rel in enumerate(self.irels): for op in xrange(2): child = i + (1 - op) parent = i + op oracleCost = unassigned[roots[child].id] + ( 0 if roots[child].parent_id not in rootsIds or roots[child].parent_id == roots[parent].id else 1) if oracleCost == 0 and ( roots[child].parent_id != roots[parent].id or roots[child].relation == rel): if bestValidScore < forest.roots[i].scores[ irel][op]: bestValidScore = forest.roots[ i].scores[irel][op] bestValidOp = op bestValidParent, bestValidChild = parent, child bestValidIndex = i bestValidIRel, bestValidRel = irel, rel bestValidExpr = roots[ bestValidIndex].exprs[ bestValidIRel][bestValidOp] elif 
bestWrongScore < forest.roots[i].scores[ irel][op]: bestWrongScore = forest.roots[i].scores[ irel][op] bestWrongParent, bestWrongChild = parent, child bestWrongOp = op bestWrongIndex = i bestWrongIRel, bestWrongRel = irel, rel bestWrongExpr = roots[ bestWrongIndex].exprs[bestWrongIRel][ bestWrongOp] if bestValidScore < bestWrongScore + 1.0: loss = bestWrongExpr - bestValidExpr mloss += 1.0 + bestWrongScore - bestValidScore eloss += 1.0 + bestWrongScore - bestValidScore errs.append(loss) if not self.oracle or bestValidScore - bestWrongScore > 1.0 or ( bestValidScore > bestWrongScore and random.random() > 0.1): selectedOp = bestValidOp selectedParent = bestValidParent selectedChild = bestValidChild selectedIndex = bestValidIndex selectedIRel, selectedRel = bestValidIRel, bestValidRel else: selectedOp = bestWrongOp selectedParent = bestWrongParent selectedChild = bestWrongChild selectedIndex = bestWrongIndex selectedIRel, selectedRel = bestWrongIRel, bestWrongRel if roots[selectedChild].parent_id != roots[ selectedParent].id or selectedRel != roots[ selectedChild].relation: lerrors += 1 if roots[selectedChild].parent_id != roots[ selectedParent].id: errors += 1 eerrors += 1 etotal += 1 for j in xrange( max(0, selectedIndex - self.k - 1), min(len(forest.roots), selectedIndex + self.k + 2)): roots[j].scores = None unassigned[roots[selectedChild].parent_id] -= 1 roots[selectedParent].lstms[selectedOp] = roots[ selectedParent].lstms[selectedOp].add_input( self.activation(self.lstm2lstm * noise( concatenate([ roots[selectedChild].lstms[0].output(), lookup(self.model["rels-lookup"], selectedIRel), roots[selectedChild].lstms[1].output() ]), 0.0) + self.lstm2lstmbias)) forest.Attach(selectedParent, selectedChild) if len(errs) > 50.0: eerrs = ((esum(errs)) * (1.0 / (float(len(errs))))) scalar_loss = eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] renew_cg() self.Init() if len(errs) > 0: eerrs = (esum(errs)) * (1.0 / (float(len(errs)))) 
eerrs.scalar_value() eerrs.backward() self.trainer.update() errs = [] lerrs = [] renew_cg() self.trainer.update_epoch() print "Loss: ", mloss / iSentence
def Train(self, conll_path, options): mloss = 0.0 errors = 0 batch = 0 eloss = 0.0 #eerrors = 0 #lerrors = 0 etotal = 0 #ltotal = 0 max_quotient = float("-inf") min_quotient = float("inf") NUM_SAMPLES = options.num_samples #default 10 start = time.time() with open(conll_path, 'r') as conllFP: shuffledData = list(read_conll(conllFP, True)) random.shuffle(shuffledData) errs = [] #eeloss = 0.0 batch_errs = [] self.Init() for iSentence, sentence in enumerate(shuffledData): if iSentence % 100 == 0 and iSentence != 0: print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Time', time.time( ) - start #print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal) , 'Time', time.time()-start start = time.time() #eerrors = 0 eloss = 0.0 etotal = 0 #lerrors = 0 #ltotal = 0 sample_errs = [] sample_quotients = [] #print('Sentence: {}'.format(sentence)) DEBUG = random.random() < 0.0001 if DEBUG: print("Train sentence: {}".format( [e.form for e in sentence])) for _ in xrange(NUM_SAMPLES): forest = ParseForest(sentence) self.getWordEmbeddings(forest, True) for root in forest.roots: root.lstms = [ self.builders[0].initial_state().add_input( root.vec), self.builders[1].initial_state().add_input( root.vec) ] unassigned = { entry.id: sum([ 1 for pentry in sentence if pentry.parent_id == entry.id ]) for entry in sentence } #loss = 0 log_q_total = 0.0 log_p_total = 0.0 while len(forest.roots) > 1: self.__evaluate( forest, True) #NOTE(prkriley): this updates scores roots = forest.roots rootsIds = set([root.id for root in roots]) def _isValid(i): return (unassigned[roots[i].id] == 0) and ( (i > 0 and roots[i].parent_id == roots[i - 1].id) or (i < len(roots) - 1 and roots[i].parent_id == roots[i + 1].id)) valid_zs = [ j for j in xrange(1, len(roots)) if _isValid(j) ] z_scores = concatenate([r.zexpr for r in roots[1:]]) valid_z_scores = concatenate( [roots[j].zexpr for j in 
valid_zs]) p_zs = softmax(z_scores) #print("P(z): {}".format(p_zs.npvalue())) q_temperature = 16.0 q_zs = softmax(valid_z_scores * 1.0 / q_temperature) q_zs_numpy = q_zs.npvalue() q_zs_numpy /= np.sum(q_zs_numpy) if DEBUG: print("Valid z indices: {}".format(valid_zs)) print("Q(z): {}".format(q_zs_numpy)) valid_i = np.random.choice(len(valid_zs), p=q_zs_numpy) q_z = pick(q_zs, valid_i) i = valid_zs[valid_i] log_q_total += log(q_z).scalar_value() p_z = pick(p_zs, i - 1) log_p_total += log(p_z).scalar_value() irel = list(self.irels).index(roots[i].relation) op = 0 if roots[i].parent_id == roots[i - 1].id else 1 #TODO(prkriley): verify correctness of this index math presoftmax_p_y = [ val for tup in roots[i].exprs for val in tup ] if i < len(roots) - 1: neglog_p_y = pickneglogsoftmax( concatenate(presoftmax_p_y), irel * 2 + op) else: assert op == 0 presoftmax_p_y = presoftmax_p_y[::2] neglog_p_y = pickneglogsoftmax( concatenate(presoftmax_p_y), irel) neglog_p_z = pickneglogsoftmax(z_scores, i - 1) errs.append(neglog_p_y + neglog_p_z) log_p_total -= neglog_p_y.scalar_value() mloss += neglog_p_y.scalar_value() mloss += neglog_p_z.scalar_value() etotal += 1 selectedChild = i selectedIndex = i selectedOp = op selectedParent = i + [-1, 1][op] selectedIRel = irel for j in xrange( max(0, selectedIndex - self.k - 2), min(len(forest.roots), selectedIndex + self.k + 2)): roots[j].scores = None #NOTE(prkriley): counts number of real children that are still gettable unassigned[roots[selectedChild].parent_id] -= 1 #NOTE(prkriley): I think lstms[0] is the right one, [1] is the left... 
roots[selectedParent].lstms[selectedOp] = roots[ selectedParent].lstms[selectedOp].add_input( self.activation(self.lstm2lstm * noise( concatenate([ roots[selectedChild].lstms[0].output(), lookup(self.model["rels-lookup"], selectedIRel), roots[selectedChild].lstms[1].output() ]), 0.0) + self.lstm2lstmbias)) forest.Attach(selectedParent, selectedChild) #END OF SINGLE SAMPLE #TODO(prkriley): finalize loss, do update, etc eerrs = ( (esum(errs)) * (1.0 / (float(len(errs)))) ) #TODO(prkriley): consider removing this division #eerrs = esum(errs) #TODO(prkriley): scale by p/q which is exp(logp-logq) #print("logp: {}; logq: {}".format(log_p_total, log_q_total)) pq_quotient = np.exp(log_p_total - log_q_total) scaled_pq_quotient = pq_quotient * 1e3 #scaled_pq_quotient = min(scaled_pq_quotient, 1.5e-5) #scaled_pq_quotient = max(scaled_pq_quotient, 1.5e-8) #eerrs *= scaled_pq_quotient #print("P/Q: {}".format(pq_quotient)) max_quotient = max(scaled_pq_quotient, max_quotient) min_quotient = min(scaled_pq_quotient, min_quotient) eloss += eerrs.scalar_value() sample_errs.append(eerrs) sample_quotients.append(scaled_pq_quotient) errs = [] DEBUG = False #END OF SAMPLING #upper_clip = 5e-6 #lower_clip = 2e-8 #scale = 1.0 #if max_quotient < lower_clip: # scale = lower_clip / max_quotient ### #SCALING QUOTIENTS #max_sample_quotient = max(sample_quotients) #if max_sample_quotient > upper_clip: # scale = upper_clip / max_sample_quotient sum_quotients = sum(sample_quotients) PQ_NORMALIZE_SUM = options.pq_norm scale = PQ_NORMALIZE_SUM / sum_quotients sample_quotients = [e * scale for e in sample_quotients] #for q in sample_quotients: # assert q <= upper_clip * 1.1, "Large quotient: {}".format(q) ### if options.use_pq: sample_errs = [ e * q for (e, q) in zip(sample_errs, sample_quotients) ] final_error = esum(sample_errs) if not options.use_pq: assert len(sample_errs) == NUM_SAMPLES final_error *= (1.0 / (float(len(sample_errs)))) #TODO(prkriley): put final_error somewhere and update once we 
have N of them batch_errs.append(final_error) if len(batch_errs) >= options.batch_size: total_error = esum(batch_errs) total_error.backward() self.trainer.update() batch_errs = [] renew_cg() self.Init() #final_error.backward() #self.trainer.update() #renew_cg() #self.Init() #END OF EPOCH #FILE CLOSE if options.use_pq: print("Max Quotient: {}; Min Quotient: {}".format( max_quotient, min_quotient)) #self.trainer.update_epoch() #TODO(prkriley): verify that AdamTrainer handles everything this did before print "Loss: ", mloss / (iSentence * NUM_SAMPLES)
def Predict(self, conll_path):
    """Parse each sentence of a CoNLL file and yield it annotated.

    While more than one root remains, the root position with the highest
    softmax probability over the ``zexpr`` scores of ``roots[1:]`` is chosen
    as the child; the highest-probability (relation, direction) entry of its
    ``exprs`` decides the label and whether the parent is the left
    (op 0) or right (op 1) neighbour.  The child receives ``pred_parent_id``
    and ``pred_relation``.

    Diagnostics are printed to stdout for every sentence and every step.
    The gold ``parent_id`` values are read, but only to report how much
    probability mass falls on attachable positions; they do not influence
    the predicted tree.

    Args:
        conll_path: path of the CoNLL file to read.

    Yields:
        The sentence objects produced by ``read_conll``, after annotation.
    """
    with open(conll_path, 'r') as conllFP:
        for sentence in read_conll(conllFP, False):
            print("Sentence: {}".format([e.form for e in sentence]))
            self.Init()
            forest = ParseForest(sentence)
            self.getWordEmbeddings(forest, False)

            for root in forest.roots:
                root.lstms = [
                    self.builders[0].initial_state().add_input(root.vec),
                    self.builders[1].initial_state().add_input(root.vec)
                ]

            # NOTE(prkriley): looking at truth here, but ONLY for reporting
            unassigned = {
                entry.id: sum(
                    1 for pentry in sentence if pentry.parent_id == entry.id)
                for entry in sentence
            }

            while len(forest.roots) > 1:
                self.__evaluate(forest, False)
                roots = forest.roots

                z_scores = concatenate([r.zexpr for r in roots[1:]])
                p_z = softmax(z_scores).npvalue()
                # p_z is indexed from roots[1], hence the + 1.
                bestIndex = np.argmax(p_z) + 1
                print('P(z): {}'.format(p_z))
                print('Best index: {} ({})'.format(bestIndex,
                                                   roots[bestIndex].form))
                valid_exprs = [
                    val for tup in roots[bestIndex].exprs for val in tup
                ]
                if bestIndex == len(roots) - 1:
                    # The last root has no right neighbour: keep op == 0 only.
                    valid_exprs = valid_exprs[::2]
                p_y = softmax(concatenate(valid_exprs))
                # NOTE(prkriley): don't need to actually do softmax just to
                # pick max
                max_y_index = np.argmax(p_y.npvalue())
                if bestIndex < len(roots) - 1:
                    bestOp = max_y_index % 2
                    # Floor division keeps the result an integer index
                    # regardless of the interpreter's '/' semantics.
                    bestIRelation = (max_y_index - bestOp) // 2
                else:
                    bestOp = 0
                    bestIRelation = max_y_index
                # TODO(prkriley): make sure op is valid

                bestChild = bestIndex
                bestParent = bestIndex + [-1, 1][bestOp]
                bestRelation = self.irels[bestIRelation]

                # NOTE(prkriley): again, using truth but only for reporting
                def _isValid(i):
                    # Attachable now: no gold children left to collect and
                    # the gold head is the immediate left or right root.
                    return (unassigned[roots[i].id] == 0) and (
                        (i > 0 and roots[i].parent_id == roots[i - 1].id) or
                        (i < len(roots) - 1
                         and roots[i].parent_id == roots[i + 1].id))

                valid_zs = [j for j in xrange(1, len(roots)) if _isValid(j)]
                valid_set = set(valid_zs)
                valid_probs = [p_z[j - 1] for j in valid_zs]
                invalid_probs = [
                    p_z[j - 1] for j in xrange(1, len(roots))
                    if j not in valid_set
                ]
                avg_valid_prob = sum(valid_probs) * 1.0 / len(
                    valid_probs) if valid_probs else -1
                avg_invalid_prob = sum(invalid_probs) * 1.0 / len(
                    invalid_probs) if invalid_probs else -1
                print("Avg valid prob: {}/{} = {}".format(
                    sum(valid_probs), len(valid_probs), avg_valid_prob))
                print("Avg invalid prob: {}/{} = {}".format(
                    sum(invalid_probs), len(invalid_probs),
                    avg_invalid_prob))

                # Drop cached scores in the window around the chosen position.
                for j in xrange(
                        max(0, bestIndex - self.k - 2),
                        min(len(forest.roots), bestIndex + self.k + 2)):
                    roots[j].scores = None

                roots[bestChild].pred_parent_id = forest.roots[bestParent].id
                roots[bestChild].pred_relation = bestRelation

                roots[bestParent].lstms[bestOp] = roots[bestParent].lstms[
                    bestOp].add_input((self.activation(
                        self.lstm2lstmbias + self.lstm2lstm * concatenate([
                            roots[bestChild].lstms[0].output(),
                            lookup(self.model["rels-lookup"], bestIRelation),
                            roots[bestChild].lstms[1].output()
                        ]))))

                # Reporting-only bookkeeping: skip it when the gold head id is
                # not one of the sentence's entry ids, so it can never abort
                # decoding with a KeyError.
                # NOTE(review): presumably happens for input without gold
                # heads -- confirm against read_conll.
                goldParentId = roots[bestChild].parent_id
                if goldParentId in unassigned:
                    unassigned[goldParentId] -= 1

                forest.Attach(bestParent, bestChild)

            renew_cg()
            yield sentence