def read_problem(self, sent_file=None, gold_file=None, feat_dir=None):
    # read sents
    if not sent_file:
        sent_file = 'dat/%s/sents/%s.aligned' % (self.task, self.id)
        gold_file = 'dat/%s/gold/%s.aligned' % (self.task, self.id)
    with open(sent_file, 'r') as fin:
        self.sent = fin.readline().strip().split()
        self.N = len(self.sent)
        fin.readline()
        self.deps = [int(x) for x in fin.readline().strip().split()]
    with open(gold_file, 'r') as fin:
        self.gold_sent = fin.readline().strip().split()
        fin.readline()
        self.gold_deps = [int(x) for x in fin.readline().strip().split()]
    # compression rate
    self.compression = len(self.gold_sent) / float(self.N)
    # read feats: ori(ginal) and ref(erence)
    for dir_ in ['ori', 'ref']:
        if not feat_dir:
            ffiles = ['dat/%s/feats/%s/%s/%s.bin' % (self.task, dir_, ftype, self.id)
                      for ftype in ['uni', 'bi', 'dep']]
        else:
            ffiles = ['%s/%s/%s/%s.bin' % (feat_dir, dir_, ftype, self.id)
                      for ftype in ['uni', 'bi', 'dep']]
        # Use different Feature objects for the sentence and the gold sentence
        # because load_feat needs to know the input dimension
        if dir_ == 'ori':
            ori_feature = Feature(self.sent, None, None, None)
            self.feats_ind = ori_feature.ind
            feature = ori_feature
        else:
            # TODO: this constructor computes more info than necessary to load features
            ref_feature = StructFeature(self.gold_sent, None, self.gold_deps, None, ori_feature)
            feature = ref_feature
        # convert array feats to mydefaultdict feats
        feat_dicts = [[] for _ in range(FType.SIZE)]
        for ftype, ffile in zip(range(FType.SIZE), ffiles):
            # feats is an array of vectors
            feats = feature.load_feat(ftype, ffile)
            for feat in feats:
                d = mydefaultdict(mydouble)
                for f in feat:
                    d[f] = 1
                feat_dicts[ftype].append(d)
        if dir_ == 'ori':
            # feature factory
            self.feats = feat_dicts
        else:
            # sum of all features in the gold sentence
            self.gold_feats = mydefaultdict(mydouble)
            # uni, bi, dep
            for feats in feat_dicts:
                for feat in feats:
                    self.gold_feats.iadd(feat)

def get_curr_feature(self):
    feats = mydefaultdict(mydouble)
    # unigram
    feats_factory = self.feats[FType.UNI]
    for pos, idx in self.feats_ind[FType.UNI].items():
        i, = pos
        var = self.model.getVarByName('u_%d' % (i + 1))
        if abs(var.x - 1.0) < 1e-5:
            feats.iadd(feats_factory[idx])
    # bigram
    feats_factory = self.feats[FType.BI]
    for pos, idx in self.feats_ind[FType.BI].items():
        i, j = pos
        var = self.model.getVarByName('b_%d_%d' % (i + 1, j + 1))
        if abs(var.x - 1.0) < 1e-5:
            feats.iadd(feats_factory[idx])
    # edge
    feats_factory = self.feats[FType.DEP]
    for pos, idx in self.feats_ind[FType.DEP].items():
        h, m = pos
        var = self.model.getVarByName('e_%d_%d' % (h + 1, m + 1))
        if abs(var.x - 1.0) < 1e-5:
            feats.iadd(feats_factory[idx])
    return feats

def __init__(self, feat_vector, costs=None):
    self.featureVector = mydefaultdict(mydouble)
    for key, val in feat_vector.items():
        self.featureVector[key] = val
    self.costs = costs
    if self.costs is not None:
        self._normalize_costs()

def read_weights(self, filepath):
    w = mydefaultdict(mydouble)
    with open(filepath, 'rb') as fin:
        for line in fin:
            ss = line.strip().split('\t')
            w[ss[0]] = float(ss[1])
    return w

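# The weight file parsed above is assumed to be plain text with one
# tab-separated "feature<TAB>weight" pair per line, e.g. (hypothetical values):
#   doc_ratio       1.0
#   in_first_sent   -0.25
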
def get_curr_feature(self):
    feats = mydefaultdict(mydouble)
    # TODO: check whether the selected features should be averaged
    # bigrams
    for idx, feat in self.bigram_feats.items():
        var = self.model.getVarByName('b_%d' % idx)
        if abs(var.x - 1.0) < 1e-5:
            feats.iadd(feat)
    # edges
    for (sent_id, hid, mid), feat in self.edge_feats.items():
        var = self.model.getVarByName('e_%d_%d_%d' % (sent_id, hid, mid))
        if abs(var.x - 1.0) < 1e-5:
            feats.iadd(feat)
    return feats

def get_curr_feature(self):
    # bigrams
    feats = mydefaultdict(mydouble)
    for idx, feat in self.bigram_feats.items():
        var = self.model.getVarByName('b_%d' % idx)
        if abs(var.x - 1.0) < 1e-5:
            feats.iadd(feat)
    return feats

def _get_feature(self, string):
    """Get a feature vector from a ':'-separated feature string."""
    feat = mydefaultdict(mydouble)
    for name, value in [x.split(':') for x in string.split(' ')]:
        feat[name] = float(value)
    return feat

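# A standalone sketch of the parsing done by _get_feature above; the input
# string is hypothetical and only the hvector API is assumed.
from _mycollections import mydefaultdict
from mydouble import mydouble

feat = mydefaultdict(mydouble)
for name, value in [x.split(':') for x in 'doc_ratio:0.5 tf:2.0'.split(' ')]:
    feat[name] = float(value)
print feat['doc_ratio'], feat['tf']  # 0.5 2.0; unseen keys default to 0
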
def resorted(self):
    new = WVector()
    for action, feats in self.iteritems():
        new[action] = mydefaultdict(WVector.value_class, sorted(feats.items()))
    del self  # drop the local reference to the old vector
    return new

def __init__(self, value_class=None):
    if value_class is None:
        value_class = WVector.value_class
    # can add new actions on the fly; no need to specify the list of actions a priori
    # TODO: lambda: mydefaultdict(value_class)
    defaultdict.__init__(self, mydefaultdict,
                         [(action, mydefaultdict(value_class))
                          for action in WVector.action_names])  # doublehash 1

def test_instance_from_list(featureList):
    featureVector = mydefaultdict(mydouble)
    for featureID, featureVal in enumerate(featureList):
        # this keeps word features sparse
        if featureVal != 0:
            featureVector[featureID] = featureVal
    return Instance(featureVector)

def _initialize_vectors(self, instances, averaging, rounds, adapt):
    """
    Initialize the weight vectors at the beginning of training.
    We have one variance and one weight vector per class.
    """
    self.currentWeightVectors = {}
    if adapt:
        self.currentVarianceVectors = {}
    # default to None/0 so we always have something to return when not averaging
    averagedWeightVectors = None
    updatesLeft = 0
    if averaging:
        averagedWeightVectors = {}
        updatesLeft = rounds * len(instances)
    for label in instances[0].costs:
        self.currentWeightVectors[label] = mydefaultdict(mydouble)
        # remember: the variance vector is sparse in the sense that anything
        # that doesn't have a value is 1; every time we use it, add 1
        if adapt:
            self.currentVarianceVectors[label] = {}
        # keep the averaged weight vector
        if averaging:
            averagedWeightVectors[label] = mydefaultdict(mydouble)
    return averagedWeightVectors, updatesLeft

def train_instance_from_list(costDict, featureList):
    """
    Generate an Instance from a set of training instances with costs.
    The input is a list of features, e.g. words, vectorized.
    Note: the vectorizer is the only way of mapping each word to a unique ID.
    """
    featureVector = mydefaultdict(mydouble)
    for featureID, featureVal in enumerate(featureList):
        # this keeps word features sparse
        if featureVal != 0:
            featureVector[featureID] = featureVal
    return Instance(featureVector, costDict)

def removeHapaxLegomena(instances):
    """
    Hapax legomena are features that appear only once in the whole dataset.
    This static method removes these features from the dataset.
    """
    print "Counting features"
    feature2counts = mydefaultdict(mydouble)
    for instance in instances:
        for element in instance.featureVector:
            feature2counts[element] += 1
    print len(feature2counts)
    print "Removing hapax legomena"
    newInstances = []
    for instance in instances:
        newFeatureVector = mydefaultdict(mydouble)
        for element in instance.featureVector:
            # keep the feature only if it was encountered more than once
            if feature2counts[element] > 1:
                newFeatureVector[element] = instance.featureVector[element]
        newInstances.append(Instance(newFeatureVector, instance.costs))
    return newInstances

def train(self, instances, averaging=True, shuffling=True, rounds=10, param=1, adapt=True):
    """
    Train the classifier. If adapt is False then we have PA-II with
    prediction-based updates; if adapt is True then we have AROW.
    param is the r of AROW, or the aggressiveness C of PA-II.
    """
    # This is a bit nasty: averagedWeightVectors is None when averaging is
    # False. Setting it as an instance attribute might be better.
    averagedWeightVectors, updatesLeft = self._initialize_vectors(instances, averaging, rounds, adapt)
    for r in xrange(rounds):
        if shuffling:
            random.shuffle(instances)
        errorsInRound = 0
        costInRound = 0
        for instance in instances:
            prediction = self.predict(instance)
            # if the prediction was incorrect we are no longer large margin,
            # since we are using the loss from the cost-sensitive PA
            if instance.costs[prediction.label] > 0:
                errorsInRound += 1
                costInRound += instance.costs[prediction.label]
                self._update_parameters(instance, prediction, averaging, adapt, param,
                                        averagedWeightVectors, updatesLeft)
            if averaging:
                updatesLeft -= 1
        print "Training error rate in round " + str(r) + " : " + str(float(errorsInRound) / len(instances))
    if averaging:
        for label in self.currentWeightVectors:
            self.currentWeightVectors[label] = mydefaultdict(mydouble)
            self.currentWeightVectors[label].iaddc(averagedWeightVectors[label],
                                                   1.0 / float(rounds * len(instances)))
    # compute the final training error and cost
    finalTrainingErrors = 0
    finalTrainingCost = 0
    for instance in instances:
        prediction = self.predict(instance)
        if instance.costs[prediction.label] > 0:
            finalTrainingErrors += 1
            finalTrainingCost += instance.costs[prediction.label]
    finalTrainingErrorRate = float(finalTrainingErrors) / len(instances)
    print "Final training error rate=" + str(finalTrainingErrorRate)
    print "Final training cost=" + str(finalTrainingCost)
    return finalTrainingCost

def load(self, filename):
    with open(filename, 'r') as model_weights:
        weightVectors = pickle.load(model_weights)
    for label, weightVector in weightVectors.items():
        self.currentWeightVectors[label] = mydefaultdict(mydouble, weightVector)
    try:
        with gzip.open(filename + "_probVectors.gz", "rb") as probFile:
            print "loading probabilities"
            pickleDictProbVectors = pickle.load(probFile)
            self.probWeightVectors = []
            for sample in pickleDictProbVectors:
                label2Vectors = {}
                for label, vector in sample.items():
                    label2Vectors[label] = mydefaultdict(mydouble, vector)
                self.probWeightVectors.append(label2Vectors)
        self.probabilities = True
    except IOError:
        print 'No weight vectors for probability estimates'
        self.probabilities = False

def read_problem(self):
    sent_file = 'dat/%s/sents/%s.aligned' % (self.task, self.id)
    bigram_feature_file = 'dat/%s/features/%s.bigram.feat' % (self.task, self.id)
    bigram_pos_file = 'dat/%s/features/%s.bigram.pos' % (self.task, self.id)
    gold_bigram_feature_file = 'dat/%s/solutions/maxrouge/%s.bigram' % (self.task, self.id)
    self.sents = []
    # read sentences and deps
    with open(sent_file, 'r') as fin:
        while True:
            line = fin.readline()
            if line == '':
                break
            self.sents.append(line.strip().split('\t'))
            fin.readline()  # toks
            fin.readline()  # stems
            fin.readline()  # tags
            fin.readline()  # deps
            fin.readline()  # labels
            fin.readline()  # empty line
    # read bigram features and build the bigram dict:
    # bigram_feats[bigram_id] = hvector feat
    with open(bigram_feature_file, 'r') as fin:
        self.bigram_feats = {}
        self.bigrams = {}
        for i, line in enumerate(fin):
            ss = line.strip().split('\t')
            self.bigrams[ss[0]] = i
            self.bigram_feats[i] = self._get_feature(ss[1])
    self.bigram_ids = {idx: bigram for bigram, idx in self.bigrams.items()}
    # read bigram positions
    with open(bigram_pos_file, 'r') as fin:
        self.bigram_pos = {}
        for line in fin:
            ss = line.strip().split('\t')
            self.bigram_pos[self.bigrams[ss[0]]] = [tuple([int(x) for x in pos.split('_')])
                                                    for pos in ss[1].split(' ')]
    if self.train:
        # read gold bigrams and features
        with open(gold_bigram_feature_file, 'r') as fin:
            self.gold_bigrams = []
            self.gold_feats = mydefaultdict(mydouble)
            for s in fin:
                bigram_id = self.bigrams[s.strip()]
                self.gold_bigrams.append(bigram_id)
                self.gold_feats.iadd(self.bigram_feats[bigram_id])

def instance_from_svm_input(svm_input):
    """Generate an Instance from an SVMLight input line."""
    feat_vec = mydefaultdict(mydouble)
    costs = {}
    splitted = svm_input.split()
    if splitted[0] == "-1":
        costs["neg"] = 0
        costs["pos"] = 1
    elif splitted[0] == "+1":
        costs["neg"] = 1
        costs["pos"] = 0
    for elem in splitted[1:]:
        fid, val = elem.split(':')
        feat_vec[fid] = float(val)
    return Instance(feat_vec, costs)

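# Usage sketch for instance_from_svm_input with a hypothetical SVMLight-style
# line: a "+1" label becomes costs {"neg": 1, "pos": 0}, i.e. predicting
# "pos" is free and predicting "neg" costs 1.
#   inst = instance_from_svm_input("+1 f1:0.5 f2:2.0")
#   inst.costs  ->  {'neg': 1, 'pos': 0}
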
def train_instance_from_svm_input(line):
    """Generate an Instance from a set of training instances with costs."""
    details = line.split("|")
    costs = {}
    featureVector = mydefaultdict(mydouble)
    costDict = details[0].split()
    featureDict = details[1].split()
    for pair in costDict:
        label, cost = pair.split(":")
        # account for infinity: float(cost) if cost != "inf" else float(1e10)
        costs[label] = float(cost)
    for featureID, featureVal in enumerate(featureDict):
        featureVector[featureID] = float(id(intern(featureVal)))
    return Instance(featureVector, costs)

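# Note on float(id(intern(featureVal))) above: interning makes equal feature
# strings share a single object for the lifetime of the process, so id()
# yields a stable (per-run) numeric fingerprint of the string value. The
# resulting feature values are therefore only comparable within one run.
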
def probGeneration(self, scale=1.0, noWeightVectors=100):
    # initialize the weight vectors
    print "Generating samples for the weight vectors to obtain probability estimates"
    self.probWeightVectors = []
    for i in xrange(noWeightVectors):
        self.probWeightVectors.append({})
        for label in self.currentWeightVectors:
            self.probWeightVectors[i][label] = mydefaultdict(mydouble)
    for label in self.currentWeightVectors:
        # we are ignoring features that never got their weight set
        for feature in self.currentWeightVectors[label]:
            # note that if the weight was updated, then the variance must have
            # been updated too, i.e. we shouldn't have 0s
            weights = numpy.random.normal(self.currentWeightVectors[label][feature],
                                          scale * self.currentVarianceVectors[label][feature],
                                          noWeightVectors)
            # we have the samples, now put them in the right places
            for i, weight in enumerate(weights):
                self.probWeightVectors[i][label][feature] = weight
    print "done"
    self.probabilities = True

def get_delta_feature(self):
    """
    Get the difference between the features of the current structure
    and the gold structure.
    """
    curr = self.get_curr_feature()
    delta = mydefaultdict(mydouble)
    delta.iaddc(self.gold_feats, 1)
    delta.iaddc(curr, -1)
    if debug_level > 0:
        print 'predicted feat:'
        print curr.items()[:10]
        print 'gold feat:'
        print self.gold_feats.items()[:10]
        print 'delta feat:'
        print delta.items()[:10]
    return delta

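# A minimal runnable sketch (using only the hvector API shown elsewhere in
# this code) of how a delta vector like the one above would drive a
# perceptron-style update: weights move toward the gold features and away
# from the predicted ones. The feature names and learning rate are hypothetical.
from _mycollections import mydefaultdict
from mydouble import mydouble

gold = mydefaultdict(mydouble)
pred = mydefaultdict(mydouble)
weights = mydefaultdict(mydouble)
gold['bigram=of_control'] = 1
pred['bigram=in_control'] = 1
delta = mydefaultdict(mydouble)
delta.iaddc(gold, 1)
delta.iaddc(pred, -1)
weights.iaddc(delta, 1.0)  # learning rate 1.0 (hypothetical)
print weights['bigram=of_control'], weights['bigram=in_control']  # 1.0 -1.0
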
if __name__ == "__main__":
    import sys
    import random
    random.seed(13)
    numpy.random.seed(13)
    dataLines = open(sys.argv[1]).readlines()
    instances = []
    classifier_p = AROW()
    print "Reading the data"
    for line in dataLines:
        details = line.split()
        costs = {}
        featureVector = mydefaultdict(mydouble)
        if details[0] == "-1":
            costs["neg"] = 0
            costs["pos"] = 1
        elif details[0] == "+1":
            costs["neg"] = 1
            costs["pos"] = 0
        for feature in details[1:]:
            featureID, featureVal = feature.split(":")
            featureVector[featureID] = float(featureVal)
        instances.append(Instance(featureVector, costs))
    random.shuffle(instances)

print x
x += 5
print x
x += mydouble(2)
print x
print x.__copy__()

x = mydouble()
print x
# x += "1"
# print x

d = mydefaultdict(mydouble)  # always constructed like this
d["a"] = 1  # no need to say mydouble(1); transparent to the user
print d
d.iadd(d)
print "d=", d
d.iaddc(d, 0.5)
print "d=", d

c = mydefaultdict(mydouble)
print c
c += d  # i.e., c.__iadd__(d)

problem = SummaryProblemJointDep('tac09', 'D0901A-A')
print problem.sents[0]
print problem.deps[0]

weights = mydefaultdict(mydouble)
weights['doc_ratio'] = 1.0

start = time.clock()
problem.build_ilp()
print 'building ilp time:', time.clock() - start
problem.model.write('D0901A-A.gurobi.lp')

#start = time.clock()
#problem.solve()
#print 'solving ilp time:', time.clock() - start

def train(self, instances, averaging=True, shuffling=True, rounds=10, param=1, adapt=True):
    # we first need to go through the dataset to find how many classes there are.
    # Initialize the weight vectors at the beginning of training:
    # we have one variance and one weight vector per class.
    self.currentWeightVectors = {}
    if adapt:
        self.currentVarianceVectors = {}
    if averaging:
        averagedWeightVectors = {}
        updatesLeft = rounds * len(instances)
    for label in instances[0].costs:
        self.currentWeightVectors[label] = mydefaultdict(mydouble)
        # remember: the variance vector is sparse in the sense that anything
        # that doesn't have a value is 1; every time we use it, add 1
        if adapt:
            self.currentVarianceVectors[label] = {}
        # keep the averaged weight vector
        if averaging:
            averagedWeightVectors[label] = mydefaultdict(mydouble)
    # in each iteration
    for r in range(rounds):
        # shuffle
        if shuffling:
            random.shuffle(instances)
        errorsInRound = 0
        costInRound = 0
        # for each instance
        for instance in instances:
            prediction = self.predict(instance)
            # if the prediction was incorrect we are no longer large margin,
            # since we are using the loss from the cost-sensitive PA
            if instance.costs[prediction.label] > 0:
                errorsInRound += 1
                costInRound += instance.costs[prediction.label]
                # first we need the score for the correct answer; if the
                # instance has more than one correct answer then pick the min
                minCorrectLabelScore = float("inf")
                minCorrectLabel = None
                for label in instance.correctLabels:
                    score = instance.featureVector.dot(self.currentWeightVectors[label])
                    if score < minCorrectLabelScore:
                        minCorrectLabelScore = score
                        minCorrectLabel = label
                # the loss is the scaled margin loss also used by Mejer and Crammer (2010)
                loss = prediction.score - minCorrectLabelScore + math.sqrt(instance.costs[prediction.label])
                if adapt:
                    # calculate the confidence values, first for the predicted label
                    zVectorPredicted = mydefaultdict(mydouble)
                    zVectorMinCorrect = mydefaultdict(mydouble)
                    for feature in instance.featureVector:
                        # the variance is either a value in the dict or just 1
                        if feature in self.currentVarianceVectors[prediction.label]:
                            zVectorPredicted[feature] = instance.featureVector[feature] * self.currentVarianceVectors[prediction.label][feature]
                        else:
                            zVectorPredicted[feature] = instance.featureVector[feature]
                        # then for the minCorrect label
                        if feature in self.currentVarianceVectors[minCorrectLabel]:
                            zVectorMinCorrect[feature] = instance.featureVector[feature] * self.currentVarianceVectors[minCorrectLabel][feature]
                        else:
                            zVectorMinCorrect[feature] = instance.featureVector[feature]
                    confidence = zVectorPredicted.dot(instance.featureVector) + zVectorMinCorrect.dot(instance.featureVector)
                    beta = 1.0 / (confidence + param)
                    alpha = loss * beta
                    # update the current weight vectors
                    self.currentWeightVectors[prediction.label].iaddc(zVectorPredicted, -alpha)
                    self.currentWeightVectors[minCorrectLabel].iaddc(zVectorMinCorrect, alpha)
                    if averaging:
                        averagedWeightVectors[prediction.label].iaddc(zVectorPredicted, -alpha * updatesLeft)
                        averagedWeightVectors[minCorrectLabel].iaddc(zVectorMinCorrect, alpha * updatesLeft)
                else:
                    # the squared norm is twice the square of the features
                    # since they are the same per class
                    norm = 2 * (instance.featureVector.dot(instance.featureVector))
                    factor = loss / (norm + 1.0 / (2 * param))
                    self.currentWeightVectors[prediction.label].iaddc(instance.featureVector, -factor)
                    self.currentWeightVectors[minCorrectLabel].iaddc(instance.featureVector, factor)
                    if averaging:
                        averagedWeightVectors[prediction.label].iaddc(instance.featureVector, -factor * updatesLeft)
                        averagedWeightVectors[minCorrectLabel].iaddc(instance.featureVector, factor * updatesLeft)
                if adapt:
                    # update the diagonal covariance
                    for feature in instance.featureVector.iterkeys():
                        # for the predicted label
                        if feature in self.currentVarianceVectors[prediction.label]:
                            self.currentVarianceVectors[prediction.label][feature] -= beta * pow(zVectorPredicted[feature], 2)
                        else:
                            # never updated this covariance before, add 1
                            self.currentVarianceVectors[prediction.label][feature] = 1 - beta * pow(zVectorPredicted[feature], 2)
                        # for the minCorrect label
                        if feature in self.currentVarianceVectors[minCorrectLabel]:
                            self.currentVarianceVectors[minCorrectLabel][feature] -= beta * pow(zVectorMinCorrect[feature], 2)
                        else:
                            # never updated this covariance before, add 1
                            self.currentVarianceVectors[minCorrectLabel][feature] = 1 - beta * pow(zVectorMinCorrect[feature], 2)
            if averaging:
                updatesLeft -= 1
        print "Training error rate in round " + str(r) + " : " + str(float(errorsInRound) / len(instances))
    if averaging:
        for label in self.currentWeightVectors:
            self.currentWeightVectors[label] = mydefaultdict(mydouble)
            self.currentWeightVectors[label].iaddc(averagedWeightVectors[label], 1.0 / float(rounds * len(instances)))
    # compute the final training error and cost
    finalTrainingErrors = 0
    finalTrainingCost = 0
    for instance in instances:
        prediction = self.predict(instance)
        if instance.costs[prediction.label] > 0:
            finalTrainingErrors += 1
            finalTrainingCost += instance.costs[prediction.label]
    finalTrainingErrorRate = float(finalTrainingErrors) / len(instances)
    print "Final training error rate=" + str(finalTrainingErrorRate)
    print "Final training cost=" + str(finalTrainingCost)
    return finalTrainingCost

#!/usr/bin/env python
from _mycollections import mydefaultdict
from mydouble import mydouble, counts

d = mydefaultdict(mydouble)  # always constructed like this
d["a"] = 1  # no need to say mydouble(1); transparent to the user
print d
print d.addc(d, 0.5)

for i in xrange(500000):
    d[str(i)] = 2
print len(d)

e = d.copy()
print "before", e["a"], counts()
for i in xrange(20):
    e.iaddc(d, 0.5)
    print e["a"], counts()

def test_instance_from_svm_input(line):
    featureVector = mydefaultdict(mydouble)
    featureDict = line.split()
    for featureID, featureVal in enumerate(featureDict):
        featureVector[featureID] = float(id(intern(featureVal)))
    return Instance(featureVector)

def __init__(self, train, dev, test, learning_rate=1.0, iteration=10, shuffle=False,
             method='extract', scratch=scratch_dir, max_data_size=10000, init_model=None):
    # method: extract / joint / jointdep
    self.method = method
    # load training data
    if train:
        # file ids
        self.train_problems = self.read_problems(train, max_data_size, True)
        log.write('Load %d training problems from %s\n' % (len(self.train_problems), train))
    # load dev data
    if dev:
        self.dev_problems = self.read_problems(dev, max_data_size, False)
        log.write('Load %d dev problems from %s\n' % (len(self.dev_problems), dev))
    # load test data
    if test:
        self.test_problems = self.read_problems(test, max_data_size, False)
        log.write('Load %d test problems from %s\n' % (len(self.test_problems), test))
        self.test_task = test
    # learning params
    self.learning_rate = learning_rate
    self.shuffle = shuffle
    self.iteration = iteration
    # intermediate results
    self.scratch = scratch
    if not init_model:
        # initial weights: zero vector
        self.weights = mydefaultdict(mydouble)
    else:
        self.load_model(init_model)
    # build the initial ILP models
    t = time.clock()
    log.write('Building ILPs ... ')
    ilp_dir = '%s/ilps/%s' % (self.scratch, self.method)
    if not os.path.isdir(ilp_dir):
        os.makedirs(ilp_dir)
    if train:
        self.build_ilps(self.train_problems, self.weights, ilp_dir, maxlen[train[:3]])
    if dev:
        self.build_ilps(self.dev_problems, self.weights, ilp_dir, maxlen[dev[:3]])
    if test:
        self.build_ilps(self.test_problems, self.weights, ilp_dir, maxlen[test[:3]])
    log.write('[%.2fs]\n' % (time.clock() - t))
    # for the averaged perceptron
    self.c = 1
    # evaluate on dev
    if dev:
        summary_dir = '%s/summaries/%s' % (self.scratch, dev)
        if not os.path.isdir(summary_dir):
            os.makedirs(summary_dir)
        ref_dir = 'dat/%s/models' % dev
        if self.method == 'jointdep':
            self.evaluator = EvaluatorNgram(self.dev_problems)
        else:
            self.evaluator = EvaluatorRouge(ref_dir, summary_dir)

def _update_parameters(self, instance, prediction, averaging, adapt, param,
                       averagedWeightVectors, updatesLeft):
    """Update the weight (and, for AROW, variance) vectors for one instance."""
    # first we need the score for the correct answer; if the instance has
    # more than one correct answer then pick the min
    minCorrectLabelScore = float("inf")
    minCorrectLabel = None
    for label in instance.correctLabels:
        score = instance.featureVector.dot(self.currentWeightVectors[label])
        if score < minCorrectLabelScore:
            minCorrectLabelScore = score
            minCorrectLabel = label
    # the loss is the scaled margin loss also used by Mejer and Crammer (2010)
    loss = prediction.score - minCorrectLabelScore + math.sqrt(instance.costs[prediction.label])
    if adapt:
        # calculate the confidence values, first for the predicted label
        zVectorPredicted = mydefaultdict(mydouble)
        zVectorMinCorrect = mydefaultdict(mydouble)
        for feature in instance.featureVector:
            # the variance is either a value in the dict or just 1
            if feature in self.currentVarianceVectors[prediction.label]:
                zVectorPredicted[feature] = instance.featureVector[feature] * self.currentVarianceVectors[prediction.label][feature]
            else:
                zVectorPredicted[feature] = instance.featureVector[feature]
            # then for the minCorrect label
            if feature in self.currentVarianceVectors[minCorrectLabel]:
                zVectorMinCorrect[feature] = instance.featureVector[feature] * self.currentVarianceVectors[minCorrectLabel][feature]
            else:
                zVectorMinCorrect[feature] = instance.featureVector[feature]
        confidence = zVectorPredicted.dot(instance.featureVector) + zVectorMinCorrect.dot(instance.featureVector)
        beta = 1.0 / (confidence + param)
        alpha = loss * beta
        # update the current weight vectors
        self.currentWeightVectors[prediction.label].iaddc(zVectorPredicted, -alpha)
        self.currentWeightVectors[minCorrectLabel].iaddc(zVectorMinCorrect, alpha)
        if averaging:
            averagedWeightVectors[prediction.label].iaddc(zVectorPredicted, -alpha * updatesLeft)
            averagedWeightVectors[minCorrectLabel].iaddc(zVectorMinCorrect, alpha * updatesLeft)
    else:
        # the squared norm is twice the square of the features since they are the same per class
        norm = 2 * (instance.featureVector.dot(instance.featureVector))
        factor = loss / (norm + 1.0 / (2 * param))
        self.currentWeightVectors[prediction.label].iaddc(instance.featureVector, -factor)
        self.currentWeightVectors[minCorrectLabel].iaddc(instance.featureVector, factor)
        if averaging:
            averagedWeightVectors[prediction.label].iaddc(instance.featureVector, -factor * updatesLeft)
            averagedWeightVectors[minCorrectLabel].iaddc(instance.featureVector, factor * updatesLeft)
    if adapt:
        # update the diagonal covariance
        for feature in instance.featureVector.iterkeys():
            # for the predicted label
            if feature in self.currentVarianceVectors[prediction.label]:
                self.currentVarianceVectors[prediction.label][feature] -= beta * pow(zVectorPredicted[feature], 2)
            else:
                # never updated this covariance before, add 1
                self.currentVarianceVectors[prediction.label][feature] = 1 - beta * pow(zVectorPredicted[feature], 2)
            # for the minCorrect label
            if feature in self.currentVarianceVectors[minCorrectLabel]:
                self.currentVarianceVectors[minCorrectLabel][feature] -= beta * pow(zVectorMinCorrect[feature], 2)
            else:
                # never updated this covariance before, add 1
                self.currentVarianceVectors[minCorrectLabel][feature] = 1 - beta * pow(zVectorMinCorrect[feature], 2)

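# For reference, a hedged summary of the update rules implemented above
# (notation chosen to match the code, not any particular paper):
#   loss  = score(pred) - score(minCorrect) + sqrt(cost(pred))
#   AROW:  z_y[f] = Sigma_y[f] * x[f]  (diagonal covariance),
#          beta = 1 / (x.z_pred + x.z_min + r),  alpha = loss * beta,
#          w_pred -= alpha * z_pred,  w_min += alpha * z_min,
#          Sigma_y[f] -= beta * z_y[f]^2          (with r = param)
#   PA-II: factor = loss / (2*||x||^2 + 1/(2*C))  (with C = param)
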
def train(self, instances, averaging=True, shuffling=True, rounds=10, param=1):
    # we first need to go through the dataset to find how many classes there are.
    # Initialize the weight vectors at the beginning of training:
    # we have one variance and one weight vector per class.
    self.currentWeightVectors = {}
    self.currentVarianceVectors = {}
    if averaging:
        averagedWeightVectors = {}
        updatesLeft = rounds * len(instances)
    for label in instances[0].costs:
        self.currentWeightVectors[label] = mydefaultdict(mydouble)
        # remember: the variance vector is sparse in the sense that anything
        # that doesn't have a value is 1; every time we use it, add 1
        self.currentVarianceVectors[label] = {}
        # keep the averaged weight vector
        if averaging:
            averagedWeightVectors[label] = mydefaultdict(mydouble)
    # in each iteration
    for r in range(rounds):
        # shuffle
        if shuffling:
            random.shuffle(instances)
        errorsInRound = 0
        costInRound = 0
        # for each instance
        for instance in instances:
            prediction = self.predict(instance)
            # if the prediction was incorrect we are no longer large margin,
            # since we are using the loss from the cost-sensitive PA
            if instance.costs[prediction.label] > 0:
                errorsInRound += 1
                costInRound += instance.costs[prediction.label]
                # first we need the score for the correct answer; if the
                # instance has more than one correct answer then pick the min
                minCorrectLabelScore = float("inf")
                minCorrectLabel = None
                for label in instance.correctLabels:
                    score = instance.featureVector.dot(self.currentWeightVectors[label])
                    if score < minCorrectLabelScore:
                        minCorrectLabelScore = score
                        minCorrectLabel = label
                # calculate the confidence values, first for the predicted label
                zVectorPredicted = mydefaultdict(mydouble)
                zVectorMinCorrect = mydefaultdict(mydouble)
                for feature in instance.featureVector:
                    # the variance is either a value in the dict or just 1
                    if feature in self.currentVarianceVectors[prediction.label]:
                        zVectorPredicted[feature] = instance.featureVector[feature] * self.currentVarianceVectors[prediction.label][feature]
                    else:
                        zVectorPredicted[feature] = instance.featureVector[feature]
                    # then for the minCorrect label
                    if feature in self.currentVarianceVectors[minCorrectLabel]:
                        zVectorMinCorrect[feature] = instance.featureVector[feature] * self.currentVarianceVectors[minCorrectLabel][feature]
                    else:
                        zVectorMinCorrect[feature] = instance.featureVector[feature]
                confidence = zVectorPredicted.dot(instance.featureVector) + zVectorMinCorrect.dot(instance.featureVector)
                beta = 1.0 / (confidence + param)
                # the loss is the scaled margin loss also used by Mejer and Crammer (2010)
                loss = prediction.score - minCorrectLabelScore + math.sqrt(instance.costs[prediction.label])
                alpha = loss * beta
                # update the current weight vectors
                self.currentWeightVectors[prediction.label].iaddc(zVectorPredicted, -alpha)
                self.currentWeightVectors[minCorrectLabel].iaddc(zVectorMinCorrect, alpha)
                if averaging:
                    averagedWeightVectors[prediction.label].iaddc(zVectorPredicted, -alpha * updatesLeft)
                    averagedWeightVectors[minCorrectLabel].iaddc(zVectorMinCorrect, alpha * updatesLeft)
                # update the diagonal covariance
                for feature in instance.featureVector.iterkeys():
                    # for the predicted label
                    if feature in self.currentVarianceVectors[prediction.label]:
                        self.currentVarianceVectors[prediction.label][feature] -= beta * pow(zVectorPredicted[feature], 2)
                    else:
                        # never updated this covariance before, add 1
                        self.currentVarianceVectors[prediction.label][feature] = 1 - beta * pow(zVectorPredicted[feature], 2)
                    # for the minCorrect label
                    if feature in self.currentVarianceVectors[minCorrectLabel]:
                        self.currentVarianceVectors[minCorrectLabel][feature] -= beta * pow(zVectorMinCorrect[feature], 2)
                    else:
                        # never updated this covariance before, add 1
                        self.currentVarianceVectors[minCorrectLabel][feature] = 1 - beta * pow(zVectorMinCorrect[feature], 2)
            if averaging:
                updatesLeft -= 1
        print "Training error rate in round " + str(r) + " : " + str(float(errorsInRound) / len(instances))
    if averaging:
        for label in self.currentWeightVectors:
            # reset before adding, so the final weights are the average
            self.currentWeightVectors[label] = mydefaultdict(mydouble)
            self.currentWeightVectors[label].iaddc(averagedWeightVectors[label], 1.0 / float(rounds * len(instances)))
    # compute the final training error and cost
    finalTrainingErrors = 0
    finalTrainingCost = 0
    for instance in instances:
        prediction = self.predict(instance)
        if instance.costs[prediction.label] > 0:
            finalTrainingErrors += 1
            finalTrainingCost += instance.costs[prediction.label]
    finalTrainingErrorRate = float(finalTrainingErrors) / len(instances)
    print "Final training error rate=" + str(finalTrainingErrorRate)
    print "Final training cost=" + str(finalTrainingCost)
    return finalTrainingCost

def _unpickle_mydict(s):
    return mydefaultdict(mydouble, (take2(x) for x in s.split()))
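
# _unpickle_mydict assumes a whitespace-separated serialization in which take2
# (defined elsewhere) turns each token back into a (key, value) pair. A purely
# hypothetical round-trip sketch, if tokens were "key=value" strings:
#   def take2(x):
#       k, v = x.split('=')
#       return k, float(v)
#   _unpickle_mydict('a=1.0 b=0.5')  ->  mydefaultdict with {'a': 1.0, 'b': 0.5}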