def write(self, filename="-", weights=None):
    """Write the templates and all non-negligible feature weights as text.

    The templates are emitted first through ``self.print_templates``.  Then
    one ``feature<TAB>weight`` line is written per feature, in sorted feature
    order, skipping weights whose absolute value is at most 1e-3.  When
    ``self.unk > 0`` a final line beginning with a single space lists the
    sorted contents of ``self.knowns``.  Progress is reported on ``logs``.

    filename -- output path; "-" (the default) writes to standard output.
                Any other name is opened in "wt" mode and closed again
                before returning, even if writing fails.
    weights  -- mapping from feature to weight; defaults to self.weights.
    """
    if weights is None:
        weights = self.weights

    to_stdout = filename == "-"
    if to_stdout:
        outfile = sys.stdout
        filename = "STDOUT"  # from here on only used in the progress message
    else:
        outfile = open(filename, "wt")

    try:
        self.print_templates(outfile)

        mytime = Mytime()
        nonzero = 0
        print >> logs, "sorting %d features..." % len(weights),
        for i, f in enumerate(sorted(weights), 1):
            if i == 1:
                # sorted() has already returned when the first item arrives,
                # so this is the moment to report the sorting time.
                print >> logs, "done in %.2lf seconds." % mytime.period()
                print >> logs, "writing features to %s..." % filename
            v = weights[f]
            if math.fabs(v) > 1e-3:
                print >> outfile, "%s\t%.5lf" % (f, v)
                nonzero += 1

        if self.unk > 0:
            # The leading " " marks this line as the known-words line.
            print >> outfile, " " + " ".join(sorted(self.knowns))

        # nonzero can be smaller than the number of features visited.
        print >> logs, "%d nonzero feature instances written in %.2lf seconds." % \
            (nonzero, mytime.period())
    finally:
        # Release the handle we opened ourselves; never close sys.stdout.
        if not to_stdout:
            outfile.close()
def read_weights(self, filename, infertemplates=False):
    '''Load feature weights from the stream returned by self.read_templates().

    Every regular line is "<instance> <weight>", for example
    "s0t-q0t=LRB-</s>=>LEFT 3.8234", and is stored as a float in
    self.weights.  A line starting with a space is taken to be the
    known-words line: its tokens become self.knowns, self.unk is set to 1
    and reading stops there.

    filename       -- passed to self.read_templates() and shown in the log.
    infertemplates -- accepted for compatibility; the value is recomputed
                      below and the argument itself is not consulted.
    '''
    stream = self.read_templates(filename)

    # Templates are inferred from the instances exactly when none are known
    # after read_templates() has run.
    infertemplates = len(self.templates) == 0
    if infertemplates:
        print >> logs, "will infer templates from weights..."

    stopwatch = Mytime()
    lineno = 0  # stays 0 when there is no stream or the stream is empty
    if stream is not None:
        print >> logs, "reading feature weights from %s\t" % filename,
        for lineno, row in enumerate(stream, 1):
            if lineno % 200000 == 0:
                print >> logs, "%d lines read..." % lineno,

            if row[0] == " ":
                # Known-words line: consume it and stop reading.
                self.knowns = set(row.split())
                print >> logs, "\n%d known words read." % len(self.knowns)
                # Any positive value works here; 1 is used as the marker.
                self.unk = 1
                break

            name, value = row.split()
            self.weights[name] = float(value)
            if infertemplates:
                # The template is the part before the first "=";
                # it is registered with a count of one occurrence.
                template = name.split("=", 1)[0]
                self.add_template(template, 1)

    summary = (len(self.weights), lineno, stopwatch.period())
    print >> logs, "\n%d feature instances (%d lines) read in %.2lf seconds." % summary
    self.print_autoevals()
def write(self, filename="-", weights=None):
    """Write the templates and all non-negligible per-action weights as text.

    The templates are emitted first through ``self.print_templates``.  Then,
    for every action in *weights* and every feature of that action in sorted
    order, one line ``<template>=<instance>=><action><TAB><weight>`` is
    written, skipping weights whose absolute value is at most 1e-3.  The
    template name is looked up as ``self.list_templates[int(tid)][0]``,
    where ``tid`` is the part of the feature key before its first "=".

    filename -- output path; "-" (the default) writes to standard output.
                Any other name is opened in "wt" mode and closed again
                before returning, even if writing fails.
    weights  -- mapping action -> {feature key: value}, where each feature
                key is a string "<template id>=<instance>"; defaults to
                self.weights.
    """
    if weights is None:
        weights = self.weights

    to_stdout = filename == "-"
    if to_stdout:
        outfile = sys.stdout
        filename = "STDOUT"  # kept for parity with the log-friendly name
    else:
        outfile = open(filename, "wt")

    try:
        self.print_templates(outfile)

        mytime = Mytime()
        nonzero = 0
        for action, feats in weights.iteritems():
            for f in sorted(feats):
                # Values may be a custom numeric type, hence the float().
                v = float(feats[f])
                if math.fabs(v) > 1e-3:
                    tid, feat = f.split("=", 1)
                    print >> outfile, "%s=%s=>%s\t%.5lf" % (
                        self.list_templates[int(tid)][0], feat, action, v)
                    nonzero += 1

        # nonzero can be smaller than the number of features visited.
        print >> logs, "%d nonzero feature instances written in %.2lf seconds." % \
            (nonzero, mytime.period())
    finally:
        # Release the handle we opened ourselves; never close sys.stdout.
        if not to_stdout:
            outfile.close()
def read_weights(self, filename, infertemplates=False):
    '''Load feature weights from the stream returned by self.read_templates().

    Every line is "<instance> <weight>", for example
    "s0t-q0t=LRB-</s>=>LEFT 3.8234".  The weight is wrapped in
    WVector.value_class and stored in self.weights under a key whose shape
    depends on Model.doublehash and the FLAGS settings:

      doublehash == 1, tuplefeats  -> self.weights[action][(tid,) + parts]
      doublehash == 1              -> self.weights[action][feature]
      doublehash == 2              -> self.weights[mapped action][tid][feature]
      otherwise                    -> self.weights[full instance string]

    filename       -- passed to self.read_templates() and shown in the log.
    infertemplates -- accepted for compatibility; the value is recomputed
                      below and the argument itself is not consulted.
    '''
    stream = self.read_templates(filename)

    # Templates are inferred when at most one is known after read_templates().
    infertemplates = len(self.templates) <= 1
    if infertemplates:
        print >> logs, "will infer templates from weights..."

    stopwatch = Mytime()
    lineno = 0  # stays 0 when there is no stream or the stream is empty
    if stream is not None:
        print >> logs, "reading feature weights from %s\t" % filename,
        for lineno, row in enumerate(stream, 1):
            if lineno % 200000 == 0:
                print >> logs, "%d lines read..." % lineno,

            feat, raw = row.split()
            # value_class may be a custom numeric type rather than float.
            value = WVector.value_class(float(raw))

            if FLAGS.use_template_id:
                # Replace the template name by its numeric id in the key.
                template, instance = feat.split("=", 1)
                tid = self.templates[template]
                feat = "%d=%s" % (tid, instance)

            # NOTE(review): the branches below that use `instance` / `tid`
            # rely on FLAGS.use_template_id having bound them above.
            layout = Model.doublehash
            if layout == 1 and FLAGS.tuplefeats:
                body, action = instance.rsplit("=>", 1)
                parts = tuple(body.split("|"))
                if FLAGS.integerize:
                    parts = tuple([Vocab.str2id(p) for p in parts])
                self.weights[action][(tid, ) + parts] = value
            elif layout == 1:
                body, action = feat.rsplit("=>", 1)
                self.weights[action][body] = value
            elif layout == 2:
                body, action = instance.rsplit("=>", 1)
                action = Model.mapnames[action]
                self.weights[action][tid][body] = value
            else:
                self.weights[feat] = value

            if infertemplates:
                # The template is the part of the key before the first "=";
                # it is registered with a count of one occurrence.
                self.add_template(feat.split("=", 1)[0], 1)

    summary = (len(self.weights), lineno, stopwatch.period())
    print >> logs, "\n%d feature instances (%d lines) read in %.2lf seconds." % summary
    self.print_autoevals()
yield raw_input() except: return logs = sys.stderr from collections import defaultdict from svector import Vector from model import Model from deptree import DepTree, DepVal from mytime import Mytime mytime = Mytime() import gflags as flags FLAGS = flags.FLAGS flags.DEFINE_integer("beam", 1, "beam width", short_name="b") flags.DEFINE_integer( "leftbeam", 1000, "leftptrs beam width" ) # number of left items (predictors to be combined w/ current) flags.DEFINE_integer("kbest", 0, "kbest", short_name="k") flags.DEFINE_boolean("forest", False, "dump the forest") flags.DEFINE_boolean("earlystop", False, "try early stop (compared with gold seq)") flags.DEFINE_integer("debuglevel", 0,
def train(self):
    """Run perceptron training for self.iter iterations, keeping the best dev score.

    Each iteration:
      1. runs self.one_pass_on_train() and logs memory usage;
      2. if self.avg is set, switches self.weights to averaged weights with
         set_avg(self.c);
      3. evaluates on dev with self.eval_on_dev() and logs a summary line;
      4. on a new best score, either dumps the weights immediately or, when
         FLAGS.dump_last is set, stores a deep copy in self.bestweights;
      5. if self.avg is set, restores the weights with reset_avg(self.c);
      6. forces a garbage collection.

    After the loop the best iteration is reported and, when FLAGS.dump_last
    is set, self.bestweights is dumped.

    Unlike the earlier version, the closing summary no longer fails with a
    NameError/AttributeError when self.iter < 1 or when no iteration beats
    the initial score of 0; a short notice is logged instead.
    """
    start_mem = memory()
    starttime = time.time()
    print >> logs, "starting perceptron at", time.ctime()

    best_prec = 0
    # Defaults keep the final summary well defined even if the loop body
    # never runs or no iteration improves on best_prec.
    best_it = 0
    best_wlen = 0
    best_time = 0.0
    it = 0

    for it in xrange(1, self.iter + 1):
        print >> logs, "iteration %d starts..............%s" % (
            it, time.ctime())
        curr_mem = memory()
        iterstarttime = time.time()

        self.decoder.num_edges = 0
        num_updates, early_updates, num_steps = self.one_pass_on_train()

        print >> logs, "memory usage at iter %d: extra %s, total %s" % (
            it, human(memory(curr_mem)), human(memory(start_mem)))
        if FLAGS.debuglevel >= 1:
            print >> logs, "weights=", self.weights

        # NOTE(review): this value is not read again before being
        # reassigned at the top of the next iteration; call kept as-is.
        curr_mem = memory()

        print >> logs, "iteration %d training finished at %s. now evaluating on dev..." % (
            it, time.ctime())

        avgtime = 0
        timer = Mytime()
        if self.avg:
            # Evaluate with averaged weights; undone by reset_avg() below.
            self.weights.set_avg(self.c)
            avgtime += timer.gap()
            if FLAGS.debuglevel >= 1:
                print >> logs, "avgweights=", self.weights

        prec = self.eval_on_dev()
        print >> logs, "eval on dev took %.1f seconds." % timer.gap()

        # The positional indices are deliberately out of order; the argument
        # list must stay aligned with the {n} references in the format.
        print >> logs, "at iteration {0}, updates= {1} (early {4}), dev= {2}{7}, |w|= {3}, time= {5:.3f}h acctime= {6:.3f}h, root={10:.1%}"\
            .format(it, num_updates, prec, len(self.weights), early_updates,
                    (time.time() - iterstarttime)/3600,
                    (time.time() - starttime)/3600.,
                    "+" if prec > best_prec else "",
                    num_steps, self.decoder.num_edges, prec.root())
        logs.flush()

        if prec > best_prec:
            best_prec = prec
            best_it = it
            best_wlen = len(self.weights)
            best_time = time.time() - starttime
            print >> logs, "new high at iteration {0}: {1}. Dumping Weights...".format(
                it, prec)
            if not FLAGS.dump_last:
                self.dump(self.weights)
            else:
                # Keep a private copy; it is dumped once training is over.
                self.bestweights = self.weights.deepcopy()

        if self.avg:
            timer = Mytime()
            self.weights.reset_avg(self.c)  # restore the non-averaged weights
            t = timer.gap()
            print >> logs, "avg weights (set/reset) took %.1f+%.1f=%.1f seconds." % (
                avgtime, t, avgtime + t)

        gc.collect()

        if FLAGS.mydouble:
            from mydouble import counts
            print >> logs, "mydouble usage and freed: %d %d" % counts()

    if best_it:
        print >> logs, "peaked at iteration {0}: {1} ({3:.1f}h), |bestw|= {2}.".format(
            best_it, best_prec, best_wlen, best_time / 3600)
        print >> logs, best_prec.details()
    else:
        # best_prec is still the plain integer 0 here, so there is no
        # peak to report and no details() to print.
        print >> logs, "no iteration improved on the initial dev score."

    print >> logs, "perceptron training of %d iterations finished on %s (took %.2f hours)" % \
        (it, time.ctime(), (time.time() - starttime)/3600.)

    # NOTE(review): within this method self.bestweights is only assigned
    # when some iteration improved; confirm it is initialised elsewhere.
    if FLAGS.dump_last:
        self.dump(self.bestweights)
def read_weights_and_insert_different_noise(self, filename, noise_info, infertemplates=False):
    '''Load feature weights, perturbing each one with its own Gaussian noise.

    Every line is "<instance> <weight>", for example
    "s0t-q0t=LRB-</s>=>LEFT 3.8234".  Line k is paired with entry k of the
    array loaded from noise_info['noise_file_path'], and the noise

        float(entry) * numpy.random.randn() + float(mu)

    is added to the weight (method 'a') or multiplied into it (method 'm').
    Any other method leaves the weight unchanged.  The perturbed weight is
    stored as a plain float, under a key whose shape depends on
    Model.doublehash and the FLAGS settings.

    If the weights stream and the noise array differ in length, only the
    common prefix is read and a warning is written to ``logs`` (previously
    the extra entries were dropped without notice).

    filename       -- passed to self.read_templates() and shown in the log.
    noise_info     -- dict such as
                          {'method': FLAGS.noise_method,
                           'mu': FLAGS.mu,
                           'sigma': FLAGS.sigma,
                           'noise_file_path': FLAGS.noise_file_path}
                      'sigma' is not used by this method.
    infertemplates -- accepted for compatibility; the value is recomputed
                      below and the argument itself is not consulted.
    '''
    method = noise_info['method']
    mu = noise_info['mu']
    noises_vector = numpy.load(noise_info['noise_file_path'])

    infile = self.read_templates(filename)

    # Templates are inferred when at most one is known after read_templates().
    infertemplates = len(self.templates) <= 1
    if infertemplates:
        print >> logs, "will infer templates from weights..."

    mytime = Mytime()
    i = 0  # stays 0 when there is no stream or nothing is read
    if infile is not None:
        print >> logs, "reading feature weights from %s\t" % filename,

        # Read all lines up front so a length mismatch with the noise
        # array can be detected instead of being silently cut by zip().
        lines = list(infile)
        if len(lines) != len(noises_vector):
            print >> logs, "\nWARNING: %d weight lines but %d noise coefficients; only the first %d will be read." % (
                len(lines), len(noises_vector),
                min(len(lines), len(noises_vector)))

        for i, (line, noises_vector_coeff) in enumerate(
                zip(lines, noises_vector), 1):
            if i % 200000 == 0:
                print >> logs, "%d lines read..." % i,

            feat, weight = line.split()
            # Plain float here (not WVector.value_class), as before.
            weight = float(weight)

            # One draw per line, whether or not the method uses it, so the
            # random stream advances the same way for every method.
            noise_ = float(noises_vector_coeff) * numpy.random.randn() + float(mu)
            if method == 'a':
                weight = weight + noise_
            elif method == 'm':
                weight = weight * noise_

            if FLAGS.use_template_id:
                # Replace the template name by its numeric id in the key.
                template, instance = feat.split("=", 1)
                tid = self.templates[template]
                feat = "%d=%s" % (tid, instance)

            # NOTE(review): the branches below that use `instance` / `tid`
            # rely on FLAGS.use_template_id having bound them above.
            if Model.doublehash == 1:
                if FLAGS.tuplefeats:
                    f, action = instance.rsplit("=>", 1)
                    fs = tuple(f.split("|"))
                    self.weights[action][(tid, ) + fs] = weight
                else:
                    f, action = feat.rsplit("=>", 1)
                    self.weights[action][f] = weight
            elif Model.doublehash == 2:
                f, action = instance.rsplit("=>", 1)
                action = Model.mapnames[action]
                self.weights[action][tid][f] = weight
            else:
                self.weights[feat] = weight

            if infertemplates:
                # The template is the part of the key before the first "=";
                # it is registered with a count of one occurrence.
                self.add_template(feat.split("=", 1)[0], 1)

    print >> logs, "\n%d feature instances (%d lines) read in %.2lf seconds." % \
        (len(self.weights), i, mytime.period())
    self.print_autoevals()