def preprocess_bubs_format(bubs, output): """Convert grammar from bubs-parser into ldp-friendly csv format. The result is an equivalent grammar, which is much faster to load because it has been integerized. Given a gzipped grammar from bubs-parser, e.g. `eng.M2.gr.gz`, this function will generate four files: - eng.M2.gr.csv: grammar rules - eng.M2.lex.csv: lexical rules - eng.M2.lex.alphabet: mapping from terminals to integers - eng.M2.sym.alphabet: mapping from syms to integers """ sym = Alphabet() lex = Alphabet() import gzip lines = gzip.open(bubs, 'rb').readlines() reading_lex = False l = [] f = [] for line in iterview(lines[1:]): # drop first line if line.startswith('===== LEXICON'): reading_lex = True continue x = line.strip().split() if not x: continue lhs = x[0] rhs = tuple(b for b in x[2:-1]) score = x[-1] if len(rhs) == 1: rhs = (rhs[0], '') y, z = rhs lhs = sym[lhs] y = lex[y] if reading_lex else sym[y] z = sym[z] if z else -1 if reading_lex: l.append({'score': score, 'head': lhs, 'left': y}) else: f.append({'score': score, 'head': lhs, 'left': y, 'right': z}) # non-gzipped loads faster. #DataFrame(f).to_csv(gzip.open(output + '.gr.csv.gz', 'wb')) #DataFrame(l).to_csv(gzip.open(output + '.lex.csv.gz', 'wb')) DataFrame(f).to_csv(output + '.gr.csv') DataFrame(l).to_csv(output + '.lex.csv') sym.save(output + '.sym.alphabet') lex.save(output + '.lex.alphabet')
def preprocess_berkeley_format(input_prefix, output, coarsen=False): """ Preprocessing: convert PTB grammar into simple tsv format. """ def g(x): if coarsen: x = x.split('^')[0] x = x.split('_')[0] return x sym = Alphabet() lex = Alphabet() lexical_rules = [] for x in file(input_prefix + '.lexicon'): [(x, y, s)] = re.findall(r'(\S+)\s+(\S+)\s*\[(.*?)\]', x) s = float(s) x = g(x) y = g(y) lexical_rules.append({'score': log(s), 'head': sym[x], 'left': lex[y]}) rules = [] for x in file(input_prefix + '.grammar'): x, y = x.split(' -> ') y = y.split() if len(y) == 2: y, s = y s = float(s) z = -1 else: assert len(y) == 3 y, z, s = y s = float(s) x = g(x) y = g(y) if x == y and z == -1: continue x = sym[x] y = sym[y] if z != -1: z = g(z) z = sym[z] rules.append({'score': log(s), 'head': x, 'left': y, 'right': z}) DataFrame(rules).to_csv(output + '.gr.csv') DataFrame(lexical_rules).to_csv(output + '.lex.csv') sym.save(output + '.sym.alphabet') lex.save(output + '.lex.alphabet')
class Segmenter(object): """ Segmenter """ def __init__(self, train, dev, test, decode_type, split_num, log_fname=None, segmenter_type='tree', G=3, weights='weights', alphabet=None, T_L=2, T_eta=1.0, T_C=0.0000001, S_L=2, S_eta=1.0, S_C=0.00000001): # set up the logging system log = logging.getLogger('') log.setLevel(logging.INFO) format = logging.Formatter("%(asctime)s<>%(levelname)s<>%(message)s") ch = logging.StreamHandler(sys.stdout) ch.setFormatter(format) log.addHandler(ch) fh = logging.handlers.RotatingFileHandler(log_fname, maxBytes=(1048576 * 5), backupCount=7) fh.setFormatter(format) log.addHandler(fh) self.G = G self.split_num = split_num self.segmenter_type = segmenter_type self.decode_type = decode_type self.S_L, self.S_eta, self.S_C = S_L, S_eta, S_C self.T_L, self.T_eta, self.T_C = T_L, T_eta, T_C self.sig = "split={0},L={1},C={2},eta={3},type={4}".format( *(self.split_num, self.T_L, self.T_C, self.T_eta, self.decode_type)) logging.info("Transducer Regularizer Type: L={0}".format(T_L)) logging.info("Transducer Regularizer Coefficient: C={0}".format(T_C)) logging.info("Transducer Learning Rate: eta={0}".format(T_eta)) logging.info("Segmenter Regularizer Type: L={0}".format(S_L)) logging.info("Segmenter Regularizer Coefficient: C={0}".format(S_C)) logging.info("Segmenter Learning Rate: eta={0}".format(S_eta)) self.weights = weights self.Sigma = Alphabet() self.Sigma.add("") # add epsilon at 0 self.Sigma.add("o") self.Sigma.add("n") self.Sigma.add("y") self.Sigma.add("s") self.Sigma.add("e") if alphabet is not None: self.Sigma = self.Sigma.load(alphabet) # processs the data # TODO: modularize self.train = self.process(train, 100000) self.dev = self.process(dev, 1000) self.test = self.process(test, 1000) # dump the alphabet self.Sigma.save("alphabets/sigma-{0}.alphabet".format(self.split_num)) # create model self.segmenter = None logging.info("Segmenter Type: {0}".format(self.segmenter_type)) logging.info("Decoder Type: {0}".format(self.decode_type)) if self.segmenter_type == TREE: self.segmenter = TreeSegmenter(G) self.features = TreeFeatures(self.G, 1000) elif self.segmenter_type == CHUNK: self.segmenter = ChunkSegmenter(G) self.features = ChunkFeatures(self.G, 1000) else: raise Exception('Illicit Model Type') # transducer self.transducer = TransducerModel(self.train, self.dev, self.test, self.Sigma, L=self.T_L, eta=self.T_eta, C=self.T_C) # extract features self.features.featurize(self.train, 'train') self.features.featurize(self.dev, 'dev') self.features.featurize(self.test, 'test') # dimension of data self.d = 2**22 + self.features.offset self.updater = LazyRegularizedAdagrad(self.d, L=2, C=self.S_C, eta=0.1, fudge=1e-4) self.updater.w[0] = 10 self.updater.w[1] = 10 def save_transducer(self, directory, i): """ save the transducer weights """ np.save(directory + "/transducer-{0}-{1}.npy".format(*(self.sig, i)), array(self.transducer.updater.w)) def save_segmenter(self, directory, i): np.save(directory + "/segmenter-{0}-{1}.npy".format(*(self.sig, i)), array(self.updater.w)) def save(self, directory): self.save_transducer(directory, 'final') self.save_segmenter(directory, 'final') def optimize(self, t=None, load=False, transducer=None, segmenter=None, iterations=20): """ optimize the model """ if load: assert transducer is not None #assert segmenter is not None self.load_weights(transducer, segmenter) if t is None: return elif t == JOINT: self.optimize_joint(iterations) elif t == PIPE: self.optimize_pipeline(iterations) def load_weights(self, transducer, segmenter): """ load weights """ self.transducer.updater.w = np.load(transducer) self.updater.w = np.load(segmenter) def optimize_pipeline(self, iterations=10): """ optimize """ for i in xrange(iterations): self.transducer.optimize(1, i) train_acc = self.transducer.evaluate(self.train) dev_acc = self.transducer.evaluate(self.dev) test_acc = self.transducer.evaluate(self.test) logging.info( "transducer epoch {0} train acc: {1}".format(*(i, train_acc))) logging.info( "transducer epoch {0} dev acc: {1}".format(*(i, dev_acc))) logging.info( "transducer epoch {0} test acc: {1}".format(*(i, test_acc))) self.save_transducer(self.weights, i) print self.transducer.evaluate(self.dev) if self.segmenter_type == TREE: self.optimize_tree(iterations) elif self.segmenter_type == CHUNK: self.optimize_chunk(iterations) def optimize_chunk(self, iterations): """ optimize the model """ for i in xrange(iterations): for tree in iterview(self.train, colored('Pass %s' % (i + 1), 'blue')): gt0, gt, ge = self.features.potentials_catchup( tree, self.updater.w, self.updater) dgt0, dgt, dge = zeros_like(gt0), zeros_like(gt), zeros_like( ge) self.segmenter.dll(tree, ge, gt0, gt, dge, dgt0, dgt) self.features.update(tree, dge, dgt0, dgt, self.updater) self.updater.step += 1 self.save_segmenter(self.weights, i) self.decode(self.train, VITERBI) self.decode(self.dev, VITERBI) test_acc, test_f1 = self.decode(self.test, VITERBI) logging.info("chunk epoch {0} train acc: {1}".format(*(i, train_acc))) logging.info("chunk epoch {0} dev acc: {1}".format(*(i, dev_acc))) logging.info("chunk epoch {0} test acc: {1}".format(*(i, test_acc))) logging.info("chunk epoch {0} train f1: {1}".format(*(i, train_f1))) logging.info("chunk epoch {0} dev f1: {1}".format(*(i, dev_f1))) logging.info("chunk epoch {0} test f1: {1}".format(*(i, dev_f1))) def optimize_tree(self, iterations): """ optimize the model """ for i in xrange(iterations): for tree in iterview(self.train, colored('Pass %s' % (i + 1), 'blue')): psi = self.features.potentials_catchup(tree, self.updater.w, self.updater) dpsi = self.segmenter.dll(tree, psi) self.features.update(tree, dpsi, self.updater) self.updater.step += 1 self.save_segmenter(self.weights, i) self.decode(self.train, VITERBI) self.decode(self.dev, VITERBI) test_acc, test_f1 = self.decode(self.test, VITERBI) logging.info("tree epoch {0} train acc: {1}".format(*(i, train_acc))) logging.info("tree epoch {0} dev acc: {1}".format(*(i, dev_acc))) logging.info("tree epoch {0} test acc: {1}".format(*(i, test_acc))) logging.info("tree epoch {0} train f1: {1}".format(*(i, train_f1))) logging.info("tree epoch {0} dev f1: {1}".format(*(i, dev_f1))) logging.info("tree epoch {0} test f1: {1}".format(*(i, test_f1))) def optimize_joint(self, iterations, num_samples=10, eta1=0.0, eta2=0.0): """ optimize jointly using importance sampling """ # TODO: unit test self.updater.eta = eta1 for i in xrange(iterations): samples = self.transducer.sample(self.train, num=num_samples) for tree, sample in iterview(zip(self.train, samples), colored('Pass %s' % (i + 1), 'blue')): # compute approximate partition function logZ = NINF strings, weights = [], [] for (ur, count) in sample.items(): score = self.transducer.ll(tree, ur) if self.segmenter_type == CHUNK: score += self.score_chunk(tree, ur) elif self.segmenter_type == TREE: score += self.score_tree(tree, ur) # TODO: double check logZ = logaddexp(logZ, score) weights.append(score) strings.append(ur) #TODO: make more elegant tmp = [] for weight in weights: tmp.append(weight - logZ) # TODO: double check weights = tmp # take a tranducer weight gradient step with the importance sampling self.transducer.step_is(tree, strings, weights, eta=eta2) # take a segmenter weight gradient step with the importance sampling for ur, weight in zip(sample, weights): if self.segmenter_type == CHUNK: self.is_chunk(tree, ur, weight) elif self.segmenter_type == TREE: self.is_tree(tree, ur, weight) self.updater.step += 1 def is_chunk(self, tree, ur, weight): """ importance sampling gradient step tree """ tree.update_ur(ur) self.features.featurize_instance(tree) gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w, self.updater) dgt0, dgt, dge = zeros_like(gt0), zeros_like(gt), zeros_like(ge) self.segmenter.dll(tree, ge, gt0, gt, dge, dgt0, dgt) dgt0 *= weight dgt *= weight dge *= weight self.features.update(tree, dge, dgt0, dgt, self.updater) def is_tree(self, tree, ur, weight): """ importance sampling gradient step chunk """ tree.update_ur(ur) self.features.featurize_instance(tree) psi = self.features.potentials_catchup(tree, self.updater.w, self.updater) dpsi = self.segmenter.dll(tree, psi) dpsi *= weight self.features.update(tree, dpsi, self.updater) def baseline_ur(self, data): """ baseline ur """ for tree in iterview(data, colored('Updating Baseline UR', 'red')): tree.ur_samples = [] tree.ur_samples.append(tree.sr) def decode_ur(self, data): """ decodes the UR """ for tree in iterview(data, colored('Updating Viterbi UR', 'red')): tree.ur_samples = [] viterbi_ur = self.transducer.decode(tree)[1] tree.ur_samples.append(viterbi_ur) def oracle_ur(self, data): """ uses the oracle UR """ for tree in iterview(data, colored('Updating Oracle UR', 'red')): tree.ur_samples = [] tree.ur_samples.append(tree.ur_gold) def sample_ur(self, data, num_samples=1000): """ samples the UR """ samples = self.transducer.sample(data, num=num_samples) for tree, samples in iterview(zip(data, samples), colored('Sampling', 'red')): tree.ur_samples = [] viterbi_ur = self.transducer.decode(tree)[1] tree.ur_samples.append(viterbi_ur) for sample, count in samples.items(): tree.ur_samples.append(sample) def decode_chunk(self, tree, ur): """ decodes a chunk """ tree.update_ur(ur) self.features.featurize_instance(tree) gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w, self.updater) best, segments, labels = self.segmenter.decode(tree, ge, gt0, gt) truth = [tree.ur[i:j] for i, j in tree.indices] guess = [tree.ur[i:j] for i, j in segments] return truth, guess def decode_tree(self, tree, ur): """ decodes a tree """ tree.update_ur(ur) self.features.featurize_instance(tree) psi = self.features.potentials_catchup(tree, self.updater.w, self.updater) max_score, tree_string, max_spans = self.segmenter.argmax( tree.M, self.G, psi, tree.ur) gold_spans = set(tree.spans) guess_spans = set(tree.spans) p, r = 0.0, 0.0 for span in gold_spans: if span in guess_spans: p += 1.0 p /= len(gold_spans) for span in guess_spans: if span in gold_spans: r += 1.0 r /= len(guess_spans) f1 = (2 * p * r) / (p + r) # TODO: horrible hack segmentation = tree_string.replace("(", "").replace(")", "").replace(" ", "") for i in xrange(100): segmentation = segmentation.replace(str(i), "") segmentation = segmentation.split(":") guess = segmentation[:-1] truth = [x[0] for x in to_segmentation(tree.root)] return truth, guess, f1 def score_chunk(self, tree, ur): """ scores a chunk """ tree.update_ur(ur) self.features.featurize_instance(tree) M = tree.M gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w, self.updater) return self.segmenter.logZ(ge, gt0, gt, M) def score_tree(self, tree, ur): """ scores a tree """ tree.update_ur(ur) self.features.featurize_instance(tree) M = tree.M psi = self.features.potentials_catchup(tree, self.updater.w, self.updater) return self.segmenter.logZ(M, self.G, psi) def decode(self, data, data_type, decode_type=None, sep=u"#"): """ decode the chunker """ if decode_type is None: decode_type = self.decode_type if decode_type == ORACLE: self.oracle_ur(data) elif decode_type == BASELINE: self.baseline_ur(data) elif decode_type == VITERBI: self.decode_ur(data) elif decode_type == SAMPLE: self.sample_ur(data) else: raise Exception('Illicit Decode Type') ur_correct, ur_total = 0, 0 correct, f1, tree_f1, lev, total = 0, 0, 0, 0, 0 for tree in iterview(data, colored('Decoding', 'red')): max_ur, max_score = None, NINF counter = 0 for ur in tree.ur_samples: tree.update_ur(ur) counter += 1 score = 0.0 score = self.transducer.ll(tree, ur) #print #print "LL", self.transducer.ll(tree, ur) if self.segmenter_type == CHUNK: score += self.score_chunk(tree, ur) #print "SCORE", self.score_chunk(tree, ur) #print ur #raw_input() elif self.segmenter_type == TREE: score += self.score_tree(tree, ur) # take the best importance sample if score >= max_score: max_score = score max_ur = ur #print "counter", counter if max_ur == tree.ur_gold: ur_correct += 1 ur_total += 1 truth, guess, tree_f1_tmp = None, None, None if self.segmenter_type == CHUNK: truth, guess = self.decode_chunk(tree, max_ur) elif self.segmenter_type == TREE: truth, guess, tree_f1_tmp = self.decode_tree(tree, max_ur) tree_f1 += tree_f1_tmp # ACCURACY if truth == guess: correct += 1 # LEVENSHTEIN lev += Levenshtein.distance(sep.join(truth), sep.join(guess)) # F1 set1, set2 = set(guess), set(truth) p, r = 0, 0 for e in set1: if e in set2: p += 1 for e in set2: if e in set1: r += 1 p /= len(set1) r /= len(set2) if p + r > 0: f1 += 2 * p * r / (p + r) total += 1 logging.info("decoder type: {0}".format(decode_type)) logging.info("{0} ur acc: {1}".format(*(data_type, ur_correct / total))) logging.info("{0} seg acc: {1}".format(*(data_type, correct / total))) logging.info("{0} f1: {1}".format(*(data_type, f1 / total))) logging.info("{0} edit: {1}".format(*(data_type, lev / total))) if self.segmenter_type == TREE: logging.info("{0} tree f1: {1}".format(*(data_type, tree_f1 / total))) def process(self, fname, maximum=100): """ Put the string data into the data structures necessary for training and decoding the model. """ processed = [] data = Data(fname) for counter, (sr, (tree, (indices, index_labels))) in enumerate(data): if counter == maximum: break ur = to_string(tree) for s in list(sr): self.Sigma.add(s) for s in list(ur): self.Sigma.add(s) spans, labels = [], [] for node in walk(tree): spans.append((node.i, node.j)) labels.append(node.label) t = Tree(self.G, sr, ur, spans, labels, indices, index_labels, len(spans), tree) processed.append(t) return processed