def build_domain(data):
    """
    Build the label domain and the *supported* attribute domain.

    Every label occurring in an example's ground truth (`x.truth`) and every
    attribute attached to a token of an example's `sequence` is registered
    in an `Alphabet`, which assigns each label and feature an integer.

    Returns the pair `(L, A)`: the label alphabet after `freeze()` and the
    attribute alphabet after `stop_growth()`.
    """
    labels = Alphabet()
    attributes = Alphabet()

    for example in data:
        # labels seen in the ground-truth configuration
        labels.add_many(example.truth)
        # attributes observed on any token of the sequence
        attributes.add_many(
            attribute
            for token in example.sequence
            for attribute in token.attributes
        )

    # domains are now ready: no further symbols should be added
    labels.freeze()
    attributes.stop_growth()
    return (labels, attributes)
def build_domain(data):
    """
    Build the label domain and the *supported* feature domain.

    Supported features are those active along the ground-truth label path
    of some example in `data`.  Every label and every such feature is
    registered in an `Alphabet`, which assigns it an integer.

    Returns the pair `(L, A)`: the label alphabet after `freeze()` and the
    feature alphabet after `stop_growth()`.
    """
    labels = Alphabet()
    features = Alphabet()

    for example in data:
        # add labels to the label domain
        labels.add_many(example.truth)

        # extract features of the target (gold) path
        extract = example.F
        gold = example.truth

        # the first position has no predecessor label
        features.add_many(extract(0, None, gold[0]))

        # remaining positions are scored as (previous label, current label)
        features.add_many(
            feature
            for position in xrange(1, example.N)
            for feature in extract(position, gold[position - 1], gold[position])
        )

    # domains are now ready: no further symbols should be added
    labels.freeze()
    features.stop_growth()
    return (labels, features)
def build_domain(data):
    """
    Determine the label set and the set of *supported* features.

    A feature is supported when it is active in the ground-truth
    configuration of at least one example.  Labels and features are each
    collected in their own `Alphabet`, which maps them to integers.

    Returns a tuple `(L, A)` holding the label alphabet (after `freeze()`)
    and the feature alphabet (after `stop_growth()`).
    """
    label_domain = Alphabet()
    feature_domain = Alphabet()

    for instance in data:
        # labels of the ground truth go into the label domain
        label_domain.add_many(instance.truth)

        # feature function and target path of this instance
        feature_fn = instance.F
        target = instance.truth

        # initial position: the previous label is None
        initial = feature_fn(0, None, target[0])
        feature_domain.add_many(initial)

        # transitions along the rest of the target path
        transitions = (
            key
            for t in xrange(1, instance.N)
            for key in feature_fn(t, target[t - 1], target[t])
        )
        feature_domain.add_many(transitions)

    # domains are now ready: close both alphabets
    label_domain.freeze()
    feature_domain.stop_growth()
    return (label_domain, feature_domain)
class PTB(object):
    "Load the POS-tagged Penn Treebank."

    def __init__(self, base, coarse=True):
        """
        Read all sections below directory `base` and build the data splits.

        `coarse` selects the tag scheme used by `read_section`.  While the
        training split is read, vocabulary, word/tag co-occurrence sets,
        word frequencies and prefix/suffix counts are accumulated.
        """
        self.base = base
        self.coarse = coarse

        self.Y = Alphabet()   # tag set
        self.V = Alphabet()   # vocabulary
        self.V_freq = {}      # word -> training-set frequency
        self.V2Y = dd(set)    # word -> tag names seen with it
        self.Y2V = dd(set)    # tag name -> words seen with it

        self.train = []
        self.dev = []
        self.test = []

        self.prefixes = {}    # prefix -> training-set count
        self.suffixes = {}    # suffix -> training-set count
        self.prefix2int = {}
        self.suffix2int = {}

        # Read data and create standard splits according to
        # http://aclweb.org/aclwiki/index.php?title=POS_Tagging_(State_of_the_art)

        # train split [0,18]
        for sectionid in range(19):
            for sentence in self.read_section(sectionid):
                self.train.append(sentence)
                for y, w in sentence:
                    self.V.add(w)
                    self.V2Y[w].add(self.Y.lookup(y))
                    self.Y2V[self.Y.lookup(y)].add(w)
                    self.V_freq[w] = self.V_freq.get(w, 0) + 1
                    for prefix in self.extract_prefixes(w):
                        self.prefixes[prefix] = self.prefixes.get(prefix, 0) + 1
                    for suffix in self.extract_suffixes(w):
                        self.suffixes[suffix] = self.suffixes.get(suffix, 0) + 1

        # dev split [19,21]
        for sectionid in range(19, 22):
            self.dev.extend(self.read_section(sectionid))

        # test split [22,24]
        for sectionid in range(22, 25):
            self.test.extend(self.read_section(sectionid))

        self.Y.freeze()

    def extract_prefixes(self, w, n=10):
        """
        Return the prefixes of `w` of length 1 up to min(len(w), n),
        shortest first.  Unseen prefixes are given the next free id in
        `self.prefix2int`.
        """
        longest = min(len(w), n)
        found = []
        for length in range(1, longest + 1):
            piece = w[:length]
            # len() is taken before insertion, so a new piece gets the next id
            self.prefix2int.setdefault(piece, len(self.prefix2int))
            found.append(piece)
        return found

    def extract_suffixes(self, w, n=10):
        """
        Return the suffixes of `w` of length 1 up to min(len(w), n),
        shortest first.  Unseen suffixes are given the next free id in
        `self.suffix2int`.
        """
        longest = min(len(w), n)
        found = []
        for length in range(1, longest + 1):
            piece = w[-length:]
            # len() is taken before insertion, so a new piece gets the next id
            self.suffix2int.setdefault(piece, len(self.suffix2int))
            found.append(piece)
        return found

    def tag_bigrams(self):
        """
        Count adjacent tag pairs over the training sentences.

        Returns a dict mapping `(tag1, tag2)` to the number of times `tag2`
        directly follows `tag1` within a sentence of `self.train`.
        """
        counts = {}
        for sentence in self.train:
            neighbours = zip(sentence, sentence[1:])
            for (left_tag, _), (right_tag, _) in neighbours:
                pair = (left_tag, right_tag)
                counts[pair] = counts.get(pair, 0) + 1
        return counts

    def read_section(self, sectionid):
        "Read a section number `sectionid` from the PTB."
        root = os.path.join(self.base, str(sectionid).zfill(2))
        for fname in os.listdir(root):
            if fname.endswith('pos.gz'):
                with gzip.open(os.path.join(root, fname), 'rb') as handle:
                    chunks = handle.read().split('======================================')
                    for chunk in chunks:
                        if not chunk.strip():
                            continue
                        coarse = self.coarse
                        sentence = []
                        for w, y in re_tagged.findall(chunk):
                            if y in PUNC:
                                # Note: clean up punc reduction
                                symbol = "PUNC"
                            elif coarse:
                                # coarse scheme: keep the tag's first character
                                symbol = y[0]
                            else:
                                # TODO: what to do about bars in the tags?
                                # For now only the part before the first "|"
                                # is kept; a tag without a bar is unchanged.
                                symbol = y.split("|")[0]
                            sentence.append((self.Y[symbol], w))
                        yield sentence

    def pp(self, sentence):
        "Pretty print."
        rendered = ['%s/%s' % (w, self.Y.lookup(t)) for (t, w) in sentence]
        return ' '.join(rendered)
class TransducerModel(object): """ Transducer model """ def __init__(self, train, dev, test, Sigma, IL=6, L=2, eta=0.01, C=0.0001): self.train = train self.dev = dev self.test = test self.Sigma = Sigma assert self.Sigma[""] == 0 self.IL = IL self.C = C self.L = L self.eta = eta # X and Y self.X, self.Y = Alphabet(), Alphabet() self.X.add(""); self.Y.add("") for s, si in self.Sigma.items(): if si == 0: continue self.X.add(s) for s, si in self.Sigma.items(): if si == 0: continue self.Y.add(s) self.X.freeze(); self.Y.freeze() # first order (possibly extend) self.P = Alphabet() self.P.add("") for s, si in self.Sigma.items(): if si == 0: continue self.P.add(s) self.P.add("oo") self.P.add("nn") self.P.add("yy") self.P.add("ss") self.P.add("ee") self.P.freeze() # create Z self.Z = Alphabet() self.Z[""] = 0 for p, pi in self.P.items(): for o, oi in self.Y.items(): z = p+o self.Z.add(z) self.Z.freeze() # model self.model = Transducer(self.Sigma, self.X, self.Y, self.P, self.Z, IL = self.IL) self.features = TransducerFeatures(self.X, self.Y, self.P) self.features.featurize(self.train, 'train') self.features.featurize(self.dev, 'dev') self.features.featurize(self.test, 'test') self.d = 2**22 + self.features.offset self.updater = LazyRegularizedAdagrad(self.d, L=self.L, C=self.C, eta=self.eta, fudge=1e-4) self.updater.w[0] = 10.0 self.updater.w[1] = -10.0 def optimize(self, iterations=10, start=0): """ optimize the model """ #np.random.shuffle(self.train) for i in xrange(iterations): for instance in iterview(self.train, colored('Pass %s' % (i+1+start), 'blue')): psi = self.features.potentials_catchup(instance, self.updater.w, self.updater) dpsi = zeros_like(psi) x, y = instance.sr, instance.ur #print "LL", self.model.ll(x, y, psi, minx=MINX, miny=MINY) dpsi = self.model.dll(x, y, psi, minx=MINX, miny=MINY) self.features.update(instance, dpsi, self.updater) self.updater.step += 1 def step_is(self, tree, strings, weights, eta=0.0): """ optimize the model """ self.updater.eta = 
eta psi = self.features.potentials_catchup(tree, self.updater.w, self.updater) dpsi = zeros_like(psi) dpsi = self.model.dll_is(tree.sr, tree.ur, strings, weights, psi, minx=MINX, miny=MINY) self.features.update(tree, dpsi, self.updater) self.updater.step += 1 def sample(self, data, num=1000): """ sample """ samples = [] inside = 0 correct1, correct2, total = 0, 0, 0 for instance in iterview(data, colored('Sampling', 'green')): psi = self.features.potentials_catchup(instance, self.updater.w, self.updater) sr = instance.sr dist = {} for s in self.model.sample(sr, psi, num=num): output = "" for x, y in s: output += y if output not in dist: dist[output] = 0 dist[output] += 1 count = dist[instance.ur_gold] if instance.ur_gold in dist else 0 decoded = self.decode(instance)[1] if decoded != instance.ur_gold and count > 0: inside += 1 if decoded == instance.ur_gold: correct1 += 1 if instance.ur_gold in dist: correct2 += 1 total += 1 samples.append(dist) # TODO: put into log #print ; print inside #print correct1 / total, correct2 / total return samples def decode(self, instance): """ Decodes an instance """ psi = self.features.potentials_catchup(instance, self.updater.w, self.updater) ur1 = instance.ur results = self.model.decode(instance.sr, psi, minx=MINX, miny=MINY) return results def evaluate(self, data, maximum=100000000): """ decode the model """ correct, total = 0, 0 counter = 0 for instance in iterview(data, colored('Decoding', 'red')): if counter == maximum: break psi = self.features.potentials_catchup(instance, self.updater.w, self.updater) ur1 = instance.ur results = self.model.decode(instance.sr, psi, minx=MINX, miny=MINY) ll = self.model.ll(instance.sr, ur1, psi, minx=MINX, miny=MINY) score, ur2 = results[0], results[1] if ur1 == ur2: correct += 1 print ur1, ur2 total += 1 counter += 1 print return float(correct) / total def ll(self, tree, ur): """ gets the log-likelihood """ psi = self.features.potentials_catchup(tree, self.updater.w, self.updater) return 
self.model.ll(tree.sr, ur, psi, minx=MINX, miny=MINY)