def _create_samples(self, trees):
    """Collect one sample per tree: a normalized sentence mapped to its
    per-token category strings (stored in self.samples; sentence order
    is kept in self.sents)."""
    for t in trees:
        leaves = get_leaves(t)
        sentence = " ".join(normalize(leaf.word) for leaf in leaves)
        self.sents.append(sentence)
        self.samples[sentence] = [str(leaf.cat) for leaf in leaves]
def _create_samples(self, trees):
    """Collect one sample per tree: the raw sentence mapped to a
    space-joined string of semantics-free categories."""
    for t in trees:
        leaves = get_leaves(t)
        sentence = " ".join(leaf.word for leaf in leaves)
        self.sents.append(sentence)
        self.samples[sentence] = " ".join(
            leaf.cat.without_semantics for leaf in leaves)
def _create_samples(self, trees):
    """Collect one sample per tree: the raw sentence mapped to a
    (categories, dependencies) pair extracted from the derivation."""
    for t in trees:
        leaves = get_leaves(t)
        sentence = " ".join(leaf.word for leaf in leaves)
        supertags = [leaf.cat.without_semantics for leaf in leaves]
        arcs = self._get_dependencies(t, len(leaves))
        self.sents.append(sentence)
        self.samples[sentence] = supertags, arcs
def _create_samples(self, trees):
    """Collect one sample per tree as a tuple
    (normalized sentence, POS tags, (category strings, dependencies)),
    appended to self.samples in tree order."""
    for t in trees:
        leaves = get_leaves(t)
        sentence = " ".join(normalize(leaf.word) for leaf in leaves)
        pos_tags = [leaf.tag for leaf in leaves]
        supertags = [str(leaf.cat) for leaf in leaves]
        arcs = self._get_dependencies(t, len(leaves))
        self.sents.append(sentence)
        self.samples.append((sentence, pos_tags, (supertags, arcs)))
def _worker(inp):
    """Turn one .auto file into training lines.

    *inp* is a (path, window_size) pair.  For every leaf of every tree,
    emits one line: the windowed context features ("|"-joined per slot,
    space-separated) followed by the gold category.
    """
    autofile, window_size = inp
    lines = []
    for tree in AutoReader(autofile).readall(suppress_error=True):
        leaves = get_leaves(tree)
        # Per-word feature vectors, then a sliding window over them.
        feats = [feature_extract(leaf.word) for leaf in leaves]
        contexts = get_context_by_window(
            feats, window_size, lpad=lpad, rpad=rpad)
        for leaf, context in zip(leaves, contexts):
            joined = " ".join("|".join(slot) for slot in context)
            lines.append(joined + " " + str(leaf.cat) + "\n")
    return lines
def test(): sents = \ [line.strip().decode("utf-8") for line in open("test.ccgbank")] tree = JaCCGLineReader( "{< NP {(S\\NP){I2}_none test} {(S\\NP){I2}_none test}}".decode( "utf-8")).parse() for sent in sents: if len(sent) == 0: continue tree = JaCCGLineReader(sent).parse() if len(get_leaves(tree)) < 10: # print tree if not isinstance(tree, Leaf): tree.show_derivation()
def create_testdata(self, outdir):
    """Build windowed test samples from the CCGBank file and write
    testdata.json (window -> category) and testsents.txt to *outdir*."""
    for tree in JaCCGReader(self.filepath).readall():
        leaves = get_leaves(tree)
        tokens = [leaf.word for leaf in leaves]
        self.sents.append(" ".join(tokens))
        supertags = [leaf.cat.without_semantics for leaf in leaves]
        windows = get_context_by_window(
            tokens, CONTEXT, lpad=LPAD, rpad=RPAD)
        assert len(windows) == len(supertags)
        for tag, window in zip(supertags, windows):
            self.samples[" ".join(window)] = tag
    with open(outdir + "/testdata.json", "w") as f:
        json.dump(self.samples, f)
    with open(outdir + "/testsents.txt", "w") as f:
        for sent in self.sents:
            f.write(sent.encode("utf-8") + "\n")
def create_traindata(self, outdir):
    """Build windowed training samples from the CCGBank file and write
    all derived resources (rules, vocabularies, samples) to *outdir*.

    Two passes over the trees: the first accumulates statistics via
    self._traverse, the second emits only samples whose category
    frequency clears self.cat_freq_cut.
    """
    trees = JaCCGReader(self.filepath).readall()
    # Pass 1: accumulate word/category/rule statistics only.
    for tree in trees:
        self._traverse(tree)
    # Pass 2: emit samples for sufficiently frequent categories.
    for tree in trees:
        leaves = get_leaves(tree)
        tokens = [leaf.word for leaf in leaves]
        self.sents.append(" ".join(tokens))
        supertags = [leaf.cat.without_semantics for leaf in leaves]
        windows = get_context_by_window(
            tokens, CONTEXT, lpad=LPAD, rpad=RPAD)
        assert len(windows) == len(supertags)
        for tag, window in zip(supertags, windows):
            if self.cats[tag] >= self.cat_freq_cut:
                self.samples[" ".join(window)] = tag
    # Prune rare entries from the vocabularies themselves.
    self.cats = {cat: freq for cat, freq in self.cats.items()
                 if freq >= self.cat_freq_cut}
    self.words = {word: freq for word, freq in self.words.items()
                  if freq >= self.word_freq_cut}
    # Table-driven dump of the plain-text resources via self._write.
    dumps = [
        ("/unary_rules.txt", self.unary_rules, True),
        ("/seen_rules.txt", self.seen_rules, True),
        ("/target.txt", self.cats, False),
        ("/words.txt", self.words, False),
        ("/chars.txt", self.chars, False),
    ]
    for suffix, table, commented in dumps:
        with open(outdir + suffix, "w") as f:
            self._write(table, f, comment_out_value=commented)
    with open(outdir + "/traindata.json", "w") as f:
        json.dump(self.samples, f)
    with open(outdir + "/trainsents.txt", "w") as f:
        for sent in self.sents:
            f.write(sent.encode("utf-8") + "\n")