def create_traindata(args):
    """Create the training-side resource files from a CCGbank file.

    The trees read from ``args.PATH`` are first traversed one by one (which
    fills the creator's counters), then turned into samples. The following
    files are written under ``args.OUT``, in this order:

    * ``target.txt``      -- categories seen at least ``cat_freq_cut`` times
    * ``words.txt``       -- words seen at least ``word_freq_cut`` times
    * ``chars.txt``       -- characters seen at least ``char_freq_cut`` times
    * ``seen_rules.txt``  -- binary rules whose two categories both survived
      the category cut-off
    * ``unary_rules.txt`` -- every unary rule, unfiltered
    * ``traindata.json``  -- the samples, JSON-encoded
    * ``trainsents.txt``  -- one sentence per line
    * ``trainsents.conll`` -- output of the creator's ``_to_conll``

    Args:
        args: Namespace-like object providing ``PATH``, ``OUT``,
            ``word_freq_cut``, ``char_freq_cut`` and ``cat_freq_cut``.
            ``OUT`` is combined with file names via ``/`` (presumably a
            ``pathlib.Path``).
    """
    creator = TrainingDataCreator(
        args.PATH,
        args.word_freq_cut,
        args.char_freq_cut,
        args.cat_freq_cut,
    )

    trees = [parsed for _, _, parsed in read_ccgbank(creator.filepath)]
    # Counters must be populated before samples are created and filtered.
    for parsed in trees:
        creator._traverse(parsed)
    creator._create_samples(trees)

    def at_least(counts, threshold):
        """Return the entries of *counts* whose count reaches *threshold*."""
        return {
            key: count for key, count in counts.items() if count >= threshold
        }

    kept_cats = at_least(creator.cats, creator.cat_freq_cut)
    out_dir = args.OUT
    creator._write(kept_cats, out_dir / 'target.txt')

    kept_words = at_least(creator.words, creator.word_freq_cut)
    creator._write(kept_words, out_dir / 'words.txt')

    kept_chars = at_least(creator.chars, creator.char_freq_cut)
    creator._write(kept_chars, out_dir / 'chars.txt')

    # A binary rule is kept only when both of its categories survived the
    # category frequency cut-off.
    binary_rules = {}
    for (first, second), count in creator.seen_rules.items():
        if first in kept_cats and second in kept_cats:
            binary_rules[f'{first} {second}'] = count
    creator._write(binary_rules, out_dir / 'seen_rules.txt')

    unary_rules = {}
    for (first, second), count in creator.unary_rules.items():
        unary_rules[f'{first} {second}'] = count
    creator._write(unary_rules, out_dir / 'unary_rules.txt')

    with open(out_dir / 'traindata.json', 'w') as out:
        logger.info(f'writing to {out.name}')
        json.dump(creator.samples, out)

    with open(out_dir / 'trainsents.txt', 'w') as out:
        logger.info(f'writing to {out.name}')
        for sentence in creator.sents:
            print(sentence, file=out)

    with open(out_dir / 'trainsents.conll', 'w') as out:
        logger.info(f'writing to {out.name}')
        creator._to_conll(out)
def create_testdata(args):
    """Create the test-side data files from a CCGbank file.

    Reads the trees found at ``args.PATH``, converts them into samples and
    writes three files under ``args.OUT``:

    * ``testdata.json``   -- the samples, JSON-encoded
    * ``testsents.txt``   -- one sentence per line
    * ``testsents.conll`` -- output of the creator's ``_to_conll``

    Unlike ``create_traindata``, the trees are not traversed and no
    vocabulary or rule files are produced.

    Args:
        args: Namespace-like object providing ``PATH``, ``OUT``,
            ``word_freq_cut``, ``char_freq_cut`` and ``cat_freq_cut``.
            ``OUT`` is combined with file names via ``/`` (presumably a
            ``pathlib.Path``).
    """
    # Pass the cut-offs in the same (word, char, cat) order that
    # create_traindata uses; the previous (word, cat, char) order stored the
    # character and category thresholds under each other's attribute.
    # NOTE(review): the constructor signature is not visible here -- confirm
    # that (filepath, word_freq_cut, char_freq_cut, cat_freq_cut) is correct.
    creator = TrainingDataCreator(
        args.PATH,
        args.word_freq_cut,
        args.char_freq_cut,
        args.cat_freq_cut,
    )

    trees = [tree for _, _, tree in read_ccgbank(creator.filepath)]
    creator._create_samples(trees)

    out_dir = args.OUT

    with open(out_dir / 'testdata.json', 'w') as f:
        logger.info(f'writing to {f.name}')
        json.dump(creator.samples, f)

    with open(out_dir / 'testsents.txt', 'w') as f:
        logger.info(f'writing to {f.name}')
        for sent in creator.sents:
            print(sent, file=f)

    with open(out_dir / 'testsents.conll', 'w') as f:
        logger.info(f'writing to {f.name}')
        creator._to_conll(f)
def convert_json(autopath):
    """Load the trees in *autopath* and return them as samples.

    A ``TrainingDataCreator`` is built with all three frequency cut-offs set
    to ``None``; only its sample-creation step is run and nothing is written
    to disk.

    Args:
        autopath: Location of the CCGbank file, handed to
            ``TrainingDataCreator`` and then read with ``read_ccgbank``.

    Returns:
        The creator's ``samples`` attribute after ``_create_samples`` ran.
    """
    creator = TrainingDataCreator(autopath, None, None, None)

    loaded = []
    for _, _, parsed in read_ccgbank(creator.filepath):
        loaded.append(parsed)
    logger.info(f'loaded {len(loaded)} trees')

    creator._create_samples(loaded)
    return creator.samples