コード例 #1
0
ファイル: data.py プロジェクト: luisandresilva/depccg
    def create_traindata(args):
        """Build the training artefacts from a CCGbank file and write them out.

        Reads the trees found at ``args.PATH``, traverses each of them,
        builds the samples, and then writes these files under ``args.OUT``:
        ``target.txt``, ``words.txt``, ``chars.txt``, ``seen_rules.txt``,
        ``unary_rules.txt``, ``traindata.json``, ``trainsents.txt`` and
        ``trainsents.conll``.

        Args:
            args: Options object providing ``PATH``, ``OUT``,
                ``word_freq_cut``, ``char_freq_cut`` and ``cat_freq_cut``.
                ``OUT`` must support the ``/`` operator (presumably a
                ``pathlib.Path`` -- confirm against the CLI definition).
        """
        creator = TrainingDataCreator(
            args.PATH,
            args.word_freq_cut,
            args.char_freq_cut,
            args.cat_freq_cut,
        )

        def at_least(counts, cutoff):
            # Keep only the entries whose count reaches the cut-off.
            kept = {}
            for key, count in counts.items():
                if count >= cutoff:
                    kept[key] = count
            return kept

        # The tree list is needed twice (traversal, then sample creation),
        # so it is materialised up front.
        parsed_trees = []
        for _, _, parsed in read_ccgbank(creator.filepath):
            parsed_trees.append(parsed)
        for parsed in parsed_trees:
            creator._traverse(parsed)
        creator._create_samples(parsed_trees)

        frequent_cats = at_least(creator.cats, creator.cat_freq_cut)
        creator._write(frequent_cats, args.OUT / 'target.txt')

        frequent_words = at_least(creator.words, creator.word_freq_cut)
        creator._write(frequent_words, args.OUT / 'words.txt')

        frequent_chars = at_least(creator.chars, creator.char_freq_cut)
        creator._write(frequent_chars, args.OUT / 'chars.txt')

        # A binary rule is kept only when both of its categories survived
        # the category frequency cut-off.
        kept_binary = {}
        for (left, right), count in creator.seen_rules.items():
            if left in frequent_cats and right in frequent_cats:
                kept_binary[f'{left} {right}'] = count
        creator._write(kept_binary, args.OUT / 'seen_rules.txt')

        # Unary rules are written unfiltered.
        kept_unary = {}
        for (source, target), count in creator.unary_rules.items():
            kept_unary[f'{source} {target}'] = count
        creator._write(kept_unary, args.OUT / 'unary_rules.txt')

        with open(args.OUT / 'traindata.json', 'w') as json_out:
            logger.info(f'writing to {json_out.name}')
            json.dump(creator.samples, json_out)

        with open(args.OUT / 'trainsents.txt', 'w') as text_out:
            logger.info(f'writing to {text_out.name}')
            for sentence in creator.sents:
                print(sentence, file=text_out)

        with open(args.OUT / 'trainsents.conll', 'w') as conll_out:
            logger.info(f'writing to {conll_out.name}')
            creator._to_conll(conll_out)
コード例 #2
0
ファイル: data.py プロジェクト: luisandresilva/depccg
    def create_testdata(args):
        """Build the evaluation artefacts from a CCGbank file and write them out.

        Reads the trees found at ``args.PATH``, builds the samples, and
        writes ``testdata.json``, ``testsents.txt`` and ``testsents.conll``
        under ``args.OUT``. Unlike ``create_traindata``, the trees are not
        traversed and no vocabulary/rule files are written.

        Args:
            args: Options object providing ``PATH``, ``OUT``,
                ``word_freq_cut``, ``char_freq_cut`` and ``cat_freq_cut``.
                ``OUT`` must support the ``/`` operator (presumably a
                ``pathlib.Path`` -- confirm against the CLI definition).
        """
        # The cut-offs are passed in the same positional order as in
        # create_traindata (word, char, cat). This call used to pass
        # (word, cat, char), which swapped the category and character
        # cut-offs relative to that method.
        # NOTE(review): TrainingDataCreator.__init__ is not visible here;
        # confirm the order against its signature.
        self = TrainingDataCreator(args.PATH, args.word_freq_cut,
                                   args.char_freq_cut, args.cat_freq_cut)

        trees = [tree for _, _, tree in read_ccgbank(self.filepath)]
        self._create_samples(trees)

        with open(args.OUT / 'testdata.json', 'w') as f:
            logger.info(f'writing to {f.name}')
            json.dump(self.samples, f)

        with open(args.OUT / 'testsents.txt', 'w') as f:
            logger.info(f'writing to {f.name}')
            for sent in self.sents:
                print(sent, file=f)

        with open(args.OUT / 'testsents.conll', 'w') as f:
            logger.info(f'writing to {f.name}')
            self._to_conll(f)
コード例 #3
0
ファイル: data.py プロジェクト: luisandresilva/depccg
 def convert_json(autopath):
     """Read the trees at ``autopath`` and return the samples built from them.

     Args:
         autopath: Location of the tree file. It is handed to
             ``TrainingDataCreator``, whose ``filepath`` is then read
             with ``read_ccgbank``.

     Returns:
         The creator's ``samples`` attribute after ``_create_samples``
         has processed every loaded tree.
     """
     # No frequency cut-offs are supplied: all three are passed as None.
     converter = TrainingDataCreator(autopath, None, None, None)

     loaded = []
     for _, _, parsed in read_ccgbank(converter.filepath):
         loaded.append(parsed)
     logger.info(f'loaded {len(loaded)} trees')

     converter._create_samples(loaded)
     return converter.samples