Example #1
def create_traindata(args):
    self = TrainingDataCreator(args.PATH, args.word_freq_cut,
                               args.cat_freq_cut, args.afix_freq_cut)

    # Keep only trees whose parse did not fail.
    trees = [
        tree for _, _, tree in read_auto(self.filepath)
        if tree.word != 'FAILED'
    ]
    logger.info(f'loaded {len(trees)} trees')
    # Gather word/category/affix/rule statistics, then build the samples.
    for tree in trees:
        self._traverse(tree)
    self._create_samples(trees)

    # Apply the frequency cutoffs and write one vocabulary file each.
    cats = {k: v for k, v in self.cats.items() if v >= self.cat_freq_cut}
    self._write(cats, args.OUT / 'target.txt')

    words = {
        k: v
        for k, v in self.words.items() if v >= self.word_freq_cut
    }
    self._write(words, args.OUT / 'words.txt')

    suffixes = {
        k: v
        for k, v in self.suffixes.items() if v >= self.afix_freq_cut
    }
    self._write(suffixes, args.OUT / 'suffixes.txt')

    prefixes = {
        k: v
        for k, v in self.prefixes.items() if v >= self.afix_freq_cut
    }
    self._write(prefixes, args.OUT / 'prefixes.txt')

    # Keep a binary rule only if both of its categories survived the cutoff.
    seen_rules = {
        f'{c1} {c2}': v
        for (c1, c2), v in self.seen_rules.items()
        if c1 in cats and c2 in cats
    }
    self._write(seen_rules, args.OUT / 'seen_rules.txt')

    unary_rules = {
        f'{c1} {c2}': v
        for (c1, c2), v in self.unary_rules.items()
    }
    self._write(unary_rules, args.OUT / 'unary_rules.txt')

    with open(args.OUT / 'traindata.json', 'w') as f:
        logger.info(f'writing to {f.name}')
        json.dump(self.samples, f)

    with open(args.OUT / 'trainsents.txt', 'w') as f:
        logger.info(f'writing to {f.name}')
        for sent in self.sents:
            print(sent, file=f)

    with open(args.OUT / 'trainsents.conll', 'w') as f:
        logger.info(f'writing to {f.name}')
        self._to_conll(f)
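
A minimal driver for the snippet above, assuming create_traindata is exposed as a static method of TrainingDataCreator and that args is an argparse-style namespace; all paths and cutoff values below are placeholders:

from argparse import Namespace
from pathlib import Path

args = Namespace(PATH=Path('ccgbank/train.auto'),   # placeholder input file
                 OUT=Path('out'),                   # placeholder output directory
                 word_freq_cut=5, cat_freq_cut=10, afix_freq_cut=5)
args.OUT.mkdir(parents=True, exist_ok=True)
TrainingDataCreator.create_traindata(args)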
Example #2
def get_deps_from_auto(auto_file):
    candc_dir = os.environ.get('CANDC', None)
    if not candc_dir:
        die('did not find the C&C parser; set the CANDC environment variable.')
    CANDC_DIR = Path(candc_dir).resolve()
    GENERATE = CANDC_DIR / 'bin' / 'generate'
    MARKEDUP = CANDC_DIR / 'src' / 'data' / 'ccg' / 'cats' / 'markedup'
    CATS = CANDC_DIR / 'src' / 'data' / 'ccg' / 'cats'
    if not GENERATE.exists():
        logger.error(
            'Currently the evaluation script requires the C&C parser compiled from source.'
        )
        die('expected: $CANDC/bin/generate')
    elif not MARKEDUP.exists() or not CATS.exists():
        logger.error('The C&C directory is not laid out as expected.')
        die('expected: $CANDC/src/data/ccg/cats/markedup')

    # Write the flattened trees to a scratch file for `generate` to read.
    tmp = tempfile.mktemp()
    print(tmp)
    with open(tmp, 'w') as f:
        for _, tokens, tree in read_auto(auto_file):
            print(tree.auto_flat(tokens=tokens), file=f)

    command = f'{GENERATE} -j {CATS} {MARKEDUP} {tmp}'
    proc = subprocess.Popen(command,
                            shell=True,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    results, error = proc.communicate()
    if len(error.decode('utf-8')) > 0:
        die(f'caught error in running $CANDC/bin/generate: {error.decode("utf-8")}')

    lines = iter(results.decode('utf-8').split('\n'))
    deps, udeps = set(), set()
    rule_ids = {}
    # Skip the preamble that `generate` prints before the first blank line.
    line = next(lines)
    while line != '':
        line = next(lines)

    for line in lines:
        line = line.strip()
        if len(line) == 0:
            # A blank line ends a sentence; if no dependencies were
            # collected, the conversion most likely failed for it.
            parsed = len(rule_ids) > 0
            yield parsed, deps, udeps, rule_ids
            deps, udeps = set(), set()
            rule_ids = {}
            continue
        fields = line.split()
        pred, cat, slot, arg, rule_id = fields[:5]
        # Strip the trailing `_index` that `generate` appends to each token.
        pred_word = pred.rsplit('_', 1)[0]
        arg_word = arg.rsplit('_', 1)[0]
        if not ignore(pred_word, cat, slot, arg_word, rule_id):
            cat = strip_markup(cat)
            deps.add((pred, cat, slot, arg))
            rule_ids[(pred, cat, slot, arg)] = rule_id
            udeps.add((pred, arg))
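
A sketch of how this generator might be consumed, assuming the CANDC environment variable points at a compiled C&C checkout and that 'gold.auto' stands in for a real AUTO-format file:

import os
os.environ.setdefault('CANDC', '/path/to/candc')  # placeholder path

for parsed, deps, udeps, rule_ids in get_deps_from_auto('gold.auto'):
    if not parsed:
        continue  # no dependencies were recovered for this sentence
    print(f'{len(deps)} labelled / {len(udeps)} unlabelled dependencies')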
Example #3
def convert_json(autopath):
    # Reuse TrainingDataCreator with no frequency cutoffs; only samples are needed.
    self = TrainingDataCreator(autopath, None, None, None)
    trees = [
        tree for _, _, tree in read_auto(self.filepath)
        if tree.word != 'FAILED'
    ]
    logger.info(f'loaded {len(trees)} trees')
    self._create_samples(trees)
    return self.samples
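
Because convert_json only builds the samples, a caller can dump them directly; the file names here are hypothetical:

import json

samples = TrainingDataCreator.convert_json('ccgbank/dev.auto')  # placeholder path
with open('dev.json', 'w') as f:
    json.dump(samples, f)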
Example #4
def create_testdata(args):
    self = TrainingDataCreator(args.PATH, args.word_freq_cut,
                               args.cat_freq_cut, args.afix_freq_cut)

    # Unlike create_traindata, keep every tree: no FAILED filter, no cutoffs.
    trees = [tree for _, _, tree in read_auto(self.filepath)]
    self._create_samples(trees)

    with open(args.OUT / 'testdata.json', 'w') as f:
        logger.info(f'writing to {f.name}')
        json.dump(self.samples, f)

    with open(args.OUT / 'testsents.txt', 'w') as f:
        logger.info(f'writing to {f.name}')
        for sent in self.sents:
            print(sent, file=f)

    with open(args.OUT / 'testsents.conll', 'w') as f:
        logger.info(f'writing to {f.name}')
        self._to_conll(f)
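
Given the same hypothetical args namespace as in the Example #1 sketch, the test-side call is symmetric:

TrainingDataCreator.create_testdata(args)  # writes testdata.json, testsents.txt, testsents.conll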
Example #5
def _read(self, file_path):
    for _, _, ccg_tree in read_auto(file_path):
        tree = ccg_to_nltk_tree(ccg_tree)
        # tree.pos() yields (word, pos) pairs; keep the tags only when requested.
        pos_tags = [pos for _, pos in tree.pos()] if self._use_pos_tags else None
        yield self.text_to_instance(tree.leaves(), pos_tags, tree)
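
If this _read lives on an AllenNLP-style DatasetReader subclass (the class name and constructor below are assumptions, not from the source), instances can be pulled lazily:

reader = CCGBankDatasetReader(use_pos_tags=True)    # hypothetical class and flag
for instance in reader._read('ccgbank/dev.auto'):   # placeholder path
    print(instance)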