Example #1
    def _cut_valid_data(self):
        """Put aside part of the training set for validation."""
        train_size = len(self.train_trees)

        # we have multiple references
        if self.multiple_refs:
            num_refs, refs_stored = self._check_multiple_ref_type(train_size)

            # data stored "serially" (all different instances next to each other, then again in the
            # same order)
            if refs_stored == 'serial':
                train_tree_chunks = [chunk for chunk in
                                     chunk_list(self.train_trees, train_size / num_refs)]
                train_da_chunks = [chunk for chunk in
                                   chunk_list(self.train_das, train_size / num_refs)]
                self.valid_trees = [[chunk[i] for chunk in train_tree_chunks]
                                    for i in xrange(train_size / num_refs - self.validation_size,
                                                    train_size / num_refs)]
                self.valid_das = train_da_chunks[0][-self.validation_size:]
                self.train_trees = sum([chunk[:-self.validation_size]
                                        for chunk in train_tree_chunks], [])
                self.train_das = sum([chunk[:-self.validation_size]
                                      for chunk in train_da_chunks], [])
            # data stored in "parallel" (all synonymous instances next to each other)
            else:
                self.valid_trees = [chunk for chunk in
                                    chunk_list(self.train_trees[-self.validation_size * num_refs:],
                                               num_refs)]
                self.valid_das = self.train_das[-self.validation_size * num_refs::num_refs]
                self.train_trees = self.train_trees[:-self.validation_size * num_refs]
                self.train_das = self.train_das[:-self.validation_size * num_refs]

        # single validation reference
        else:
            # make "reference lists" of length 1 to accommodate for functions working
            # with multiple references
            self.valid_trees = [
                [tree] for tree in self.train_trees[-self.validation_size:]
            ]
            self.valid_das = self.train_das[-self.validation_size:]
            self.train_trees = self.train_trees[:-self.validation_size]
            self.train_das = self.train_das[:-self.validation_size]
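
To make the serial and parallel layouts concrete, here is a minimal, self-contained sketch. The `chunk_list` helper is assumed to split a list into consecutive equal-sized pieces (the library's actual implementation may differ), and the string items are invented placeholders:

def chunk_list(lst, chunk_size):
    # assumed behavior: consecutive pieces of chunk_size items each
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

# toy data: three instances (a, b, c) with num_refs = 2 references each
serial = ['a1', 'b1', 'c1', 'a2', 'b2', 'c2']    # all instances, then all again
parallel = ['a1', 'a2', 'b1', 'b2', 'c1', 'c2']  # synonymous references adjacent

# serial: split into num_refs blocks, then take position i from every block
blocks = chunk_list(serial, len(serial) // 2)    # [['a1','b1','c1'], ['a2','b2','c2']]
grouped_serial = [[block[i] for block in blocks] for i in range(len(serial) // 2)]
# parallel: consecutive groups of num_refs items already form the reference lists
grouped_parallel = chunk_list(parallel, 2)
assert grouped_serial == grouped_parallel == [['a1', 'a2'], ['b1', 'b2'], ['c1', 'c2']]
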
Example #2
    def _load_valid_data(self, valid_data_paths):
        """Load validation data from separate files (comma-separated list of files with DAs, trees,
        and optionally contexts is expected)."""
        # parse validation data file specification
        valid_data_paths = valid_data_paths.split(',')
        if len(valid_data_paths) == 3:  # with contexts (this does not determine if they're used)
            valid_das_file, valid_trees_file, valid_context_file = valid_data_paths
        else:
            valid_das_file, valid_trees_file = valid_data_paths

        # load the validation data
        log_info('Reading DAs from ' + valid_das_file + '...')
        self.valid_das = read_das(valid_das_file)
        self.valid_trees = self._load_trees(valid_trees_file, selector=self.ref_selectors)
        if self.use_context:
            self.valid_das = self._load_contexts(self.valid_das, valid_context_file)

        # reorder validation data for multiple references (see also _cut_valid_data)
        valid_size = len(self.valid_trees)
        if self.multiple_refs:
            num_refs, refs_stored = self._check_multiple_ref_type(valid_size)

            # serial: different instances next to each other, then synonymous in the same order
            if refs_stored == 'serial':
                valid_tree_chunks = [chunk for chunk in
                                     chunk_list(self.valid_trees, valid_size / num_refs)]
                self.valid_trees = [[chunk[i] for chunk in valid_tree_chunks]
                                    for i in xrange(valid_size / num_refs)]
                if len(self.valid_das) > len(self.valid_trees):
                    self.valid_das = self.valid_das[0:valid_size / num_refs]
            # parallel: synonymous instances next to each other
            elif refs_stored == 'parallel':
                self.valid_trees = [chunk for chunk in chunk_list(self.valid_trees, num_refs)]
                if len(self.valid_das) > len(self.valid_trees):
                    self.valid_das = self.valid_das[::num_refs]

        # no multiple references; make lists of size 1 to simplify working with the data
        else:
            self.valid_trees = [[tree] for tree in self.valid_trees]
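
The DA lists are trimmed analogously to the trees. A minimal sketch with invented DA placeholders shows the two slicing patterns used above:

num_refs = 2
# parallel: each DA repeated num_refs times in a row -> keep every num_refs-th item
parallel_das = ['da_a', 'da_a', 'da_b', 'da_b', 'da_c', 'da_c']
assert parallel_das[::num_refs] == ['da_a', 'da_b', 'da_c']
# serial: the whole DA sequence repeated num_refs times -> keep only the first block
serial_das = ['da_a', 'da_b', 'da_c', 'da_a', 'da_b', 'da_c']
assert serial_das[:len(serial_das) // num_refs] == ['da_a', 'da_b', 'da_c']
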
Example #3
def sample_gen(args):
    from pytreex.core.document import Document
    opts, files = getopt(args, 'r:n:o:w:')
    num_to_generate = 1
    oracle_eval_file = None
    fname_ttrees_out = None

    for opt, arg in opts:
        if opt == '-n':
            num_to_generate = int(arg)
        elif opt == '-o':
            oracle_eval_file = arg
        elif opt == '-w':
            fname_ttrees_out = arg

    if len(files) != 2:
        sys.exit(__doc__)
    fname_cand_model, fname_da_test = files

    # load model
    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)

    # use the candidate generator itself as the ranker for the sampling planner
    ranker = candgen

    tgen = SamplingPlanner({'candgen': candgen, 'ranker': ranker})
    # generate
    log_info('Generating...')
    gen_doc = Document()
    das = read_das(fname_da_test)
    for da in das:
        for _ in xrange(num_to_generate):  # repeat generation n times
            tgen.generate_tree(da, gen_doc)

    # evaluate if needed
    if oracle_eval_file is not None:
        log_info('Evaluating oracle F1...')
        log_info('Loading gold data from ' + oracle_eval_file)
        gold_trees = ttrees_from_doc(read_ttrees(oracle_eval_file), tgen.language, tgen.selector)
        gen_trees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)
        log_info('Gold data loaded.')
        correct, predicted, gold = 0, 0, 0
        for gold_tree, gen_trees in zip(gold_trees, chunk_list(gen_trees, num_to_generate)):
            # find best of predicted trees (in terms of F1)
            _, tc, tp, tg = max([(f1_from_counts(c, p, g), c, p, g) for c, p, g
                                 in map(lambda gen_tree: corr_pred_gold(gold_tree, gen_tree),
                                        gen_trees)],
                                key=lambda x: x[0])
            correct += tc
            predicted += tp
            gold += tg
        # evaluate oracle F1
        log_info("Oracle Precision: %.6f, Recall: %.6f, F1: %.6f" % p_r_f1_from_counts(correct, predicted, gold))
    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
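
The oracle score picks, for each gold tree, the best of the `num_to_generate` samples by F1 and pools the node counts. Assuming the count-based helpers follow the standard precision/recall definitions (a sketch only; the library's actual implementations may differ):

def p_r_f1_from_counts(correct, predicted, gold):
    # precision/recall/F1 from pooled counts, 0.0 where undefined
    precision = correct / float(predicted) if predicted else 0.0
    recall = correct / float(gold) if gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

def f1_from_counts(correct, predicted, gold):
    return p_r_f1_from_counts(correct, predicted, gold)[2]

# e.g. 8 correct nodes out of 10 predicted against 12 gold nodes:
print('P=%.6f R=%.6f F1=%.6f' % p_r_f1_from_counts(8, 10, 12))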