Esempio n. 1
0
 def finalize(self):
     """
     Finalize this result file, the reference file and every secondary file,
     and report the path of each one on the logger stream.
     """
     CorpusFile.finalize(self)
     self.reference.finalize()

     # Report the two main outputs first, in the same order as they were finalized.
     for description, location in (('Wrote results to', self.path),
                                   ('Wrote reference to', self.reference.path)):
         print(description, location, file=self.logger)

     # Each secondary file is finalized right before its path is reported.
     position = 0
     for secondary in self.secondaries:
         secondary.finalize()
         print('Wrote sec %d to ' % position, secondary.path, file=self.logger)
         position += 1
Esempio n. 2
0
 def __init__(self, experiment, path=None, directory=None, logger=None, secondary_scores=0):
     """
     Set up the result file, a reference file and optional secondary files.

     :param experiment: object stored as ``self.experiment``
     :param path: path of the result file; if None, a fresh temporary file
                  is created inside ``directory``
     :param directory: directory handed to ``tempfile.mkstemp`` and to every CorpusFile
     :param logger: stream used for status messages; defaults to ``sys.stdout``
     :param secondary_scores: number of additional CorpusFile outputs to create
     """
     ConstituentScorer.__init__(self)
     # The original one-liner `_, path = mkstemp(...) if path is None else path`
     # tuple-unpacked a caller-supplied path string; branch explicitly instead.
     if path is None:
         import os  # local import: only needed to release the mkstemp descriptor
         handle, path = tempfile.mkstemp(dir=directory)
         # mkstemp returns an already opened OS-level descriptor; close it so it
         # is not leaked (only the path is used from here on).
         os.close(handle)
     CorpusFile.__init__(self, path=path, directory=directory, logger=logger)
     self.experiment = experiment
     self.reference = CorpusFile(directory=directory, logger=logger)
     self.logger = logger if logger is not None else sys.stdout
     self.secondaries = [CorpusFile(directory=directory, logger=logger) for _ in range(secondary_scores)]
def main():
    """
    Configure the induction settings for an SDP experiment on the DM corpus
    file, register a size-limited training and test resource, and run the
    experiment with oracle parsing enabled.
    """
    # corpus location and the sentence ids that delimit its sections
    corpus_location = '../res/osdp-12/sdp/2015/en.dm.sdp'
    last_training_id = 21999042
    first_dev_id = 22000001
    # keep both corpora small for testing purposes
    max_training_sentences = 100
    max_dev_sentences = 50

    def terminal_labeling(x):
        third, fourth = x[2], x[3]
        return '_', '_', third, fourth

    def terminal_labeling_lcfrs(x):
        third = x[2]
        return third

    def rec_part_strat(graph):
        # Alternatives tried before: left_branching_partitioning(len(graph.sentence)),
        # or returning the extracted partitioning without limiting the fanout.
        return fanout_limited_partitioning_left_to_right(
            extract_recursive_partitioning(graph), 1)

    def nt_sub_labeling(edge):
        label = edge.label
        return label[2]

    def nonterminal_labeling(x, graph):
        bottom_nodes = graph.dog.bottom(x)
        top_nodes = graph.dog.top(x)
        fanout_value = consecutive_spans(graph.covered_sentence_positions(x))

        def joined_labels(nodes):
            # label every node by the sub-label of its incoming edge
            collected = []
            for node in nodes:
                collected.append(
                    induction_settings.nt_sub_labeling(
                        graph.dog.incoming_edge(node)))
            return ','.join(collected)

        bottom_part = joined_labels(bottom_nodes)
        top_part = joined_labels(top_nodes)
        return '[' + ';'.join([bottom_part, top_part, str(fanout_value)]) + ']'

    induction_settings = InductionSettings()
    induction_settings.terminal_labeling = terminal_labeling
    induction_settings.terminal_labeling_lcfrs = terminal_labeling_lcfrs
    induction_settings.rec_part_strat = rec_part_strat
    induction_settings.nt_sub_labeling = nt_sub_labeling
    induction_settings.nonterminal_labeling = nonterminal_labeling

    experiment = SDPExperiment(induction_settings)
    training_resource = CorpusFile(
        corpus_location, end=last_training_id, limit=max_training_sentences)
    experiment.resources['TRAIN'] = training_resource
    testing_resource = CorpusFile(
        corpus_location, start=first_dev_id, limit=max_dev_sentences)
    experiment.resources['TEST'] = testing_resource

    experiment.oracle_parsing = True
    experiment.parsing_timeout = 150  # seconds

    experiment.run_experiment()
Esempio n. 4
0
class ScorerAndWriter(ConstituentScorer, CorpusFile):
    """
    A resource to which parsing results can be written.
    Computes LF1 score (inhouse implementation) and writes resulting parse tree to a file.

    Besides the result file (this object itself), a reference file holding the
    gold structures and ``secondary_scores`` additional result files are kept.
    """
    def __init__(self, experiment, path=None, directory=None, logger=None, secondary_scores=0):
        """
        :param experiment: object providing ``serialize``, ``obtain_sentence``,
                           ``obtain_label`` and ``compute_fallback``
        :param path: path of the result file; if None, a fresh temporary file
                     is created inside ``directory``
        :param directory: directory handed to ``tempfile.mkstemp`` and to every CorpusFile
        :param logger: stream used for status messages; defaults to ``sys.stdout``
        :param secondary_scores: number of additional CorpusFile outputs to create
        """
        ConstituentScorer.__init__(self)
        # The original one-liner `_, path = mkstemp(...) if path is None else path`
        # tuple-unpacked a caller-supplied path string; branch explicitly instead.
        if path is None:
            import os  # local import: only needed to release the mkstemp descriptor
            handle, path = tempfile.mkstemp(dir=directory)
            # mkstemp returns an already opened OS-level descriptor; close it so
            # it is not leaked (only the path is used from here on).
            os.close(handle)
        CorpusFile.__init__(self, path=path, directory=directory, logger=logger)
        self.experiment = experiment
        self.reference = CorpusFile(directory=directory, logger=logger)
        self.logger = logger if logger is not None else sys.stdout
        self.secondaries = [CorpusFile(directory=directory, logger=logger) for _ in range(secondary_scores)]

    def init(self):
        """Initialize the result file, the reference file and all secondary files."""
        CorpusFile.init(self)
        self.reference.init()
        for sec in self.secondaries:
            sec.init()

    def finalize(self):
        """Finalize all files and print the path of each one to the logger stream."""
        CorpusFile.finalize(self)
        self.reference.finalize()
        print('Wrote results to', self.path, file=self.logger)
        print('Wrote reference to', self.reference.path, file=self.logger)
        for i, sec in enumerate(self.secondaries):
            sec.finalize()
            print('Wrote sec %d to ' % i, sec.path, file=self.logger)

    def score(self, system, gold, secondaries=None):
        """
        Score ``system`` against ``gold`` and write both serializations.

        :param system: parse result; serialized into the result file
        :param gold: gold structure; serialized into the reference file
        :param secondaries: optional additional results, written pairwise to the
                            secondary files (surplus entries on either side are
                            ignored because of ``zip``)
        """
        ConstituentScorer.score(self, system, gold)
        self.file.writelines(self.experiment.serialize(system))
        self.reference.file.writelines(self.experiment.serialize(gold))
        if secondaries:
            for system_sec, corpus in zip(secondaries, self.secondaries):
                corpus.file.writelines(self.experiment.serialize(system_sec))

    def failure(self, gold):
        """
        Record a parse failure for ``gold``.

        A fallback structure is computed by the experiment from the sentence and
        label of ``gold``; it is written to the result file and to every
        secondary file, while ``gold`` itself goes to the reference file.
        """
        ConstituentScorer.failure(self, gold)
        sentence = self.experiment.obtain_sentence(gold)
        label = self.experiment.obtain_label(gold)
        fallback = self.experiment.compute_fallback(sentence, label)
        self.file.writelines(self.experiment.serialize(fallback))
        self.reference.file.writelines(self.experiment.serialize(gold))
        for sec in self.secondaries:
            sec.file.writelines(self.experiment.serialize(fallback))

    def __str__(self):
        """Use the CorpusFile string form (both base classes are candidates)."""
        return CorpusFile.__str__(self)
Esempio n. 5
0
    def run_discodop_binarization(self):
        """
        Binarize the training corpus using discodop. The resulting corpus is saved to the
        disco_binarized_corpus member variable. Nothing is done if that member is already set.

        The training corpus is normalized first if ``induction_settings.normalize`` is set;
        discodop then writes its output to a temporary ".export" file in ``self.directory``.

        :rtype: None
        """
        if self.disco_binarized_corpus is not None:
            return
        import os  # local import: only needed to release the mkstemp descriptor

        train_resource = self.resources[TRAINING]
        if self.induction_settings.normalize:
            train_normalized = self.normalize_corpus(train_resource.path, src=train_resource.type.lower(), renumber=False)
        else:
            train_normalized = train_resource.path

        handle, second_stage = tempfile.mkstemp(suffix=".export", dir=self.directory)
        # mkstemp hands back an open OS-level descriptor; only the path is needed
        # (discodop writes the file itself), so close it instead of leaking it.
        os.close(handle)

        # NOTE(review): the exit status of discodop is ignored here, as before.
        subprocess.call(["discodop", "treetransforms"]
                        + self.induction_settings.discodop_binarization_params
                        + ["--inputfmt=export", "--outputfmt=export",
                           train_normalized, second_stage])

        # The binarized file inherits the selection parameters of the training resource.
        disco_resource = CorpusFile(path=second_stage,
                                    start=train_resource.start,
                                    end=train_resource.end,
                                    limit=train_resource.limit,
                                    filter=train_resource.filter,
                                    exclude=train_resource.exclude,
                                    type=train_resource.type
                                   )

        self.disco_binarized_corpus = self.read_corpus_export(disco_resource, mode="DISCO-DOP", skip_normalization=True)
    def evaluate(self, result_resource, gold_resource):
        """
        Run the external scorer script on the gold file and the result file.

        If the gold resource is restricted by ``end``, ``limit`` or
        ``length_limit``, the selected part of the gold corpus is read and
        exported to a new CorpusFile first, and that file is scored instead.

        :param result_resource: resource whose ``path`` holds the parse results
        :param gold_resource: resource describing the gold corpus
        """
        restricting_attributes = ('end', 'limit', 'length_limit')
        is_restricted = any(
            getattr(gold_resource, attribute) is not None
            for attribute in restricting_attributes)

        if is_restricted:
            selected_corpus = self.read_corpus(gold_resource)
            selection = CorpusFile()
            # NOTE(review): init/finalize presumably create the file behind
            # selection.path before it is written to; confirm in CorpusFile.
            selection.init()
            selection.finalize()
            export_corpus(selected_corpus, selection.path)
            gold_resource = selection

        scorer_command = ["sh", "../util/semeval-run.sh", "Scorer"]
        scorer_command += [gold_resource.path, result_resource.path]
        scorer_command.append("representation=DM")
        call(scorer_command)
 def __init__(self, induction_settings):
     """
     Initialize the base Experiment, keep the induction settings and register
     a result file that starts with the SDP 2015 header line.

     :param induction_settings: settings object stored on the experiment
     """
     Experiment.__init__(self)
     self.induction_settings = induction_settings
     result_file = CorpusFile(header="#SDP 2015\n")
     self.resources[RESULT] = result_file
Esempio n. 8
0
def setup_corpus_resources(split, dev_mode=True, quick=False, test_pred=False, test_second_half=False):
    """
    Build the CorpusFile resources (train, dev, test, test input) for a named corpus split.

    :param split: A string specifying a particular corpus and split from the literature.
                  Handled values are "SPMRL", "HN08" and any string containing "WSJ"
                  (if it also contains "km2003", the km2003wsj variant of the corpus is used).
    :type split: str
    :param dev_mode: If true, then the development set is used for testing.
    :type dev_mode: bool
    :param quick: If true, then a smaller version of the corpora are returned.
    :type quick: bool
    :param test_pred: If true, then predicted POS tags are used for testing.
                      Only consulted for the "HN08" split.
    :type test_pred: bool
    :param test_second_half: If true, the test resources start at a later sentence id.
                             Only consulted for the "HN08" split.
    :type test_second_half: bool
    :return: A tuple (train, dev, test, test_input) of CorpusFile objects, in this order.
             test_input describes the file that is fed to the parser, test the gold file.
    :raises ValueError: if split matches none of the handled values
    """
    if split == "SPMRL":
        # all files are from SPMRL shared task

        corpus_type = corpus_type_test = "TIGERXML"
        train_path = 'res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/train/train.German.gold.xml'
        train_start = 1
        train_filter = None
        train_limit = 40474
        # NOTE: all four names refer to the same list object
        train_exclude = validation_exclude = test_exclude = test_input_exclude = [7561, 17632, 46234, 50224]

        validation_path = 'res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/dev/dev.German.gold.xml'
        validation_start = 40475
        # despite its name, validation_size is passed as the `end` of the dev resource below
        validation_size = validation_start + 4999
        validation_filter = None

        if dev_mode:
            test_start = test_input_start = validation_start
            test_limit = test_input_limit = validation_size
            test_path = test_input_path \
                = 'res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/dev/dev.German.gold.xml'
        else:
            test_start = test_input_start = 45475
            test_limit = test_input_limit = test_start + 4999
            test_path = test_input_path \
                = 'res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/test/test.German.gold.xml'
        test_filter = test_input_filter = None

        if quick:
            # smaller training file and shortened ranges for all three sets
            train_path = 'res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/train5k/train5k.German.gold.xml'
            train_limit = train_start + 2000
            validation_size = validation_start + 200
            test_limit = test_input_limit = test_start + 200
    #
    elif split == "HN08":
        # files are based on the scripts in Coavoux's mind the gap 1.0
        # where we commented out `rm -r tiger21 tiger22 marmot_tags` in generate_tiger_data.sh

        corpus_type = corpus_type_test = "EXPORT"
        base_path = "res/TIGER/tiger21"
        train_start = 1
        train_limit = 50474

        train_path = os.path.join(base_path, "tigertraindev_root_attach.export")

        # train / dev / test are told apart by the last digit of the sentence id:
        # 2-9 -> train, 1 -> dev, 0 -> test (see the three filter functions)
        def train_filter(x):
            return x % 10 >= 2

        train_exclude = [7561, 17632, 46234, 50224]

        validation_start = 1
        validation_size = 50471
        validation_exclude = train_exclude
        validation_path = os.path.join(base_path, "tigerdev_root_attach.export")
        # NOTE(review): repeats the assignment made two lines above
        validation_exclude = train_exclude

        def validation_filter(sent_id):
            return sent_id % 10 == 1

        if not dev_mode:
            test_start = test_input_start = 1  # validation_size  # 40475
            if test_second_half:
                # NOTE(review): the dev_mode branch below uses 25241 here - confirm which is intended
                test_start = test_input_start = 25240
            test_limit = test_input_limit = 50474
            # test_limit = 200 * 5 // 4
            test_exclude = test_input_exclude = train_exclude
            test_path = os.path.join(base_path, "tigertest_root_attach.export")

            def test_filter(sent_id):
                return sent_id % 10 == 0

            if test_pred:
                # parser input comes from a separate file with predicted tags; it is not filtered
                corpus_type_test = "WORD/POS"
                test_input_start = 0
                if test_second_half:
                    # NOTE(review): the dev_mode branch below uses 2524 (without "- 1") - confirm
                    test_input_start = 2524 - 1
                # predicted by MATE trained on tigerHN08 train + dev
                test_input_path = 'res/TIGER/tigerHN08-test.train+dev.pred_tags.raw'
                test_input_filter = None
            else:
                test_input_path = test_path
                test_input_filter = test_filter

        else:
            # dev_mode: the validation file and filter double as test set
            test_start = test_input_start = 1
            if test_second_half:
                test_start = test_input_start = 25241
            test_limit = test_input_limit = 50474
            test_exclude = test_input_exclude = train_exclude
            test_path = validation_path
            test_filter = validation_filter

            if test_pred:
                corpus_type_test = "WORD/POS"
                test_input_start = 0
                if test_second_half:
                    test_input_start = 2524
                # predicted by MATE trained on tigerHN08 train
                test_input_path = 'res/TIGER/tigerHN08-dev.train.pred_tags.raw'
                test_input_filter = None
            else:
                test_input_path = validation_path
                test_input_filter = test_filter

        if quick:
            # NOTE(review): the factors presumably compensate for the id filters above
            # (8 of 10 ids are training, 1 of 10 dev/test) - confirm
            train_limit = 5000 * 5 // 4
            validation_size = 200 * 10 // 1
            TEST_LIMIT = 200
            test_limit = test_input_limit = TEST_LIMIT * 10 // 1
            if test_pred:
                test_input_limit = TEST_LIMIT + 1
    #
    elif "WSJ" in split:
        # based on Kilian Evang's dptb.tar.bz2

        corpus_type = corpus_type_test = "EXPORT"
        corpus_path_original = "res/WSJ/ptb-discontinuous/dptb7.export"
        corpus_path_km2003 = "res/WSJ/ptb-discontinuous/dptb7-km2003wsj.export"

        # obtain the km2003 version from by running
        # discodop treetransforms --transforms=km2003wsj corpus_path_original corpus_path_km2003

        if "km2003" in split:
            corpus_path = corpus_path_km2003
        else:
            corpus_path = corpus_path_original

        # all sets are ranges of sentence ids within the same file
        train_path = validation_path = test_path = test_input_path = corpus_path
        train_exclude = validation_exclude = test_exclude = test_input_exclude = []
        train_filter = validation_filter = test_filter = test_input_filter = None

        # sections 2-21
        train_start = 3915
        train_limit = 43746

        # section 24
        validation_start = 47863
        validation_size = 49208

        if not dev_mode:
            # section 23
            test_start = test_input_start = 45447
            test_limit = test_input_limit = 47862
        else:
            test_start = test_input_start = validation_start
            test_limit = test_input_limit = validation_size

        if quick:
            train_limit = train_start + 2000
            validation_size = validation_start + 200
            test_limit = test_input_limit = test_start + 200
    else:
        raise ValueError("Unknown split: " + str(split))

    # the *_limit / validation_size values are handed over as the `end` of each resource
    train = CorpusFile(path=train_path, start=train_start, end=train_limit, exclude=train_exclude, filter=train_filter,
                       type=corpus_type)
    dev = CorpusFile(path=validation_path, start=validation_start, end=validation_size, exclude=validation_exclude,
                     filter=validation_filter, type=corpus_type)
    test = CorpusFile(path=test_path, start=test_start, end=test_limit, exclude=test_exclude, filter=test_filter,
                      type=corpus_type)
    # only the test input may have a different corpus type (predicted tags)
    test_input = CorpusFile(path=test_input_path,
                            start=test_input_start,
                            end=test_input_limit,
                            exclude=test_input_exclude,
                            filter=test_input_filter,
                            type=corpus_type_test)

    return train, dev, test, test_input
Esempio n. 9
0
 def __str__(self):
     """Return the string form produced by CorpusFile for this object."""
     description = CorpusFile.__str__(self)
     return description
Esempio n. 10
0
 def init(self):
     """
     Initialize this file first, then the reference file, then every
     secondary file in order.
     """
     CorpusFile.init(self)
     dependent_files = [self.reference] + list(self.secondaries)
     for corpus_file in dependent_files:
         corpus_file.init()
def main2():
    """
    Configure the induction settings for a DOG experiment on a TIGER corpus
    file (POS-based terminal labeling, fanout-limited left-to-right recursive
    partitioning, simple nonterminal labeling) and run it with oracle parsing.
    """
    induction_settings = InductionSettings()

    # --- terminal labeling ---
    induction_settings.terminal_labeling_token = PosTerminals()

    def term_labeling(token):
        # anything that is not a constituent terminal is passed through unchanged
        if not isinstance(token, ConstituentTerminal):
            return token
        labeler = induction_settings.terminal_labeling_token
        return labeler.token_label(token)

    induction_settings.terminal_labeling = term_labeling

    # --- recursive partitioning ---
    def rec_part_strategy(direction, subgrouping, fanout, binarize):
        # build a function that maps a dsg to its fanout-limited partitioning
        if direction == "right-to-left":
            def strategy(dsg):
                partitioning = dsg.recursive_partitioning(subgrouping, weak=binarize)
                return fanout_limited_partitioning(partitioning, fanout)
        else:
            def strategy(dsg):
                partitioning = dsg.recursive_partitioning(subgrouping, weak=binarize)
                return fanout_limited_partitioning_left_to_right(partitioning, fanout)
        return strategy

    induction_settings.binarize = True
    induction_settings.direction = "left-to-right"
    induction_settings.subgrouping = False
    induction_settings.fanout = 1
    induction_settings.rec_part_strategy = rec_part_strategy(
        direction=induction_settings.direction,
        subgrouping=induction_settings.subgrouping,
        fanout=induction_settings.fanout,
        binarize=induction_settings.binarize)

    # --- nonterminal labeling ---
    induction_settings.start = "START"

    def label_edge(edge):
        if not isinstance(edge.label, ConstituentTerminal):
            return edge.label
        return edge.label.pos()

    def stupid_edge(edge):
        return "X"

    def label_child(edge, j):
        return edge.get_function(j)

    # Three interchangeable labeling strategies; only the first one is used below.
    def simple_nonterminal_labeling(nodes, dsg):
        return simple_labeling(nodes, dsg, label_edge)

    def bot_stupid_nonterminal_labeling(nodes, dsg):
        return top_bot_labeling(nodes, dsg, label_edge, stupid_edge)

    def missing_child_nonterminal_labeling(nodes, dsg):
        return missing_child_labeling(nodes, dsg, label_edge, label_child)

    induction_settings.nonterminal_labeling = simple_nonterminal_labeling
    induction_settings.normalize = True

    experiment = DOGExperiment(induction_settings)

    # --- corpora: sentence id ranges for training and testing ---
    training_range = (1, 2000)
    testing_range = (7001, 7200)

    # full corpus: "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    corpus_path = "res/tiger/tiger_8000.xml"
    exclude = []  # currently not passed to any resource

    experiment.resources[TRAINING] = CorpusFile(
        corpus_path, training_range[0], training_range[1])
    experiment.resources[TESTING] = CorpusFile(
        corpus_path, testing_range[0], testing_range[1])

    experiment.oracle_parsing = True
    experiment.purge_rule_freq = None  # 1.0
    experiment.k_best = 100
    experiment.run_experiment()