Esempio n. 1
0
 def make_merged_mapping(self):
     """Build the combined feature mapping and write it to disk.

     The merged mapping is constructed from ``self.mapping1``; every
     ``(index, name)`` entry of ``self.mapping2`` is then added under
     ``index + len(self.mapping1)``.  That shift is kept in
     ``self.offset`` and the result in ``self.merged_mapping``, which is
     finally written to ``self.merged_mapping_filename``.
     """
     combined = FeatureMapping(self.mapping1)
     self.merged_mapping = combined

     # Every feature index coming from the second mapping is moved up by
     # this amount.
     shift = len(self.mapping1)
     self.offset = shift

     for old_index, feature_name in self.mapping2.items():
         combined[old_index + shift] = feature_name

     if self.verbose:
         # Single-argument parenthesised print: same output on Python 2.
         print('Merged mapping: %d features.' % len(combined))
     combined.write(self.merged_mapping_filename)
Esempio n. 2
0
 def read_mappings(self):
     """Load both feature mappings in place of their filenames.

     On entry ``self.mapping1`` and ``self.mapping2`` hold the values
     handed to ``FeatureMapping.mapping_from_filename``; on exit each
     attribute holds that call's result instead.  When ``self.verbose``
     is true, the source and the number of entries of each mapping are
     printed.
     """
     verbose = self.verbose

     source1 = self.mapping1
     if verbose:
         print('Reading mapping 1 (%s)' % source1)
     self.mapping1 = FeatureMapping.mapping_from_filename(source1)
     if verbose:
         print('Mapping 1: %d features.' % len(self.mapping1))

     source2 = self.mapping2
     if verbose:
         print('Reading mapping 2 (%s)' % source2)
     self.mapping2 = FeatureMapping.mapping_from_filename(source2)
     if verbose:
         print('Mapping 2: %d features.' % len(self.mapping2))
Esempio n. 3
0
 def read_mappings(self):
     """Replace the two mapping filenames on ``self`` by loaded mappings.

     ``self.mapping1`` and then ``self.mapping2`` are each passed to
     ``FeatureMapping.mapping_from_filename`` and overwritten with the
     returned object.  Progress lines (source and feature count) are
     printed only when ``self.verbose`` is true.
     """
     chatty = self.verbose

     # First mapping.
     first_source = self.mapping1
     if chatty:
         print('Reading mapping 1 (%s)' % first_source)
     self.mapping1 = FeatureMapping.mapping_from_filename(first_source)
     if chatty:
         print('Mapping 1: %d features.' % len(self.mapping1))

     # Second mapping.
     second_source = self.mapping2
     if chatty:
         print('Reading mapping 2 (%s)' % second_source)
     self.mapping2 = FeatureMapping.mapping_from_filename(second_source)
     if chatty:
         print('Mapping 2: %d features.' % len(self.mapping2))
Esempio n. 4
0
 def make_merged_mapping(self):
     """Create, store and write the mapping covering both inputs.

     Starts from ``FeatureMapping(self.mapping1)`` and inserts each entry
     of ``self.mapping2`` with its index increased by
     ``len(self.mapping1)``.  Sets ``self.merged_mapping`` and
     ``self.offset`` and writes the mapping to
     ``self.merged_mapping_filename``.
     """
     merged = FeatureMapping(self.mapping1)
     self.merged_mapping = merged

     # Amount by which all feature indices from the second corpus are
     # shifted.
     index_shift = len(self.mapping1)
     self.offset = index_shift

     for source_index, name in self.mapping2.items():
         shifted_index = source_index + index_shift
         merged[shifted_index] = name

     if self.verbose:
         print('Merged mapping: %d features.' % len(merged))
     merged.write(self.merged_mapping_filename)
Esempio n. 5
0
class Merger:
    """Merge two reranker feature corpora and their feature mappings.

    Constructing an instance performs the whole merge: both feature
    mappings are read, a combined mapping is written to
    ``merged_mapping_filename``, and a combined corpus is written to
    ``merged_corpus_filename``.  Feature indices that come from the
    second mapping/corpus are shifted up by ``len(mapping1)``.
    """

    def __init__(self,
                 corpus_filename1,
                 corpus_filename2,
                 mapping1,
                 mapping2,
                 merged_corpus_filename,
                 merged_mapping_filename,
                 verbose=True,
                 warn_only=False):
        """Run the merge.

        Args:
            corpus_filename1: passed to ``RerankerFeatureCorpus`` to open
                the first corpus.
            corpus_filename2: passed to ``RerankerFeatureCorpus`` to open
                the second corpus.
            mapping1: passed to ``FeatureMapping.mapping_from_filename``
                to load the first mapping.
            mapping2: same, for the second mapping.
            merged_corpus_filename: where the merged corpus is written.
            merged_mapping_filename: where the merged mapping is written.
            verbose: when true, print progress messages.
            warn_only: when true, inconsistencies between the two corpora
                are printed as warnings; otherwise they raise
                ``ValueError``.
        """
        # NOTE(review): initialize() is defined elsewhere; presumably it
        # copies every entry of locals() onto self, since the methods
        # below read the arguments as attributes -- confirm.
        initialize(self, locals())
        self.read_mappings()
        self.make_merged_mapping()
        self.make_merged_feature_values()

    def read_mappings(self):
        """Load both mappings, overwriting the filenames stored on self."""
        if self.verbose:
            print('Reading mapping 1 (%s)' % self.mapping1)
        self.mapping1 = FeatureMapping.mapping_from_filename(self.mapping1)
        if self.verbose:
            print('Mapping 1: %d features.' % len(self.mapping1))
            print('Reading mapping 2 (%s)' % self.mapping2)
        self.mapping2 = FeatureMapping.mapping_from_filename(self.mapping2)
        if self.verbose:
            print('Mapping 2: %d features.' % len(self.mapping2))

    def make_merged_mapping(self):
        """Build ``self.merged_mapping`` and write it to disk.

        Also sets ``self.offset``, the shift applied to every feature
        index that originates from the second mapping/corpus.
        """
        self.merged_mapping = FeatureMapping(self.mapping1)
        # amount that we shift all the feature indices in corpus2 by
        self.offset = len(self.mapping1)
        for index, name in self.mapping2.items():
            new_index = index + self.offset
            self.merged_mapping[new_index] = name
        if self.verbose:
            print('Merged mapping: %d features.' % len(self.merged_mapping))
        self.merged_mapping.write(self.merged_mapping_filename)

    def make_merged_feature_values(self):
        """Write the merged corpus to ``self.merged_corpus_filename``.

        The two corpora are walked in lockstep.  For each pair of parses
        the features of the second parse are added to the first parse's
        feature dict under ``index + self.offset``, and the (mutated)
        sentence from corpus 1 is written out.

        Raises:
            ValueError: if the corpora disagree on the number of
                sentences, the number of parses in a sentence, or a
                gold/proposed/matched bracket count, and
                ``self.warn_only`` is false.
        """
        def warn_or_error(message):
            if self.warn_only:
                # Same output as the Python 2 statement
                # ``print "Warning:", message``.
                print("Warning: %s" % (message,))
            else:
                raise ValueError(message)

        self.corpus1 = RerankerFeatureCorpus(self.corpus_filename1)
        self.corpus2 = RerankerFeatureCorpus(self.corpus_filename2)

        if len(self.corpus1) != len(self.corpus2):
            warn_or_error("Corpus 1 has %d sentences, corpus 2 has %d." %
                          (len(self.corpus1), len(self.corpus2)))

        merged_corpus = possibly_compressed_file(self.merged_corpus_filename,
                                                 'w')
        # Always close the output, including when warn_or_error raises
        # part-way through, so buffered (possibly compressed) data is
        # flushed and the handle is not leaked.
        try:
            merged_corpus.write(self.corpus1.cvlm_format_header())
            # NOTE(review): assuming izip is itertools.izip, iteration
            # stops at the shorter input, so with warn_only set any extra
            # sentences/parses in the longer corpus are dropped.
            sentence_iter = izip(self.corpus1, self.corpus2)
            if self.verbose:
                print('Transforming corpora (%d sentences)' %
                      len(self.corpus1))
                sentence_iter = display_index_every_K_items(
                    sentence_iter, 50, format='Sentence %s\n')
            for sentence_index, (sentence1, sentence2) in \
                    enumerate(sentence_iter):
                if len(sentence1) != len(sentence2):
                    warn_or_error("Sentence %d: Corpus 1 has %d parses, "
                                  "corpus 2 has %d." %
                                  (sentence_index, len(sentence1),
                                   len(sentence2)))

                if sentence1.gold_brackets != sentence2.gold_brackets:
                    warn_or_error("Sentence %d: Corpus 1 has %d gold "
                                  "brackets, corpus 2 has %d." %
                                  (sentence_index, sentence1.gold_brackets,
                                   sentence2.gold_brackets))

                parse_iter = enumerate(izip(sentence1, sentence2))
                for parse_index, (parse1, parse2) in parse_iter:
                    if parse1.proposed_brackets != parse2.proposed_brackets:
                        warn_or_error(
                            "Sentence %d, parse %d: Corpus 1 has %d "
                            "proposed brackets, corpus 2 has %d." %
                            (sentence_index, parse_index,
                             parse1.proposed_brackets,
                             parse2.proposed_brackets))
                    if parse1.matched_brackets != parse2.matched_brackets:
                        warn_or_error(
                            "Sentence %d, parse %d: Corpus 1 has %d "
                            "matched brackets, corpus 2 has %d." %
                            (sentence_index, parse_index,
                             parse1.matched_brackets,
                             parse2.matched_brackets))

                    # add all features from parse2 to parse1 (after
                    # remapping)
                    features = parse1.features
                    for index, value in parse2.features.items():
                        features[index + self.offset] = value
                merged_corpus.write(sentence1.cvlm_format())
        finally:
            merged_corpus.close()
Esempio n. 6
0
class Merger:
    """Combine two reranker feature corpora (and mappings) into one.

    All work is done by the constructor: it reads the two feature
    mappings, writes a merged mapping to ``merged_mapping_filename`` and
    writes a merged corpus to ``merged_corpus_filename``.  Indices of
    features taken from the second mapping/corpus are increased by
    ``len(mapping1)``.
    """

    def __init__(self, corpus_filename1, corpus_filename2, mapping1, mapping2,
                 merged_corpus_filename, merged_mapping_filename,
                 verbose=True, warn_only=False):
        """Run the merge.

        Args:
            corpus_filename1: passed to ``RerankerFeatureCorpus`` to open
                the first corpus.
            corpus_filename2: passed to ``RerankerFeatureCorpus`` to open
                the second corpus.
            mapping1: passed to ``FeatureMapping.mapping_from_filename``
                to load the first mapping.
            mapping2: same, for the second mapping.
            merged_corpus_filename: where the merged corpus is written.
            merged_mapping_filename: where the merged mapping is written.
            verbose: when true, print progress messages.
            warn_only: when true, mismatches between the corpora are
                printed as warnings; otherwise they raise ``ValueError``.
        """
        # NOTE(review): initialize() is defined elsewhere; presumably it
        # copies every entry of locals() onto self, since the methods
        # below read the arguments as attributes -- confirm.
        initialize(self, locals())
        self.read_mappings()
        self.make_merged_mapping()
        self.make_merged_feature_values()

    def read_mappings(self):
        """Load both mappings, overwriting the filenames stored on self."""
        if self.verbose:
            print('Reading mapping 1 (%s)' % self.mapping1)
        self.mapping1 = FeatureMapping.mapping_from_filename(self.mapping1)
        if self.verbose:
            print('Mapping 1: %d features.' % len(self.mapping1))
            print('Reading mapping 2 (%s)' % self.mapping2)
        self.mapping2 = FeatureMapping.mapping_from_filename(self.mapping2)
        if self.verbose:
            print('Mapping 2: %d features.' % len(self.mapping2))

    def make_merged_mapping(self):
        """Build ``self.merged_mapping`` and write it to disk.

        Also sets ``self.offset``, the shift applied to every feature
        index that originates from the second mapping/corpus.
        """
        self.merged_mapping = FeatureMapping(self.mapping1)
        # amount that we shift all the feature indices in corpus2 by
        self.offset = len(self.mapping1)
        for index, name in self.mapping2.items():
            new_index = index + self.offset
            self.merged_mapping[new_index] = name
        if self.verbose:
            print('Merged mapping: %d features.' % len(self.merged_mapping))
        self.merged_mapping.write(self.merged_mapping_filename)

    def make_merged_feature_values(self):
        """Write the merged corpus to ``self.merged_corpus_filename``.

        Sentences and parses of the two corpora are paired up in order.
        Each feature of the second parse is stored into the first parse's
        feature dict under ``index + self.offset``; the updated sentence
        from corpus 1 is then written to the output.

        Raises:
            ValueError: if the corpora disagree on the number of
                sentences, the number of parses in a sentence, or a
                gold/proposed/matched bracket count, and
                ``self.warn_only`` is false.
        """
        def warn_or_error(message):
            if self.warn_only:
                # Same output as the Python 2 statement
                # ``print "Warning:", message``.
                print("Warning: %s" % (message,))
            else:
                raise ValueError(message)

        self.corpus1 = RerankerFeatureCorpus(self.corpus_filename1)
        self.corpus2 = RerankerFeatureCorpus(self.corpus_filename2)

        if len(self.corpus1) != len(self.corpus2):
            warn_or_error("Corpus 1 has %d sentences, corpus 2 has %d." %
                          (len(self.corpus1), len(self.corpus2)))

        merged_corpus = possibly_compressed_file(self.merged_corpus_filename,
                                                 'w')
        # Close the output on every exit path (normal completion or a
        # ValueError from warn_or_error) so buffered, possibly compressed,
        # data is flushed and the handle is released.
        try:
            merged_corpus.write(self.corpus1.cvlm_format_header())
            # NOTE(review): assuming izip is itertools.izip, pairing stops
            # at the shorter input, so with warn_only set any surplus
            # sentences/parses in the longer corpus are dropped.
            sentence_iter = izip(self.corpus1, self.corpus2)
            if self.verbose:
                print('Transforming corpora (%d sentences)' %
                      len(self.corpus1))
                sentence_iter = display_index_every_K_items(
                    sentence_iter, 50, format='Sentence %s\n')
            for sentence_index, (sentence1, sentence2) in \
                    enumerate(sentence_iter):
                if len(sentence1) != len(sentence2):
                    warn_or_error("Sentence %d: Corpus 1 has %d parses, "
                                  "corpus 2 has %d." %
                                  (sentence_index, len(sentence1),
                                   len(sentence2)))

                if sentence1.gold_brackets != sentence2.gold_brackets:
                    warn_or_error("Sentence %d: Corpus 1 has %d gold "
                                  "brackets, corpus 2 has %d." %
                                  (sentence_index,
                                   sentence1.gold_brackets,
                                   sentence2.gold_brackets))

                parse_iter = enumerate(izip(sentence1, sentence2))
                for parse_index, (parse1, parse2) in parse_iter:
                    if parse1.proposed_brackets != parse2.proposed_brackets:
                        warn_or_error("Sentence %d, parse %d: Corpus 1 "
                                      "has %d proposed brackets, corpus 2 "
                                      "has %d." %
                                      (sentence_index, parse_index,
                                       parse1.proposed_brackets,
                                       parse2.proposed_brackets))
                    if parse1.matched_brackets != parse2.matched_brackets:
                        warn_or_error("Sentence %d, parse %d: Corpus 1 "
                                      "has %d matched brackets, corpus 2 "
                                      "has %d." %
                                      (sentence_index, parse_index,
                                       parse1.matched_brackets,
                                       parse2.matched_brackets))

                    # add all features from parse2 to parse1 (after
                    # remapping)
                    features = parse1.features
                    for index, value in parse2.features.items():
                        features[index + self.offset] = value
                merged_corpus.write(sentence1.cvlm_format())
        finally:
            merged_corpus.close()