def __init__(self, filename):
        initialize(self, locals())

        # the first line of a cvlm feature file is a header of key/value
        # pairs; 'S' records the number of sentences in the corpus
        self.reader = iter(possibly_compressed_file(filename))
        self.header = parse_kv_list(next(self.reader))
        assert 'S' in self.header, "header is missing sentence count 'S'"
        self.num_sentences = self.header['S']
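
This constructor is presumably part of RerankerFeatureCorpus (used in make_merged_feature_values below). A minimal usage sketch, with a made-up filename:

corpus = RerankerFeatureCorpus('train.features.gz')
print 'Corpus contains %s sentences' % corpus.num_sentences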
 def mapping_from_filename(this_class, filename):
     """Reads cvlm feature mapping from a filename. The expected
     format is that each line has an index followed by a tab followed
     by the feature name. Returns a FeatureMapping."""
     mapping = this_class()
     for line in possibly_compressed_file(filename):
         index, name = line.split('\t')
         index = int(index)
         mapping[index] = name.strip()
     return mapping
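
A hypothetical input file and call, following the index-TAB-name format the docstring describes (the feature names and filename here are invented):

# features.map contains tab-separated lines like:
#   0<TAB>NGramTree
#   1<TAB>HeadTree
mapping = FeatureMapping.mapping_from_filename('features.map')
print mapping[0]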
Example #5
    def make_merged_feature_values(self):
        def warn_or_error(message):
            if self.warn_only:
                print "Warning:", message
            else:
                raise ValueError(message)

        self.corpus1 = RerankerFeatureCorpus(self.corpus_filename1)
        self.corpus2 = RerankerFeatureCorpus(self.corpus_filename2)

        if len(self.corpus1) != len(self.corpus2):
            warn_or_error("Corpus 1 has %d sentences, corpus 2 has %d." %
                          (len(self.corpus1), len(self.corpus2)))

        merged_corpus = possibly_compressed_file(self.merged_corpus_filename,
                                                 'w')
        merged_corpus.write(self.corpus1.cvlm_format_header())
        sentence_iter = izip(self.corpus1, self.corpus2)
        if self.verbose:
            print 'Transforming corpora (%d sentences)' % len(self.corpus1)
            sentence_iter = display_index_every_K_items(sentence_iter,
                                                        50,
                                                        format='Sentence %s\n')
        for sentence_index, (sentence1, sentence2) in enumerate(sentence_iter):
            if len(sentence1) != len(sentence2):
                warn_or_error("Sentence %d: Corpus 1 has %d parses, corpus "
                              "2 has %d." %
                              (sentence_index, len(sentence1), len(sentence2)))

            if sentence1.gold_brackets != sentence2.gold_brackets:
                warn_or_error("Sentence %d: Corpus 1 has %d gold brackets, "
                              "corpus 2 has %d." %
                              (sentence_index, sentence1.gold_brackets,
                               sentence2.gold_brackets))

            parse_iter = enumerate(izip(sentence1, sentence2))
            for parse_index, (parse1, parse2) in parse_iter:
                if parse1.proposed_brackets != parse2.proposed_brackets:
                    warn_or_error(
                        "Sentence %d, parse %d: Corpus 1 has %d "
                        "proposed brackets, corpus 2 has %d." %
                        (sentence_index, parse_index, parse1.proposed_brackets,
                         parse2.proposed_brackets))
                if parse1.matched_brackets != parse2.matched_brackets:
                    warn_or_error(
                        "Sentence %d, parse %d: Corpus 1 has %d "
                        "matched brackets, corpus 2 has %d." %
                        (sentence_index, parse_index, parse1.matched_brackets,
                         parse2.matched_brackets))

                # add all features from parse2 to parse1 (after remapping);
                # shifting by self.offset keeps the two corpora's feature
                # indices from colliding
                features = parse1.features
                for index, value in parse2.features.items():
                    features[index + self.offset] = value
            merged_corpus.write(sentence1.cvlm_format())
        merged_corpus.close()
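
Note that izip here is itertools.izip under Python 2. The method assumes self.offset was chosen so corpus 2's feature indices land past corpus 1's; one plausible choice, assuming corpus 1's FeatureMapping is at hand (mapping1 is hypothetical):

# hypothetical: start corpus 2's features one past corpus 1's highest index
offset = max(mapping1.keys()) + 1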
 def weights_from_filename(this_class, filename):
     """Reads cvlm weight vectors from a filename. The expected format
     is that each line has an index followed by an equals sign followed
     by the feature weight (a float). Returns a FeatureMapping."""
     weights = this_class()
     for line in possibly_compressed_file(filename):
         index, weight = line.split('=')
         index = int(index)
         weight = float(weight)
         weights[index] = weight
     return weights
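
A hypothetical weights file and call, following the index=weight format from the docstring (filename invented):

# cvlm.weights contains lines like:
#   0=1.0
#   1=-0.25
weights = FeatureMapping.weights_from_filename('cvlm.weights')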
 def write(self, filename):
     f = possibly_compressed_file(filename, 'w')
     for index in range(len(self)):
         name = self[index]
         f.write('%d\t%s\n' % (index, name))
     f.close()
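
write is the inverse of mapping_from_filename above; note it assumes the indices run contiguously from 0 to len(self) - 1. A round-trip sketch with an invented filename:

mapping.write('features.map.gz')
reloaded = FeatureMapping.mapping_from_filename('features.map.gz')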
Example #11
def train(train_data, dev_data, output_dir, mode='parser',
    train_bin_dir=CURRENT_TRAIN_BIN, original_data=None, verbose=True,
    cat_alternative=None, keep_tempfiles=False):
    """Create a language model / parsing model in output_dir from
    train_data and dev_data.  train_bin_dir is the directory
    containing allScript and the training binaries.  We use original_data
    as our prototype for the model directory, and while most of its
    contents are unimportant, some files like terms.txt are relevant.
    
    To use cat_alternative, you'll need dmcc's version of allScript.
    This lets you specify zcat, bzcat, smartcat, etc. for reading in
    files."""
    if isinstance(train_data, basestring):
        train_data = [train_data]
    if isinstance(dev_data, basestring):
        dev_data = [dev_data]

    assert mode in ('parser', 'lm')
        
    for train_or_dev_filename in train_data + dev_data:
        assert os.path.exists(train_or_dev_filename), \
            "File %s doesn't exist." % train_or_dev_filename

    allScript = os.path.join(train_bin_dir, 'allScript')
    assert os.path.exists(allScript)
    if original_data is None:
        if mode == 'parser':
            original_data = DEFAULT_DATA
        else:
            original_data = DEFAULT_LM
    # output_dir = validate_and_cleanup_datadir_path(output_dir)

    import shutil, commands
    from iterextras import any
    from waterworks.Files import possibly_compressed_file
    # erase the output directory if it exists and remake it from our
    # original_data directory (which should be a clean training of WSJ or
    # switchboard -- it must have the right terms.txt, etc.)
    if output_dir != original_data:
        print "Removing", output_dir
        shutil.rmtree(output_dir, ignore_errors=True)
        print "Copying", original_data, "to", output_dir
        shutil.copytree(original_data, output_dir)

    def compressed_filename(filename):
        filename = filename.lower()
        return filename.endswith('.gz') or filename.endswith('.bz2')

    temp_files = []
    modelbase = "%s." % os.path.basename(output_dir)
    # if there are any compressed files in training, we combine all
    # training data into one uncompressed file
    if any(train_data, compressed_filename):
        temp_train = keepable_tempfile(mode='w', prefix=modelbase,
                                       suffix='.train', keep=True, dir='/ltmp')
        print "Uncompressing and combining training data to", temp_train.name
        for filename in train_data:
            f = possibly_compressed_file(filename)
            for line in f:
                temp_train.write(line)
        temp_train.close()
        train_data = [temp_train.name]
        temp_files.append(temp_train)

    # same for dev files
    if any(dev_data, compressed_filename):
        temp_dev = keepable_tempfile(mode='w', prefix=modelbase,
                                     suffix='.dev', keep=True, dir='/ltmp')
        print "Uncompressing and combining dev data to", temp_dev.name
        for filename in dev_data:
            f = possibly_compressed_file(filename)
            for line in f:
                temp_dev.write(line)
        temp_dev.close()
        dev_data = [temp_dev.name]
        temp_files.append(temp_dev)

    # the repr()s will put quotes around lists of arguments
    cmd = ' '.join([allScript, 
                    '-' + mode, 
                    output_dir, 
                    repr(' '.join(train_data)), 
                    repr(' '.join(dev_data))])
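    # (e.g. repr('a.gz b.gz') yields "'a.gz b.gz'", so a list of filenames
    # reaches allScript as one quoted shell argument)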

    if verbose:
        print "Training command:", repr(cmd)
    
    status, output = commands.getstatusoutput(cmd)

    if verbose:
        print "Output:"
        print "-------"
        print output
        print "-------"

    # store training output
    f = file(os.path.join(output_dir, 'traininglog'), 'a')
    f.write(output)
    f.close()

    if not keep_tempfiles:
        print "Removing temporary training files..."
        for fileobj in temp_files:
            os.remove(fileobj.name)

    if status != 0:
        raise TrainingError("Training script exited with nonzero exit code.")

    warning_messages = ('Exit code: 134', 'Exit code: 137', 'segfault', 'abort',
                        'Could not find', "Assertion `pstStream' failed.")
    for message in warning_messages:
        if message.lower() in output.lower():
            raise TrainingError("Found a warning message in training " + \
                                "output: %r" % message)

    print "Done"
    return output
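
A hypothetical invocation with made-up paths; train_data and dev_data may each be a single filename or a list, and gzipped inputs are uncompressed into temporary files automatically:

log = train(['wsj/sections-02-21.mrg.gz'],
            'wsj/section-24.mrg',
            '/models/wsj-parser',
            mode='parser')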