Esempio n. 1
0
def main(options):
    """Extract n-grams, train an NPLM model and average its null-word embeddings.

    ``options.ngram_size`` is set from ``options.order``.  The
    ``prepareNeuralLM`` binary under ``<nplm_home>/src`` is run on
    ``options.corpus_stem`` and, when ``options.validation_corpus`` is set,
    on the validation text as well.  ``options`` is then handed to
    ``train_nplm.main`` and ``averageNullEmbedding.main`` is run on the model
    file whose suffix is ``options.epochs``.

    Raises:
        Exception: if a ``prepareNeuralLM`` invocation returns a non-zero
            exit status.
    """
    options.ngram_size = options.order

    # Without an explicit output directory everything goes to the working
    # directory; an explicit one is created on demand.
    if options.output_dir is None:
        options.output_dir = options.working_dir
    elif not os.path.exists(options.output_dir):
        os.makedirs(options.output_dir)

    def in_working_dir(name):
        # Resolve a file name relative to the (current) working directory.
        return os.path.join(options.working_dir, name)

    def numberized(text_path):
        # Location of the ".numberized" companion of a text file.
        return in_working_dir(os.path.basename(text_path) + '.numberized')

    def extract_ngrams(text_path, vocab_flag, banner):
        # Build and run one prepareNeuralLM command; non-zero status is fatal.
        argv = [
            os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
            '--train_text', text_path,
            '--ngramize', '1',
            '--ngram_size', str(options.ngram_size),
            '--vocab_size', str(options.vocab_size),
            vocab_flag, in_working_dir(options.words_file),
            '--train_file', numberized(text_path),
        ]
        sys.stderr.write(banner)
        if subprocess.call(argv):
            raise Exception("preparing neural LM failed")

    # The training pass hands the vocabulary path over as --write_words_file,
    # the validation pass as --words_file.
    extract_ngrams(options.corpus_stem, '--write_words_file',
                   'extracting n-grams\n')

    if options.validation_corpus:
        extract_ngrams(options.validation_corpus, '--words_file',
                       'extracting n-grams (validation file)\n')
    else:
        options.validation_file = None

    # Input and output side share one vocabulary.
    options.input_words_file = options.output_words_file = options.words_file
    options.input_vocab_size = options.output_vocab_size = options.vocab_size

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    sys.stderr.write('averaging null words\n')
    model_stem = os.path.join(options.output_dir,
                              options.output_model + '.model.nplm')
    averaging_args = [
        '-i', model_stem + '.' + str(options.epochs),
        '-o', model_stem,
        '-t', numberized(options.corpus_stem),
        '-p', os.path.join(options.nplm_home, 'python'),
    ]
    averageNullEmbedding.main(
        averageNullEmbedding.parser.parse_args(averaging_args))
Esempio n. 2
0
def main(options):
    """Prepare training data, train an NPLM model and average its null embeddings.

    Steps, in order:

    1. Run ``prepareNeuralLM`` (prefixed with ``ssh <train_host>`` when
       ``options.train_host`` is set) on ``options.corpus_stem``, passing the
       vocabulary path as ``--write_words_file`` and
       ``<working_dir>/<corpus basename>.numberized`` as ``--train_file``.
    2. If ``options.dropout`` or ``options.input_dropout`` is set, look up
       the zero-based line index of ``<null>`` in the vocabulary file and
       append ``--null_index`` / ``--dropout`` / ``--input_dropout`` to
       ``options.extra_settings``.  If the token is missing only a warning
       is written and no flags are added.
    3. If ``options.mmap`` is set, run ``createMmap`` on the numberized file.
    4. If ``options.validation_corpus`` is set, run ``prepareNeuralLM`` on it
       with the same vocabulary and set ``options.validation_file``.
    5. Call ``train_nplm.main(options)`` and then
       ``averageNullEmbedding.main`` on the ``.best`` model file if it
       exists, otherwise on the file suffixed with ``options.epochs``.

    Raises:
        subprocess.CalledProcessError: if the training-text extraction
            returns a non-zero exit status.
        Exception: if ``createMmap`` or the validation extraction returns a
            non-zero exit status.
    """
    options.ngram_size = options.order

    if options.output_dir is None:
        options.output_dir = options.working_dir
    # Create dirs if necessary
    if not os.path.exists(options.working_dir):
        os.makedirs(options.working_dir)
    if not os.path.exists(options.output_dir):
        os.makedirs(options.output_dir)

    numberized_file = os.path.basename(options.corpus_stem) + '.numberized'
    vocab_file = os.path.join(options.working_dir, options.words_file)
    train_file = numberized_file
    if options.mmap:
        train_file += '.mmap'

    extraction_cmd = []
    if options.train_host:
        extraction_cmd = ["ssh", options.train_host]
    extraction_cmd += [
        os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
        '--train_text', options.corpus_stem, '--ngramize', '1', '--ngram_size',
        str(options.ngram_size), '--vocab_size',
        str(options.vocab_size), '--write_words_file', vocab_file,
        '--train_file',
        os.path.join(options.working_dir, numberized_file)
    ]

    sys.stderr.write('extracting n-grams\n')
    sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
    subprocess.check_call(extraction_cmd)

    # if dropout enabled, need to check which is the <null> vocab id
    null_id = None
    if options.dropout or options.input_dropout:
        # Compare raw bytes: "<null>" is pure ASCII, so this is equivalent to
        # decoding as UTF-8 first, but it works on both Python 2 and Python 3
        # (text-mode lines have no .decode() on Python 3).  Stripping only
        # the line terminator also keeps a final line without a trailing
        # newline intact.
        with open(vocab_file, 'rb') as vfh:
            for i, line in enumerate(vfh):
                if line.rstrip(b'\r\n') == b'<null>':
                    null_id = i
                    break
        if null_id is None:
            sys.stderr.write(
                "WARN: could not identify null token, cannot enable dropout\n")
        else:
            if not options.extra_settings:
                options.extra_settings = ""
            # At least one kind of dropout is active here, so the null index
            # is always needed.
            options.extra_settings += " --null_index %d " % null_id
            if options.dropout:
                options.extra_settings += " --dropout %s " % options.dropout
            if options.input_dropout:
                options.extra_settings += " --input_dropout %s " % options.input_dropout

    if options.mmap:
        # Remove any previous memory-mapped file; it is fine if none exists.
        try:
            os.remove(os.path.join(options.working_dir, train_file))
        except OSError:
            pass
        mmap_cmd = []
        if options.train_host:
            mmap_cmd = ["ssh", options.train_host]
        mmap_cmd += [
            os.path.join(options.nplm_home, 'src',
                         'createMmap'), '--input_file',
            os.path.join(options.working_dir, numberized_file),
            '--output_file',
            os.path.join(options.working_dir, train_file)
        ]
        sys.stderr.write('creating memory-mapped file\n')
        sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n')
        ret = subprocess.call(mmap_cmd)
        if ret:
            raise Exception("creating memory-mapped file failed")

    if options.validation_corpus:

        extraction_cmd = []
        if options.train_host:
            extraction_cmd = ["ssh", options.train_host]
        # The validation text reuses the vocabulary written above
        # (--words_file instead of --write_words_file).
        extraction_cmd += [
            os.path.join(options.nplm_home, 'src',
                         'prepareNeuralLM'), '--train_text',
            options.validation_corpus, '--ngramize', '1', '--ngram_size',
            str(options.ngram_size), '--vocab_size',
            str(options.vocab_size), '--words_file', vocab_file,
            '--train_file',
            os.path.join(
                options.working_dir,
                os.path.basename(options.validation_corpus) + '.numberized')
        ]

        sys.stderr.write('extracting n-grams (validation file)\n')
        sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
        ret = subprocess.call(extraction_cmd)
        if ret:
            raise Exception("preparing neural LM failed")
        options.validation_file = os.path.join(
            options.working_dir, os.path.basename(options.validation_corpus))

    else:
        options.validation_file = None

    options.input_words_file = vocab_file
    options.output_words_file = vocab_file
    options.input_vocab_size = options.vocab_size
    options.output_vocab_size = options.vocab_size

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    sys.stderr.write('averaging null words\n')
    # Prefer the ".best" model file when it exists; otherwise fall back to
    # the file suffixed with the epoch count.
    output_model_file = os.path.join(options.output_dir,
                                     options.output_model + '.model.nplm.best')
    if not os.path.exists(output_model_file):
        output_model_file = os.path.join(
            options.output_dir,
            options.output_model + '.model.nplm.' + str(options.epochs))
    average_options = averageNullEmbedding.parser.parse_args([
        '-i',
        output_model_file,
        '-o',
        os.path.join(options.output_dir, options.output_model + '.model.nplm'),
        '-t',
        os.path.join(options.working_dir, numberized_file),
        '-p',
        os.path.join(options.nplm_home, 'python'),
    ])
    averageNullEmbedding.main(average_options)
Esempio n. 3
0
def main(options):
    """Run the NPLM pipeline: n-gram extraction, training, null-word averaging.

    The working and output directories are created when missing (the output
    directory defaults to the working directory).  ``prepareNeuralLM`` is run
    on ``options.corpus_stem``; ``createMmap`` is run afterwards when
    ``options.mmap`` is set; ``prepareNeuralLM`` is run again on
    ``options.validation_corpus`` when that is set.  ``options`` is then
    passed to ``train_nplm.main`` and ``averageNullEmbedding.main`` is run on
    the ``.best`` model file, or on the file suffixed with ``options.epochs``
    when no ``.best`` file exists.

    Raises:
        Exception: if any of the external commands returns a non-zero exit
            status.
    """
    options.ngram_size = options.order
    if options.output_dir is None:
        options.output_dir = options.working_dir

    # Make sure both directories exist before anything is written to them.
    for directory in (options.working_dir, options.output_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    work_dir = options.working_dir
    numberized_name = os.path.basename(options.corpus_stem) + ".numberized"
    mmap_name = (numberized_name + ".mmap") if options.mmap else numberized_name

    def run_step(argv, banner, failure):
        # Announce the step, echo the command, and abort on non-zero status.
        sys.stderr.write(banner)
        sys.stderr.write("executing: " + ", ".join(argv) + "\n")
        if subprocess.call(argv):
            raise Exception(failure)

    def prepare_argv(text_path, vocab_flag, target_path):
        # Command line for one prepareNeuralLM invocation.
        return [
            os.path.join(options.nplm_home, "src", "prepareNeuralLM"),
            "--train_text", text_path,
            "--ngramize", "1",
            "--ngram_size", str(options.ngram_size),
            "--vocab_size", str(options.vocab_size),
            vocab_flag, os.path.join(work_dir, options.words_file),
            "--train_file", target_path,
        ]

    numberized_path = os.path.join(work_dir, numberized_name)
    run_step(
        prepare_argv(options.corpus_stem, "--write_words_file", numberized_path),
        "extracting n-grams\n",
        "preparing neural LM failed",
    )

    if options.mmap:
        mmap_path = os.path.join(work_dir, mmap_name)
        # Drop any previous memory-mapped file; a missing file is not an error.
        try:
            os.remove(mmap_path)
        except OSError:
            pass
        run_step(
            [
                os.path.join(options.nplm_home, "src", "createMmap"),
                "--input_file", numberized_path,
                "--output_file", mmap_path,
            ],
            "creating memory-mapped file\n",
            "creating memory-mapped file failed",
        )

    validation_corpus = options.validation_corpus
    if validation_corpus:
        validation_stem = os.path.join(work_dir, os.path.basename(validation_corpus))
        run_step(
            prepare_argv(validation_corpus, "--words_file", validation_stem + ".numberized"),
            "extracting n-grams (validation file)\n",
            "preparing neural LM failed",
        )
        # Stored without the ".numberized" suffix.
        options.validation_file = validation_stem
    else:
        options.validation_file = None

    # Input and output side share one vocabulary file and size.
    vocab_path = os.path.join(options.working_dir, options.words_file)
    options.input_words_file = vocab_path
    options.output_words_file = vocab_path
    options.input_vocab_size = options.vocab_size
    options.output_vocab_size = options.vocab_size

    sys.stderr.write("training neural network\n")
    train_nplm.main(options)

    sys.stderr.write("averaging null words\n")
    model_stem = os.path.join(options.output_dir, options.output_model + ".model.nplm")
    trained_model = model_stem + ".best"
    if not os.path.exists(trained_model):
        # No ".best" file: use the file suffixed with the epoch count.
        trained_model = model_stem + "." + str(options.epochs)
    averaging_args = [
        "-i", trained_model,
        "-o", model_stem,
        "-t", os.path.join(options.working_dir, numberized_name),
        "-p", os.path.join(options.nplm_home, "python"),
    ]
    averageNullEmbedding.main(averageNullEmbedding.parser.parse_args(averaging_args))
Esempio n. 4
0
def main(options):
    """Drive n-gram extraction, NPLM training and null-word averaging.

    Creates the working/output directories if needed, runs
    ``prepareNeuralLM`` on the training text, optionally runs ``createMmap``
    (``options.mmap``) and a second ``prepareNeuralLM`` pass on the
    validation text (``options.validation_corpus``), fills in the
    vocabulary-related fields of ``options``, calls ``train_nplm.main`` and
    finally ``averageNullEmbedding.main``.

    Raises:
        Exception: if an external command returns a non-zero exit status.
    """
    options.ngram_size = options.order

    if options.output_dir is None:
        options.output_dir = options.working_dir
    # Create dirs if necessary
    for needed_dir in (options.working_dir, options.output_dir):
        if not os.path.exists(needed_dir):
            os.makedirs(needed_dir)

    corpus_numberized = os.path.basename(options.corpus_stem) + '.numberized'
    if options.mmap:
        mapped_name = corpus_numberized + '.mmap'
    else:
        mapped_name = corpus_numberized

    def checked_call(banner, command, error_text):
        # Log what is about to run, run it, and turn failure into Exception.
        sys.stderr.write(banner)
        sys.stderr.write('executing: ' + ', '.join(command) + '\n')
        status = subprocess.call(command)
        if status:
            raise Exception(error_text)

    binaries = os.path.join(options.nplm_home, 'src')
    # Arguments shared by the training and validation extraction passes.
    sizing = [
        '--ngramize', '1',
        '--ngram_size', str(options.ngram_size),
        '--vocab_size', str(options.vocab_size),
    ]
    words_path = os.path.join(options.working_dir, options.words_file)
    numberized_path = os.path.join(options.working_dir, corpus_numberized)

    checked_call(
        'extracting n-grams\n',
        [os.path.join(binaries, 'prepareNeuralLM'),
         '--train_text', options.corpus_stem]
        + sizing
        + ['--write_words_file', words_path, '--train_file', numberized_path],
        "preparing neural LM failed")

    if options.mmap:
        mapped_path = os.path.join(options.working_dir, mapped_name)
        # A stale memory-mapped file is removed first; absence is fine.
        try:
            os.remove(mapped_path)
        except OSError:
            pass
        checked_call(
            'creating memory-mapped file\n',
            [os.path.join(binaries, 'createMmap'),
             '--input_file', numberized_path,
             '--output_file', mapped_path],
            "creating memory-mapped file failed")

    if options.validation_corpus:
        validation_name = os.path.basename(options.validation_corpus)
        checked_call(
            'extracting n-grams (validation file)\n',
            [os.path.join(binaries, 'prepareNeuralLM'),
             '--train_text', options.validation_corpus]
            + sizing
            + ['--words_file', words_path,
               '--train_file',
               os.path.join(options.working_dir,
                            validation_name + '.numberized')],
            "preparing neural LM failed")
        options.validation_file = os.path.join(options.working_dir,
                                               validation_name)
    else:
        options.validation_file = None

    # One vocabulary serves both the input and the output side.
    for side in ('input', 'output'):
        setattr(options, side + '_words_file',
                os.path.join(options.working_dir, options.words_file))
    for side in ('input', 'output'):
        setattr(options, side + '_vocab_size', options.vocab_size)

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    sys.stderr.write('averaging null words\n')
    final_model = os.path.join(options.output_dir,
                               options.output_model + '.model.nplm')
    # Use the ".best" file when present, else the epoch-suffixed one.
    source_model = final_model + '.best'
    if not os.path.exists(source_model):
        source_model = final_model + '.' + str(options.epochs)
    average_options = averageNullEmbedding.parser.parse_args([
        '-i', source_model,
        '-o', final_model,
        '-t', os.path.join(options.working_dir, corpus_numberized),
        '-p', os.path.join(options.nplm_home, 'python'),
    ])
    averageNullEmbedding.main(average_options)
Esempio n. 5
0
def main(options):
    """Prepare training data, train an NPLM model and average its null embeddings.

    Steps, in order:

    1. Run ``prepareNeuralLM`` on ``options.corpus_stem``, passing the
       vocabulary path as ``--write_words_file`` and
       ``<working_dir>/<corpus basename>.numberized`` as ``--train_file``.
    2. If ``options.dropout`` or ``options.input_dropout`` is set, look up
       the zero-based line index of ``<null>`` in the vocabulary file and
       append ``--null_index`` / ``--dropout`` / ``--input_dropout`` to
       ``options.extra_settings``.  If the token is missing only a warning
       is written and no flags are added.
    3. If ``options.mmap`` is set, run ``createMmap`` on the numberized file.
    4. If ``options.validation_corpus`` is set, run ``prepareNeuralLM`` on it
       with the same vocabulary and set ``options.validation_file``.
    5. Call ``train_nplm.main(options)`` and then
       ``averageNullEmbedding.main`` on the ``.best`` model file if it
       exists, otherwise on the file suffixed with ``options.epochs``.

    Raises:
        subprocess.CalledProcessError: if the training-text extraction
            returns a non-zero exit status.
        Exception: if ``createMmap`` or the validation extraction returns a
            non-zero exit status.
    """
    options.ngram_size = options.order

    if options.output_dir is None:
        options.output_dir = options.working_dir
    # Create dirs if necessary
    if not os.path.exists(options.working_dir):
        os.makedirs(options.working_dir)
    if not os.path.exists(options.output_dir):
        os.makedirs(options.output_dir)

    numberized_file = os.path.basename(options.corpus_stem) + '.numberized'
    vocab_file = os.path.join(options.working_dir, options.words_file)
    train_file = numberized_file
    if options.mmap:
        train_file += '.mmap'

    extraction_cmd = [
        os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
        '--train_text', options.corpus_stem,
        '--ngramize', '1',
        '--ngram_size', str(options.ngram_size),
        '--vocab_size', str(options.vocab_size),
        '--write_words_file', vocab_file,
        '--train_file', os.path.join(options.working_dir, numberized_file)
        ]

    sys.stderr.write('extracting n-grams\n')
    sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
    subprocess.check_call(extraction_cmd)

    # if dropout enabled, need to check which is the <null> vocab id
    null_id = None
    if options.dropout or options.input_dropout:
        # Compare raw bytes: "<null>" is pure ASCII, so this is equivalent to
        # decoding as UTF-8 first, but it works on both Python 2 and Python 3
        # (text-mode lines have no .decode() on Python 3).  Stripping only
        # the line terminator also keeps a final line without a trailing
        # newline intact.
        with open(vocab_file, 'rb') as vfh:
            for i, line in enumerate(vfh):
                if line.rstrip(b'\r\n') == b'<null>':
                    null_id = i
                    break
        if null_id is None:
            sys.stderr.write("WARN: could not identify null token, cannot enable dropout\n")
        else:
            if not options.extra_settings:
                options.extra_settings = ""
            # At least one kind of dropout is active here, so the null index
            # is always needed.
            options.extra_settings += " --null_index %d " % null_id
            if options.dropout:
                options.extra_settings += " --dropout %s " % options.dropout
            if options.input_dropout:
                options.extra_settings += " --input_dropout %s " % options.input_dropout

    if options.mmap:
        # Remove any previous memory-mapped file; it is fine if none exists.
        try:
            os.remove(os.path.join(options.working_dir, train_file))
        except OSError:
            pass
        mmap_cmd = [
            os.path.join(options.nplm_home, 'src', 'createMmap'),
            '--input_file',
            os.path.join(options.working_dir, numberized_file),
            '--output_file',
            os.path.join(options.working_dir, train_file)
            ]
        sys.stderr.write('creating memory-mapped file\n')
        sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n')
        ret = subprocess.call(mmap_cmd)
        if ret:
            raise Exception("creating memory-mapped file failed")

    if options.validation_corpus:

        # The validation text reuses the vocabulary written above
        # (--words_file instead of --write_words_file).
        extraction_cmd = [
            os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
            '--train_text', options.validation_corpus,
            '--ngramize', '1',
            '--ngram_size', str(options.ngram_size),
            '--vocab_size', str(options.vocab_size),
            '--words_file', vocab_file,
            '--train_file', os.path.join(
                options.working_dir,
                os.path.basename(options.validation_corpus) + '.numberized')
            ]

        sys.stderr.write('extracting n-grams (validation file)\n')
        sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
        ret = subprocess.call(extraction_cmd)
        if ret:
            raise Exception("preparing neural LM failed")
        options.validation_file = os.path.join(
            options.working_dir, os.path.basename(options.validation_corpus))

    else:
        options.validation_file = None

    options.input_words_file = vocab_file
    options.output_words_file = vocab_file
    options.input_vocab_size = options.vocab_size
    options.output_vocab_size = options.vocab_size

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    sys.stderr.write('averaging null words\n')
    # Prefer the ".best" model file when it exists; otherwise fall back to
    # the file suffixed with the epoch count.
    output_model_file = os.path.join(
        options.output_dir,
        options.output_model + '.model.nplm.best')
    if not os.path.exists(output_model_file):
        output_model_file = os.path.join(
            options.output_dir,
            options.output_model + '.model.nplm.' + str(options.epochs))
    average_options = averageNullEmbedding.parser.parse_args([
        '-i', output_model_file,
        '-o', os.path.join(
            options.output_dir, options.output_model + '.model.nplm'),
        '-t', os.path.join(options.working_dir, numberized_file),
        '-p', os.path.join(options.nplm_home, 'python'),
    ])
    averageNullEmbedding.main(average_options)
Esempio n. 6
0
def main(options):
    """Extract n-grams, train an NPLM model, then average null-word embeddings.

    Creates the working/output directories if needed (the output directory
    defaults to the working directory), runs ``prepareNeuralLM`` on the
    training text, optionally runs ``createMmap`` (``options.mmap``) and a
    second ``prepareNeuralLM`` pass on ``options.validation_corpus``, then
    calls ``train_nplm.main(options)`` and ``averageNullEmbedding.main`` on
    the model file suffixed with ``options.epochs``.

    Raises:
        Exception: if an external command returns a non-zero exit status.
    """
    options.ngram_size = options.order

    if options.output_dir is None:
        options.output_dir = options.working_dir
    # Create dirs if necessary
    for required_dir in (options.working_dir, options.output_dir):
        if not os.path.exists(required_dir):
            os.makedirs(required_dir)

    numberized_name = os.path.basename(options.corpus_stem) + '.numberized'
    mmap_name = (numberized_name + '.mmap') if options.mmap else numberized_name

    def in_work(name):
        # Path of a file inside the working directory.
        return os.path.join(options.working_dir, name)

    def tool(name):
        # Path of an NPLM binary under <nplm_home>/src.
        return os.path.join(options.nplm_home, 'src', name)

    def launch(command, banner, error_text):
        # Log the step and the command line, run it, fail on non-zero status.
        sys.stderr.write(banner)
        sys.stderr.write('executing: ' + ', '.join(command) + '\n')
        if subprocess.call(command):
            raise Exception(error_text)

    def extraction(text_path, words_flag, train_path):
        # Command line for one prepareNeuralLM pass.
        return [
            tool('prepareNeuralLM'),
            '--train_text', text_path,
            '--ngramize', '1',
            '--ngram_size', str(options.ngram_size),
            '--vocab_size', str(options.vocab_size),
            words_flag, in_work(options.words_file),
            '--train_file', train_path,
        ]

    launch(
        extraction(options.corpus_stem, '--write_words_file',
                   in_work(numberized_name)),
        'extracting n-grams\n',
        "preparing neural LM failed")

    if options.mmap:
        # Remove any earlier memory-mapped file; ignore it if there is none.
        try:
            os.remove(in_work(mmap_name))
        except OSError:
            pass
        launch(
            [tool('createMmap'),
             '--input_file', in_work(numberized_name),
             '--output_file', in_work(mmap_name)],
            'creating memory-mapped file\n',
            "creating memory-mapped file failed")

    if options.validation_corpus:
        validation_base = os.path.basename(options.validation_corpus)
        launch(
            extraction(options.validation_corpus, '--words_file',
                       in_work(validation_base + '.numberized')),
            'extracting n-grams (validation file)\n',
            "preparing neural LM failed")
        # Stored without the ".numberized" suffix.
        options.validation_file = in_work(validation_base)
    else:
        options.validation_file = None

    # Input and output side share one vocabulary file and size.
    options.input_words_file = in_work(options.words_file)
    options.output_words_file = in_work(options.words_file)
    options.input_vocab_size = options.output_vocab_size = options.vocab_size

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    sys.stderr.write('averaging null words\n')
    averaged_model = os.path.join(options.output_dir,
                                  options.output_model + '.model.nplm')
    averaging_args = [
        '-i', averaged_model + '.' + str(options.epochs),
        '-o', averaged_model,
        '-t', in_work(numberized_name),
        '-p', os.path.join(options.nplm_home, 'python'),
    ]
    averageNullEmbedding.main(
        averageNullEmbedding.parser.parse_args(averaging_args))