コード例 #1
0
def proc_parole_de(corpus_path, load_punkt_tokenizer, outf):
    """Crawl the Parole corpus with a Punkt-applying wrapper.

    Builds a ``parole.ApplyPunktWrapper`` around a freshly loaded tokenizer
    and passes the wrapper's ``apply_punkt`` bound method to
    ``parole.parole_crawl``.

    Args:
        corpus_path: Corpus location, forwarded unchanged to
            ``parole.parole_crawl``.
        load_punkt_tokenizer: Zero-argument callable whose return value is
            handed to ``parole.ApplyPunktWrapper`` as the tokenizer.
        outf: Forwarded to ``parole.ApplyPunktWrapper``; presumably the
            output sink for the processed text -- confirm in the wrapper.
    """
    tokenizer = load_punkt_tokenizer()
    wrapper = parole.ApplyPunktWrapper(tokenizer, outf)

    # DEBUG_SGM_LIMIT_PAROLE is a module-level setting passed as the crawl's
    # third argument (presumably a debug cap on SGM files -- confirm).
    parole.parole_crawl(
        corpus_path,
        wrapper.apply_punkt,
        DEBUG_SGM_LIMIT_PAROLE,
    )
コード例 #2
0
def main(verbose=False, debug_sgm_limit=0):
    """Train the Punkt tokenizer on the German Parole corpus.

    The corpus location is read from the ``parole_de`` entry of the
    ``speech`` section in ``.speechrc``.  Once training is finalized, a
    ``PunktSentenceTokenizer`` built from the learned parameters is pickled
    to ``parole.PUNKT_PICKLEFN`` (parent directories are created as needed).

    Args:
        verbose: Log at DEBUG level when true, at INFO level otherwise.
        debug_sgm_limit: Forwarded to ``parole.parole_crawl`` as its third
            argument (presumably 0 means "no limit" -- confirm there).
    """
    init_app('speech_sentences')

    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    config = load_config('.speechrc')
    parole_path = config.get("speech", "parole_de")

    logging.info("training punkt...")

    punkt_trainer = nltk.tokenize.punkt.PunktTrainer()
    train_punkt_wrapper = parole.TrainPunktWrapper(punkt_trainer)

    parole.parole_crawl(parole_path, train_punkt_wrapper.train_punkt,
                        debug_sgm_limit)

    logging.info("finalizing punkt training...")
    # The trainer's own verbose output is always enabled here, independent
    # of this function's `verbose` flag.
    punkt_trainer.finalize_training(verbose=True)
    # Pass values as logging arguments so formatting is deferred to the
    # logging framework instead of being done eagerly with `%`.
    logging.info("punkt training done. %d text segments.",
                 train_punkt_wrapper.punkt_count)

    params = punkt_trainer.get_params()

    parole.PUNKT_PICKLEFN.parent.mkdir(parents=True, exist_ok=True)
    tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(params)
    with open(str(parole.PUNKT_PICKLEFN), mode='wb') as f:
        pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

    logging.info('%s written.', parole.PUNKT_PICKLEFN)
コード例 #3
0
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)


#
# main
#

# Train a Punkt model from the Parole corpus and pickle the resulting
# sentence tokenizer to parole.PUNKT_PICKLEFN.
logging.info("training punkt...")

punkt_trainer = nltk.tokenize.punkt.PunktTrainer()
train_punkt_wrapper = parole.TrainPunktWrapper(punkt_trainer)

parole.parole_crawl(parole_path, train_punkt_wrapper.train_punkt,
                    options.debug_sgm_limit)

logging.info("finalizing punkt training...")
punkt_trainer.finalize_training(verbose=True)
# Pass the count as a logging argument so formatting is deferred to the
# logging framework instead of being done eagerly with `%`.
logging.info("punkt training done. %d text segments.",
             train_punkt_wrapper.punkt_count)

params = punkt_trainer.get_params()

# mkdirs is a project helper; presumably it creates the pickle's parent
# directory when it does not exist yet -- confirm against its definition.
mkdirs(os.path.dirname(parole.PUNKT_PICKLEFN))

tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(params)
with open(str(parole.PUNKT_PICKLEFN), mode='wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)