def train_model(training_data, validating_data, batch_size, max_pad):
    """Build training/validation corpora from two files and train a tagger.

    Configures root logging at DEBUG level, builds a ``SequencePairCorpus``
    from ``training_data``, derives the validation corpus from
    ``validating_data`` through ``train_corpus.make``, and then trains a
    ``SequenceTaggingMachine`` on both corpora.

    Args:
        training_data: Path of the training file; opened as UTF-8 text.
        validating_data: Path of the validation file; opened as UTF-8 text.
        batch_size: Forwarded to ``LearnParam`` as ``batch_size``.
        max_pad: Forwarded to ``LearnParam`` as ``max_pad``.

    Returns:
        None. The trained model object is not returned to the caller.
    """
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    # The same segmenter instance is used for the source and target sides.
    segmenter = CharacterSegmenter()
    train_corpus = SequencePairCorpus(source_with_unk=True, same_length=True)

    # Open the input inside a ``with`` block so the handle is closed
    # deterministically, even if build() raises.
    # NOTE(review): assumes build() consumes the stream before returning -- confirm.
    with codecs.open(training_data, 'r', encoding="utf8") as train_stream:
        train_corpus.build(train_stream, segmenter, segmenter)
    logging.debug("Train corpus built : %s", train_corpus.corpus_size())

    # Id of the "U" tag in the target vocabulary; handed to the model as the
    # unlabeled tag id.
    unlabeled_tag_id = train_corpus.target_corpus.id("U")

    # NOTE(review): assumes make() consumes the stream before returning -- confirm.
    with codecs.open(validating_data, 'r', encoding="utf8") as val_stream:
        val_corpus = train_corpus.make(val_stream, segmenter, segmenter)
    logging.debug("Validate corpus built")

    learning_param = LearnParam(
        num_epoch=25,
        learning_rate=0.05,
        momentum=0.0,
        batch_size=batch_size,
        max_pad=max_pad,
        device=None,
        nworker=None,
    )

    lm = SequenceTaggingMachine(unlabeled_tag_id)

    logging.info("Begin to train ...")
    lm.train(train_corpus, val_corpus, learning_param)
Ejemplo n.º 2
0
def train_model(training_data, validating_data, batch_size, max_pad):
    """Train a ``SequenceTaggingMachine`` from a training and a validation file.

    Steps, in order: configure root logging (DEBUG), build a
    ``SequencePairCorpus`` from ``training_data``, look up the id of the
    ``"U"`` target tag, create the validation corpus from ``validating_data``
    with ``train_corpus.make``, and run ``lm.train``.

    Args:
        training_data: Path of the training file; read as UTF-8.
        validating_data: Path of the validation file; read as UTF-8.
        batch_size: Passed through to ``LearnParam(batch_size=...)``.
        max_pad: Passed through to ``LearnParam(max_pad=...)``.

    Returns:
        None.
    """
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    # One segmenter object serves both the source and the target side.
    segmenter = CharacterSegmenter()
    train_corpus = SequencePairCorpus(source_with_unk=True, same_length=True)

    # Context managers guarantee the file handles are released instead of
    # leaving them to the garbage collector.
    # NOTE(review): relies on build()/make() reading the stream eagerly -- confirm.
    with codecs.open(training_data, 'r', encoding="utf8") as training_file:
        train_corpus.build(training_file, segmenter, segmenter)
    logging.debug("Train corpus built : %s", train_corpus.corpus_size())

    # The "U" tag id is what the tagging machine receives as its
    # unlabeled tag id.
    unlabeled_tag_id = train_corpus.target_corpus.id("U")

    with codecs.open(validating_data, 'r', encoding="utf8") as validating_file:
        val_corpus = train_corpus.make(validating_file, segmenter, segmenter)
    logging.debug("Validate corpus built")

    learning_param = LearnParam(num_epoch=25,
                                learning_rate=0.05,
                                momentum=0.0,
                                batch_size=batch_size,
                                max_pad=max_pad,
                                device=None,
                                nworker=None)

    lm = SequenceTaggingMachine(unlabeled_tag_id)

    logging.info("Begin to train ...")
    lm.train(train_corpus, val_corpus, learning_param)
Ejemplo n.º 3
0
def train_model(training_data, validating_data, batch_size, max_pad, dev, nworker):
    """Train a ``PartialLabeledSenquenceTaggingModel`` on bucketed data.

    Configures root logging at DEBUG level, builds a ``SequencePairCorpus``
    from ``training_data``, wraps the training and validation corpora in
    ``SequenceTaggingProblem`` / ``BucketIter`` objects, and trains the model
    on the two iterators.

    Args:
        training_data: Path of the training file; opened as UTF-8 text.
        validating_data: Path of the validation file; opened as UTF-8 text.
        batch_size: Batch size given to both ``BucketIter`` objects and to
            ``LearnParam``.
        max_pad: Given to ``BucketIter`` as ``max_pad_num`` and to
            ``LearnParam`` as ``max_pad``.
        dev: Forwarded to ``LearnParam`` as ``device``.
        nworker: Forwarded to ``LearnParam`` as ``nworker``.

    Returns:
        None.
    """
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    # The same segmenter instance is used for the source and target sides.
    segmenter = CharacterSegmenter()
    corpus = SequencePairCorpus(source_with_unk=True, same_length=True)

    # Use ``with`` so the file handle is closed deterministically, even when
    # build() raises.
    # NOTE(review): assumes build() consumes the stream before returning -- confirm.
    with codecs.open(training_data, 'r', encoding="utf8") as train_stream:
        corpus.build(train_stream, segmenter, segmenter)

    # Id of the "U" tag; handed to the model as the unlabeled tag id.
    unlabeled_tag_id = corpus.target_corpus.id("U")

    problem = SequenceTaggingProblem(corpus)
    data_train = BucketIter(problem, batch_size, max_pad_num=max_pad)

    # NOTE(review): assumes make() consumes the stream before returning -- confirm.
    with codecs.open(validating_data, 'r', encoding="utf8") as val_stream:
        val_corpus = corpus.make(val_stream, segmenter, segmenter)
    val_problem = SequenceTaggingProblem(val_corpus)
    data_val = BucketIter(val_problem, batch_size, max_pad_num=max_pad)

    arch_param = ArchParam(
        num_hidden=200,
        num_embed=200,
        num_lstm_layer=2,
        input_cell_num=corpus.source_cell_num(),
        output_cell_num=corpus.target_cell_num(),
    )

    learning_param = LearnParam(
        num_epoch=25,
        learning_rate=0.05,
        momentum=0.0,
        batch_size=batch_size,
        max_pad=max_pad,
        device=dev,
        nworker=nworker,
    )

    lm = PartialLabeledSenquenceTaggingModel(arch_param, unlabeled_tag_id)

    # Debugging aid kept from the original: lm.show_shape_info(data_train)

    # Log the ids of the target tags, looked up in the order the message
    # lists them.
    tag_ids = [
        corpus.target_corpus.id(tag) for tag in ("O", "S", "B", "I", "E", "U")
    ]
    logging.debug(
        "O = {0}, S = {1}, B = {2}, I = {3}, E = {4} U = {5}".format(*tag_ids))

    logging.info("Begin to train ...")
    lm.train(data_train, data_val, learning_param)