Example #1
# NOTE: assumed imports; data_reader, build_vocab, write_vocab, load_vocab,
# Feature, the model builders (fasttext_model, cnn_model, rnn_model,
# han_model), plt_history and logger come from the surrounding project.
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

def train_deep_model(model_type='cnn',
                     data_path='',
                     model_save_path='',
                     word_vocab_path='',
                     label_vocab_path='',
                     min_count=1,
                     max_len=300,
                     batch_size=128,
                     nb_epoch=10,
                     embedding_dim=128,
                     hidden_dim=128,
                     col_sep='\t',
                     num_filters=512,
                     filter_sizes='3,4,5',
                     dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    write_vocab(word_vocab, word_vocab_path)

    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes: %d', num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of Label Tensor: %s', data_label.shape)

    # init feature
    # the HAN model needs a [doc, sentence] feature (3-D); the others use a [sentence] feature (2-D)
    if model_type == 'han':
        logger.info(
            'Hierarchical Attention Network model feature_type must be: doc_vectorize'
        )
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'
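    # Assumed shapes, per the comment above: 'vectorize' yields a
    # (n_samples, max_len) matrix; 'doc_vectorize' yields a
    # (n_samples, n_sentences, max_len) tensor for HAN.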
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      word_vocab=word_vocab,
                      max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        model = cnn_model(max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          num_filters=num_filters,
                          filter_sizes=filter_sizes,
                          num_classses=num_classes,  # sic: matches cnn_model's parameter spelling
                          dropout=dropout)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    cp = ModelCheckpoint(model_save_path,
                         monitor='val_acc',
                         verbose=1,
                         save_best_only=True)
    # fit and save model
    history = model.fit(X_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=nb_epoch,
                        validation_data=(X_val, y_val),
                        callbacks=[cp])
    logger.info('save model: %s', model_save_path)
    plt_history(history, model_name=model_type)
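
A minimal usage sketch (the paths below are placeholders, not from the original):

if __name__ == '__main__':
    train_deep_model(model_type='cnn',
                     data_path='data/train.txt',
                     model_save_path='output/cnn_model.h5',
                     word_vocab_path='output/word_vocab.txt',
                     label_vocab_path='output/label_vocab.txt',
                     nb_epoch=10)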
Example #2
# NOTE: assumed imports; data_reader, build_vocab, write_vocab, load_vocab,
# Feature, the model builders and logger again come from the surrounding project.
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import load_model

def train_deep_model(model_type='cnn',
                     data_path='',
                     model_save_path='',
                     word_vocab_path='',
                     label_vocab_path='',
                     min_count=1,
                     max_len=300,
                     batch_size=128,
                     nb_epoch=10,
                     embedding_dim=128,
                     hidden_dim=128,
                     col_sep='\t',
                     num_filters=2,
                     filter_sizes='3,4,5',
                     dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split(" "))

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    write_vocab(word_vocab, word_vocab_path)

    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes: %d', num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of Label Tensor: %s', data_label.shape)

    # init feature
    # the HAN model needs a [doc, sentence] feature (3-D); the others use a [sentence] feature (2-D)
    if model_type == 'han':
        logger.info(
            'Hierarchical Attention Network model feature_type must be: doc_vectorize'
        )
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'

    # map each vocab word to a 1-based id, then drop out-of-vocabulary tokens
    word_dic = {word: idx for idx, word in enumerate(word_vocab, start=1)}
    data_filter = [
        " ".join(token for token in line.split(" ") if token in word_dic)
        for line in data_content
    ]
    feature = Feature(data=data_filter,
                      feature_type=feature_type,
                      word_vocab=word_vocab,
                      max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        # unlike Example #1, the 'cnn' branch loads a previously saved model
        model = load_model(model_save_path)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    # loss, accuracy = model.evaluate(X_val, y_val)
    # print(loss, accuracy)
    pre_label = model.predict(X_val, batch_size=32, verbose=0, steps=None)
    print(y_val)
    print(type(y_val))
    # write the one-hot label and predicted score of class index 2
    # (assumes at least three classes); the with block closes the file
    with open("./output/result", "w") as f:
        for i in range(len(y_val)):
            f.write("%s\t%f\n" % (y_val[i][2], pre_label[i][2]))
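
A minimal usage sketch for this variant (paths are placeholders; with
model_type='cnn' it expects an already trained model saved at model_save_path):

if __name__ == '__main__':
    train_deep_model(model_type='cnn',
                     data_path='data/test.txt',
                     model_save_path='output/cnn_model.h5',
                     word_vocab_path='output/word_vocab.txt',
                     label_vocab_path='output/label_vocab.txt')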