from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Project-local helpers (data_reader, build_vocab, write_vocab, load_vocab,
# Feature, fasttext_model, cnn_model, rnn_model, han_model, plt_history,
# logger) are assumed to be importable from the surrounding package.


def train_deep_model(model_type='cnn', data_path='', model_save_path='',
                     word_vocab_path='', label_vocab_path='', min_count=1,
                     max_len=300, batch_size=128, nb_epoch=10,
                     embedding_dim=128, hidden_dim=128, col_sep='\t',
                     num_filters=512, filter_sizes='3,4,5', dropout=0.5):
    # read data: text content and labels, delimited by col_sep
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # build and save the word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    write_vocab(word_vocab, word_vocab_path)

    # build and save the label vocab, then map labels to ids
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]

    # one-hot encode the labels
    num_classes = len(set(data_label))
    logger.info('num_classes: %d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('shape of label tensor: %s' % str(data_label.shape))

    # init feature:
    # the HAN model needs a [doc, sentence, dim] feature (rank 3);
    # the other models use a [sentence, dim] feature (rank 2)
    if model_type == 'han':
        logger.info('Hierarchical Attention Network model feature_type must be: doc_vectorize')
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'
    feature = Feature(data=data_content, feature_type=feature_type,
                      word_vocab=word_vocab, max_len=max_len)

    # vectorize the text and split into train/validation sets
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)

    # build the requested model
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        model = cnn_model(max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          num_filters=num_filters,
                          filter_sizes=filter_sizes,
                          num_classes=num_classes,
                          dropout=dropout)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    # checkpoint the best model (by validation accuracy) during training
    cp = ModelCheckpoint(model_save_path, monitor='val_acc', verbose=1,
                         save_best_only=True)

    # fit, save the model, and plot the training history
    history = model.fit(X_train, y_train,
                        batch_size=batch_size,
                        epochs=nb_epoch,
                        validation_data=(X_val, y_val),
                        callbacks=[cp])
    logger.info('save model: %s' % model_save_path)
    plt_history(history, model_name=model_type)
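# A minimal usage sketch for training (the paths below are hypothetical
# placeholders; the data file is assumed to hold one col_sep-delimited
# text/label pair per line):
#
#   train_deep_model(model_type='cnn',
#                    data_path='data/train.txt',
#                    model_save_path='output/cnn_model.h5',
#                    word_vocab_path='output/word_vocab.txt',
#                    label_vocab_path='output/label_vocab.txt',
#                    nb_epoch=10)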
def eval_deep_model(model_type='cnn', data_path='', model_save_path='',
                    word_vocab_path='', label_vocab_path='', min_count=1,
                    max_len=300, batch_size=128, nb_epoch=10,
                    embedding_dim=128, hidden_dim=128, col_sep='\t',
                    num_filters=2, filter_sizes='3,4,5', dropout=0.5):
    # Evaluation counterpart of train_deep_model (renamed here so it no
    # longer shadows the training function): rebuilds the features, loads
    # the trained model, and writes validation predictions to disk.
    # read data: text content and labels, delimited by col_sep
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split(" "))

    # build and save the word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    write_vocab(word_vocab, word_vocab_path)

    # build and save the label vocab, then map labels to ids
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]

    # one-hot encode the labels
    num_classes = len(set(data_label))
    logger.info('num_classes: %d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('shape of label tensor: %s' % str(data_label.shape))

    # init feature (see train_deep_model for the HAN special case)
    if model_type == 'han':
        logger.info('Hierarchical Attention Network model feature_type must be: doc_vectorize')
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'

    # keep only in-vocab tokens in each line (only membership in word_dic
    # matters; the 1-based ids are unused)
    word_dic = {word: idx for idx, word in enumerate(word_vocab, start=1)}
    data_filter = []
    for line in data_content:
        line_filter = " ".join(
            list(filter(lambda x: x in word_dic, line.split(" "))))
        data_filter.append(line_filter)
    feature = Feature(data=data_filter, feature_type=feature_type,
                      word_vocab=word_vocab, max_len=max_len)

    # vectorize the text; random_state=0 reproduces the training split
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)

    # only the cnn branch loads trained weights from model_save_path;
    # the other branches build a fresh (untrained) model
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        model = load_model(model_save_path)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    # loss, accuracy = model.evaluate(X_val, y_val)
    # print(loss, accuracy)
    pre_label = model.predict(X_val, batch_size=32, verbose=0, steps=None)
    logger.info('y_val shape: %s, type: %s' % (str(y_val.shape), type(y_val)))
    # write the true and predicted scores for class id 2 (assumes >= 3 classes)
    with open("./output/result", "w") as f:
        for i in range(len(y_val)):
            f.write("%s\t%f\n" % (y_val[i][2], pre_label[i][2]))
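# A minimal end-to-end sketch, assuming the hypothetical paths below:
# train the cnn model first, then score the held-out split with the
# checkpoint it saved.
if __name__ == '__main__':
    train_deep_model(model_type='cnn',
                     data_path='data/train.txt',
                     model_save_path='output/cnn_model.h5',
                     word_vocab_path='output/word_vocab.txt',
                     label_vocab_path='output/label_vocab.txt')
    eval_deep_model(model_type='cnn',
                    data_path='data/train.txt',
                    model_save_path='output/cnn_model.h5',
                    word_vocab_path='output/word_vocab.txt',
                    label_vocab_path='output/label_vocab.txt')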