import codecs
import json
import logging

import pandas as pd
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Processor, TextCNN, Metrics and grade_map are project-local helpers; their import
# paths are not shown in the original, so they are assumed to be available here.

def train(conf):
    data_train = pd.read_csv(conf['train_file'])
    data_val = pd.read_csv(conf['val_file'])

    # Build features from the pretrained word2vec vocabulary.
    processor = Processor(conf)
    processor.init(conf['w2v_path'])
    train_x = processor.get_features(data_train)
    val_x = processor.get_features(data_val)

    # Map grade values to integer indices and persist the mapping for inference.
    labels = conf['labels']
    grade2idx, idx2grade = grade_map(data_train[labels[0]].tolist())
    with codecs.open('./data/grade_idx.map', 'w') as f:
        json.dump(grade2idx, f)

    # Train one model per label column.
    for label in labels:
        train_y = processor.get_labels(data_train, label, grade2idx)
        val_y = processor.get_labels(data_val, label, grade2idx)
        model = TextCNN(conf['num_class'], conf['seq_len'], processor.to_embedding(),
                        conf['num_filters'], conf['filter_sizes']).model
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        mtr = Metrics()
        model_checkpoint = ModelCheckpoint(
            './save_model/{}.krs.save_model'.format(label),
            monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='min')
        # model.summary() prints and returns None, so route it through print_fn
        # instead of logging its (None) return value.
        model.summary(print_fn=logging.info)
        logging.info('start train for label : {}'.format(label))
        history = model.fit(x=train_x, y=train_y, batch_size=256, epochs=20, verbose=1,
                            callbacks=[mtr, model_checkpoint, early_stopping],
                            validation_data=(val_x, val_y), shuffle=True)
        logging.info('train history for label : {}'.format(label))
        # history is a History object; its .history dict holds the per-epoch metrics.
        logging.info(str(history.history))
    logging.info('all labels model train finished')
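# For reference, a minimal sketch of the grade_map helper called above. This is an
# assumption about its behavior (mapping each distinct grade value to an integer
# index and back), not the project's actual implementation:
def grade_map(grades):
    grade2idx = {g: i for i, g in enumerate(sorted(set(grades)))}
    idx2grade = {i: g for g, i in grade2idx.items()}
    return grade2idx, idx2grade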
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import to_categorical

# shuffle=True is required for random_state to take effect (recent scikit-learn
# versions raise an error if random_state is set without it).
f = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
for i, (tr, va) in enumerate(f.split(x_pad, y)):
    x_train_age = x_pad[tr]
    x_va_age = x_pad[va]
    y_train_age = y[tr]
    y_va_age = y[va]
    # Convert integer labels to one-hot vectors.
    y_train_age = to_categorical(y_train_age)
    y_va_age = to_categorical(y_va_age)

    print('Building TextCNN model...')
    max_features = len(word2index) + 1  # vocabulary size
    model = TextCNN(maxlen, max_features, embedding_dims, 7, 'softmax').get_model()
    # Specify the optimizer, loss, and evaluation metric.
    model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

    print('Training...')
    my_callbacks = [
        # Include the fold index in the filename so each fold's checkpoint
        # is kept instead of being overwritten by the next fold.
        ModelCheckpoint(model_path + 'cnn_model_age_fold{}.h5'.format(i), verbose=1),
        EarlyStopping(monitor='val_accuracy', patience=2, mode='max')
    ]
    # Fit on this fold's training split, validating on the held-out split.
    history = model.fit(x_train_age, y_train_age,
                        batch_size=batch_size, epochs=epochs,
                        callbacks=my_callbacks,
                        validation_data=(x_va_age, y_va_age))
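# A possible inference sketch (an addition, not part of the original script): reload
# a fold's checkpoint and map predicted probabilities back to integer age classes.
# x_new is a hypothetical array padded the same way as x_pad.
import numpy as np
from keras.models import load_model

def predict_age(model_file, x_new):
    model = load_model(model_file)
    probs = model.predict(x_new)     # shape (n_samples, 7): class probabilities
    return np.argmax(probs, axis=1)  # back to integer class labels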
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def get_data(data_file):
    # The opening of this function was cut off in the original; the body below is
    # reconstructed from context (each line starts with an integer label followed
    # by the sentence tokens).
    labels, sentences = [], []
    with open(data_file, encoding='utf-8') as fin:
        for line in fin:
            line = line.strip().split()
            labels.append(int(line[0]))
            sentences.append(line[1:])
    return labels, sentences

train_labels, train_sentences = get_data(train_data_file)
test_labels, test_sentences = get_data(test_data_file)

# Build a shared vocabulary over both splits, then index and pad the sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences + test_sentences)
train_X = tokenizer.texts_to_sequences(train_sentences)
test_X = tokenizer.texts_to_sequences(test_sentences)
train_X = pad_sequences(train_X, maxlen=max_len)
test_X = pad_sequences(test_X, maxlen=max_len)
train_y = to_categorical(train_labels, num_classes=6)
test_y = to_categorical(test_labels, num_classes=6)

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

# Parse the pretrained embedding file; the length check skips short header lines.
embedding_index = dict(get_coefs(*o.strip().split())
                       for o in open(embedding_file, encoding='utf-8')
                       if len(o) > 100)

word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    if word in embedding_index:
        embedding_matrix[i] = embedding_index[word]
    else:
        # Randomly initialize out-of-vocabulary words.
        embedding_matrix[i] = np.random.uniform(-0.25, 0.25, 300)

model = TextCNN(max_len, filter_sizes, num_filters, word_index, embedding_matrix).get_model()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(x=train_X, y=train_y, batch_size=128, epochs=10, verbose=2)
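# The original stops after fitting; a minimal evaluation sketch (an addition, not
# in the source) using the test tensors built above:
loss, acc = model.evaluate(test_X, test_y, verbose=0)
print('test loss: {:.4f}, test accuracy: {:.4f}'.format(loss, acc))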