Code example #1
import codecs
import json
import logging

import pandas as pd
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Processor, TextCNN, Metrics and grade_map are project-local modules
# (their definitions are not shown in this excerpt).


def train(conf):
    data_train = pd.read_csv(conf['train_file'])
    data_val = pd.read_csv(conf['val_file'])
    processor = Processor(conf)
    processor.init(conf['w2v_path'])
    train_x = processor.get_features(data_train)
    val_x = processor.get_features(data_val)
    labels = conf['labels']
    # Build label <-> index mappings from the first label column and persist
    # the forward map for use at inference time.
    grade2idx, idx2grade = grade_map(data_train[labels[0]].tolist())
    with codecs.open('./data/grade_idx.map', 'w') as f:
        json.dump(grade2idx, f)

    for label in labels:
        train_y = processor.get_labels(data_train, label, grade2idx)
        val_y = processor.get_labels(data_val, label, grade2idx)
        model = TextCNN(conf['num_class'], conf['seq_len'],
                        processor.to_embedding(), conf['num_filters'],
                        conf['filter_sizes']).model
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        # Metrics() is a project-local custom callback (definition not shown).
        mtr = Metrics()
        model_checkpoint = ModelCheckpoint(
            './save_model/{}.krs.save_model'.format(label),
            monitor='val_loss',
            verbose=1,
            save_best_only=True,
            mode='min')
        early_stopping = EarlyStopping(monitor='val_loss',
                                       patience=3,
                                       verbose=1,
                                       mode='min')

        # Keras' model.summary() prints and returns None, so route its
        # output through the logger instead of logging the return value.
        model.summary(print_fn=logging.info)
        logging.info('start training for label: {}'.format(label))
        history = model.fit(x=train_x,
                            y=train_y,
                            batch_size=256,
                            epochs=20,
                            verbose=1,
                            callbacks=[mtr, model_checkpoint, early_stopping],
                            validation_data=(val_x, val_y),
                            shuffle=True)
        logging.info('train history for label: {}'.format(label))
        logging.info(str(history.history))
    logging.info('finished training models for all labels')
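Example #1 calls a project-local grade_map helper whose definition is not shown. A minimal sketch of what such a helper could look like, assuming it simply enumerates the distinct label values in a stable order (only the name and return shape are taken from the call site above; the body is an assumption):

def grade_map(grades):
    # Sketch: map each distinct raw label to a stable integer index,
    # and keep the reverse mapping for decoding predictions.
    uniq = sorted(set(grades))
    grade2idx = {g: i for i, g in enumerate(uniq)}
    idx2grade = {i: g for g, i in grade2idx.items()}
    return grade2idx, idx2grade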
Code example #2
File: run_cnn.py  Project: CoderBinGe/user-portrait2
    # Assumes sklearn's StratifiedKFold and Keras' to_categorical,
    # ModelCheckpoint and EarlyStopping are imported at the top of the file.
    # shuffle=True is required for random_state to take effect.
    f = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for i, (tr, va) in enumerate(f.split(x_pad, y)):
        x_train_age = x_pad[tr]
        x_va_age = x_pad[va]
        y_train_age = y[tr]
        y_va_age = y[va]

        # Convert the integer labels to one-hot vectors
        y_train_age = to_categorical(y_train_age)
        y_va_age = to_categorical(y_va_age)

        print('Building the TextCNN model...')
        max_features = len(word2index) + 1  # vocabulary size
        model = TextCNN(maxlen, max_features, embedding_dims, 7,
                        'softmax').get_model()
        # Specify the optimizer, loss and evaluation metric
        model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

        print('Training...')
        my_callbacks = [
            ModelCheckpoint(model_path + 'cnn_model_age.h5', verbose=1),
            EarlyStopping(monitor='val_accuracy', patience=2, mode='max')
        ]
        # Fit the model to this fold's training split
        history = model.fit(x_train_age,
                            y_train_age,
                            batch_size=batch_size,
                            epochs=epochs,
                            callbacks=my_callbacks,
                            validation_data=(x_va_age, y_va_age))
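Both examples construct the network through a project-local TextCNN wrapper whose source is not included here. A minimal sketch of the usual Kim-style TextCNN behind such a wrapper, using the Keras functional API; the constructor signature follows the call site in example #2, while the filter widths and filter count are assumptions:

from keras.layers import (Concatenate, Conv1D, Dense, Embedding,
                          GlobalMaxPooling1D, Input)
from keras.models import Model

class TextCNN:
    def __init__(self, maxlen, max_features, embedding_dims,
                 class_num, last_activation='softmax'):
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation

    def get_model(self):
        inp = Input(shape=(self.maxlen,))
        x = Embedding(self.max_features, self.embedding_dims)(inp)
        # Parallel convolutions of several widths, each max-pooled over
        # the sequence, then concatenated into one feature vector.
        pooled = []
        for kernel_size in (3, 4, 5):  # widths are an assumption
            c = Conv1D(128, kernel_size, activation='relu')(x)
            pooled.append(GlobalMaxPooling1D()(c))
        x = Concatenate()(pooled)
        out = Dense(self.class_num, activation=self.last_activation)(x)
        return Model(inputs=inp, outputs=out)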
Code example #3
File: main.py  Project: chlee95/test
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def get_data(data_file):
    # Each line of the file is "<label> <token> <token> ...".
    labels, sentences = [], []
    with open(data_file, encoding='utf-8') as f:
        for lines in f:
            line = lines.strip().split()
            labels.append(int(line[0]))
            sentences.append(line[1:])
    return labels, sentences

train_labels, train_sentences = get_data(train_data_file)
test_labels, test_sentences = get_data(test_data_file)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences + test_sentences)
train_X = tokenizer.texts_to_sequences(train_sentences)
test_X = tokenizer.texts_to_sequences(test_sentences)
train_X = pad_sequences(train_X, maxlen=max_len)
test_X = pad_sequences(test_X, maxlen=max_len)
train_y = to_categorical(train_labels, num_classes=6)
test_y = to_categorical(test_labels, num_classes=6)

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

# Load the pretrained embeddings; skip short header/blank lines.
with open(embedding_file, encoding='utf-8') as f:
    embedding_index = dict(get_coefs(*o.strip().split()) for o in f
                           if len(o) > 100)
word_index = tokenizer.word_index
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    if word in embedding_index:
        embedding_matrix[i] = embedding_index[word]
    else:
        # Words missing from the pretrained embeddings get a random vector.
        embedding_matrix[i] = np.random.uniform(-0.25, 0.25, 300)

model = TextCNN(max_len, filter_sizes, num_filters, word_index,
                embedding_matrix).get_model()
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x=train_X, y=train_y, batch_size=128, epochs=10, verbose=2)
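Example #3 stops after training; a natural follow-up is to score the held-out test split prepared above. A short usage sketch (standard Keras evaluate/predict calls; none of this is in the original file):

# Evaluate on the held-out test split.
loss, acc = model.evaluate(test_X, test_y, verbose=0)
print('test loss: {:.4f}, test accuracy: {:.4f}'.format(loss, acc))

# Recover integer class predictions from the softmax output.
pred_y = model.predict(test_X).argmax(axis=-1)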