def load_dataset(test_size=0.2):
    """Load the word2vec table and labeled train data, embed each text as a
    sequence of word vectors, and return a shuffled train/validation split.

    :param test_size: fraction of samples held out for validation (default 0.2)
    :return: (X_train, y_train, X_valid, y_valid) — note the y-before-valid order
    """
    ## load word vector
    with utils.timer('Load word vector'):
        word2vec = tl.files.load_npy_to_any(
            name='%s/word2vec/w2v_sgns_%s_%s_%s.npy' % (config.ModelOutputDir,
                                                        config.embedding_size,
                                                        config.corpus_version,
                                                        datestr))
    ## load train data
    with utils.timer('Load train data'):
        data_1 = utils.load_cs_deleted_data(cs_delete_file_1)
        print('target ratio: ')
        print(data_1['label'].value_counts())
        data_2 = utils.load_58_data(pos_58_file)
        print(data_2['label'].value_counts())
        data_3 = utils.load_58_data(neg_58_file)
        print(data_3['label'].value_counts())
        data_4 = utils.load_cs_deleted_data(cs_delete_file_2)
        print(data_4['label'].value_counts())
        # only the positive samples of the second cs-deleted source are kept
        data = pd.concat([data_1, data_2, data_3,
                          data_4[data_4['label'] == 1].reset_index(drop=True)],
                         axis=0, ignore_index=True)
        DebugDir = '%s/debug' % config.DataBaseDir
        # idiomatic replacement for `os.path.exists(...) == False` + makedirs
        os.makedirs(DebugDir, exist_ok=True)
        del data_4, data_3, data_2, data_1
        gc.collect()
    ## data representation
    with utils.timer('representation for train'):
        X = []
        y = []
        # iterate text/label pairs together instead of indexing by position;
        # safe because ignore_index=True gives a clean 0..n-1 index
        for text, label in zip(data['text'].values, data['label'].values):
            if text == '':
                continue
            words = utils.cut(text)
            if not words:
                continue
            # unknown words fall back to the '_UNK' embedding
            X.append([word2vec.get(w, word2vec['_UNK']) for w in words])
            y.append(label)
        del word2vec, data
        gc.collect()
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size)
    return X_train, y_train, X_valid, y_valid
def LoadTrainData():
    """Load the four labeled training sources, print their label
    distributions, and return them concatenated into a single DataFrame."""
    with utils.timer('Load train data'):
        # (loader, path) pairs, in the original loading order
        sources = [
            (utils.load_cs_deleted_data, cs_delete_file_1),
            (utils.load_58_data, pos_58_file),
            (utils.load_58_data, neg_58_file),
            (utils.load_cs_deleted_data, cs_delete_file_2),
        ]
        frames = []
        for idx, (loader, path) in enumerate(sources):
            frame = loader(path)
            if idx == 0:
                print('target ratio: ')
            print(frame['label'].value_counts())
            frames.append(frame)
        data = pd.concat(frames, axis=0, ignore_index=True)
        del frames
        gc.collect()
    return data
# NOTE(review): this chunk is a fragment — the dict entry below is the tail of a
# parameter dict whose opening brace lies outside the visible region, and the
# __main__ block is cut off right after `hit_words = []`.
'subsample': .9,
}

if __name__ == '__main__':
    ''''''
    ## load the pretrained word2vec lookup table (word -> embedding vector)
    with utils.timer('Load word vector'):
        word2vec = tl.files.load_npy_to_any(
            name='%s/model/word2vec_post_text_3d.npy' % config.DataBaseDir)
    ## load the three labeled data sources and stack them into one frame
    with utils.timer('Load data'):
        data_1 = utils.load_cs_deleted_data(cs_delete_file)
        print('target ratio: ')
        print(data_1['label'].value_counts())
        data_2 = utils.load_58_data(pos_58_file)
        print(data_2['label'].value_counts())
        data_3 = utils.load_58_data(neg_58_file)
        print(data_3['label'].value_counts())
        data = pd.concat([data_1, data_2, data_3], axis=0, ignore_index=True)
        DebugDir = '%s/debug' % config.DataBaseDir
        if (os.path.exists(DebugDir) == False): os.makedirs(DebugDir)
        # dump the raw concatenated data to Excel for manual inspection
        writer = pd.ExcelWriter('%s/raw.xlsx' % DebugDir)
        data.to_excel(writer, index=False)
        writer.close()
        del data_3, data_2, data_1
        gc.collect()
    ## representation — continues beyond the visible chunk
    hit_words = []
def train_test_and_save_model():
    """Train the FastText classifier with repeated stratified K-fold CV,
    print per-epoch validation metrics and per-run out-of-fold scores, and
    save the trained weights to MODEL_FILE_PATH.

    Relies on module-level globals: the data file paths, VOCAB_SIZE,
    N_BUCKETS, N_GRAM, EMBEDDING_SIZE, N_EPOCH, BATCH_SIZE, MODEL_FILE_PATH,
    config, utils, FastTextClassifier.
    """
    ## load data
    with utils.timer('Load data'):
        data_1 = utils.load_cs_deleted_data(cs_delete_file)
        print('target ratio: ')
        print(data_1['label'].value_counts())
        data_2 = utils.load_58_data(pos_58_file)
        print(data_2['label'].value_counts())
        data_3 = utils.load_58_data(neg_58_file)
        print(data_3['label'].value_counts())
        data = pd.concat([data_1, data_2, data_3], axis=0, ignore_index=True)
        DebugDir = '%s/debug' % config.DataBaseDir
        # idiomatic replacement for `os.path.exists(...) == False` + makedirs
        os.makedirs(DebugDir, exist_ok=True)
        del data_3, data_2, data_1
        gc.collect()
    ## tokenize and map every word to a dense integer id
    X_raw_words = data['text'].apply(utils.cut)
    uni_words = list(set(w for rec in X_raw_words for w in rec))
    word_dict = dict(zip(uni_words, range(len(uni_words))))
    X_words = [[word_dict[w] for w in rec] for rec in X_raw_words]
    y = np.array(data['label'])
    if N_GRAM is not None:
        # hash-augment with n-gram ids; also converts to a (ragged) ndarray so
        # fancy indexing by fold indices works below
        X_words = np.array([augment_with_ngrams(x, VOCAB_SIZE, N_BUCKETS, n=N_GRAM)
                            for x in X_words])
    print(X_words.shape)
    print(y.shape)
    print(X_words[:5])
    print(y[:5])
    # running mean of out-of-fold predictions across the repeated runs
    final_train_pred = np.zeros(len(X_words))
    for s in range(config.train_times):
        s_start = time.time()
        train_pred = np.zeros(len(X_words))
        classifier = FastTextClassifier(
            vocab_size=VOCAB_SIZE + N_BUCKETS,
            embedding_size=EMBEDDING_SIZE,
            n_labels=2,
        )
        # BUG FIX: random_state only takes effect when shuffle=True (sklearn
        # >= 0.24 raises ValueError otherwise); with shuffle=False every
        # repetition of the outer loop produced identical folds, making the
        # per-run seed 2018 * s meaningless.
        skf = StratifiedKFold(config.kfold, shuffle=True, random_state=2018 * s)
        for fold, (train_index, valid_index) in enumerate(skf.split(X_words, y)):
            X_train, X_valid = X_words[train_index], X_words[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
            with tf.Session() as sess:
                sess.run(tf.local_variables_initializer())
                tl.layers.initialize_global_variables(sess)
                for epoch in range(N_EPOCH):
                    start_time = time.time()
                    print('Epoch %d/%d' % (epoch + 1, N_EPOCH))
                    for X_batch, y_batch in tl.iterate.minibatches(
                            X_train, y_train, batch_size=BATCH_SIZE, shuffle=True):
                        sess.run(
                            classifier.train_op, feed_dict={
                                classifier.inputs: tl.prepro.pad_sequences(X_batch),
                                classifier.labels: y_batch,
                            }
                        )
                    # evaluate on the held-out fold after each epoch
                    valid_pred_proba = sess.run(
                        classifier.prediction_probs, feed_dict={
                            classifier.inputs: tl.prepro.pad_sequences(X_valid)
                        }
                    )[:, 1]
                    valid_pred_label = utils.proba2label(valid_pred_proba)
                    valid_auc = roc_auc_score(y_valid, valid_pred_proba)
                    valid_precision = precision_score(y_valid, valid_pred_label)
                    valid_recall = recall_score(y_valid, valid_pred_label)
                    if epoch == N_EPOCH - 1:
                        # keep the final epoch's out-of-fold predictions
                        train_pred[valid_index] = valid_pred_proba
                    print('valid: auc %.6f, precision %.6f, recall %.6f, took %s[s]'
                          % (valid_auc, valid_precision, valid_recall,
                             int(time.time() - start_time)))
                # persist weights once per fold; each fold overwrites the same
                # file, so the last fold's model wins — presumably intentional,
                # TODO confirm
                classifier.save(sess, MODEL_FILE_PATH)
            print('fold %s done!!!' % fold)
        # BUG FIX: final_train_pred was allocated but never updated;
        # accumulate the mean of per-run OOF predictions.
        final_train_pred += train_pred / config.train_times
        auc = roc_auc_score(y, train_pred)
        precision = precision_score(y, utils.proba2label(train_pred))
        recall = recall_score(y, utils.proba2label(train_pred))
        print('auc %.6f, precision %.6f, recall %.6f, took %s[s]'
              % (auc, precision, recall, int(time.time() - s_start)))