# Example script: manual preprocessing of the single-label dataset.
# Loads the data, holds out 20% for testing, then cuts, tokenizes and
# sequences the training texts and builds the label set.
import numpy as np
from sklearn.model_selection import train_test_split

from TextClassification import DataPreprocess, TextClassification, load_data

# load data
data = load_data(name='single')
x = data['evaluation']
# wrap every label in a one-element list
y = [[i] for i in data['label']]

# split train and test (20% held out; no random_state, so the split
# differs between runs)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# deal train
# ----------------------------------------
process = DataPreprocess()

# cut texts
X_train_cut = process.cut_texts(
    texts=X_train,
    need_cut=True,
    word_len=2,
    savepath=None,
)

# texts to sequence
# NOTE(review): the keyword `tokenizer_savapah` is kept exactly as the call
# was written; confirm the spelling against DataPreprocess.text2seq.
X_train_seq = process.text2seq(
    texts_cut=X_train_cut,
    tokenizer=None,
    tokenizer_savapah=None,
    num_words=500,
    maxlen=20,
    batchsize=10000,
)

# list to array
X_train_seq = np.array(X_train_seq)

# get tokenizer (read back from the preprocessor after text2seq has run)
tokenizer = process.tokenizer

# label to one-hot: start by building the label set from the training labels
label_set = process.creat_label_set(y_train)
# Example script: turn the single-label texts into padded sequences and
# into averaged word vectors before training.
import numpy as np
from sklearn.model_selection import train_test_split

from TextClassification import DataPreprocess, TextClassification, load_data

# load data
# -----------------------------------
data = load_data(name='single')
x = data['evaluation']
# wrap every label in a one-element list
y = [[i] for i in data['label']]

# data process
# -----------------------------------
process = DataPreprocess()

# cut texts
x_cut = process.cut_texts(
    texts=x,
    need_cut=True,
    word_len=2,
    savepath=None,
)

# texts to sequence
# NOTE(review): `tokenizer`, `num_words` and `maxlen` are not bound anywhere
# in this script as shown; they must be defined before this call or it
# raises NameError.
x_seq = process.text2seq(
    texts_cut=x_cut,
    tokenizer=tokenizer,
    tokenizer_savapah=None,
    num_words=num_words,
    maxlen=maxlen,
    batchsize=10000,
)

# list to array
x_seq = np.array(x_seq)

# texts to word vector
# NOTE(review): `model` is not bound in this script as shown, and the raw
# texts `x` are passed to a parameter named `texts_cut` -- presumably `x_cut`
# was intended; confirm against the text2vec implementation.
x_word_vec = model.text2vec(texts_cut=x, sg=1, size=128, window=5, min_count=1)

# texts vector: element-wise mean of each text's word vectors
# (raises ZeroDivisionError for a text with no word vectors)
x_vec = np.array([sum(i) / len(i) for i in x_word_vec])

# single target
# train model
# ------------------------------------
# Script section: configure the TensorFlow session, then load the train/test
# split from JSON cache files, creating them on the first run.
# NOTE(review): `os`, `tf`, `json`, `load_data` and `train_test_split` are
# used here but their imports are not visible in this section -- confirm
# they are imported earlier in the file.


def _read_json(path):
    """Return the object stored in the UTF-8 JSON file at *path*."""
    with open(path, "r", encoding="utf8") as handle:
        return json.load(handle)


def _write_json(obj, path):
    """Serialize *obj* as JSON into the UTF-8 file at *path*."""
    with open(path, "w", encoding="utf8") as handle:
        json.dump(obj, handle)


os.environ["CUDA_VISIBLE_DEVICES"] = "1"
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)

# Load the data and split it into training and test sets.
# NOTE(review): only x_train.json is checked; the other three cache files
# are assumed to exist alongside it.
if os.path.exists("x_train.json"):
    print("data exists.")
    x_train = _read_json("x_train.json")
    y_train = _read_json("y_train.json")
    x_test = _read_json("x_test.json")
    y_test = _read_json("y_test.json")
else:
    data = load_data()
    x = [i['fact'] for i in data]
    y = [i['accusation'] for i in data]
    # fixed random_state so the cached split is reproducible
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
    _write_json(x_train, "x_train.json")
    _write_json(x_test, "x_test.json")
    _write_json(y_train, "y_train.json")
    _write_json(y_test, "y_test.json")

##### The training process follows #####
from TextClassification import TextClassification
# Example script: train a classifier on the multi-label dataset using the
# TextClassification convenience API.
import pickle

import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

from TextClassification import TextClassification, load_data

sess = tf.InteractiveSession()

# Load the data
data_type = 'multiple'
data = load_data(data_type)
x = [i['fact'] for i in data]
y = [i['accusation'] for i in data]

# Split into training and test sets (fixed seed for a reproducible split)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

##### The training process follows #####
clf = TextClassification()
texts_seq, texts_labels = clf.get_preprocess(
    x_train,
    y_train,
    word_len=1,
    num_words=2000,
    sentence_len=50,
)
# NOTE(review): 3 and 64 are passed positionally -- presumably epochs and
# batch size; confirm against the TextClassification.fit signature.
clf.fit(texts_seq, texts_labels, data_type, 3, 64)
from TextClassification import TextClassification, DataPreprocess from sklearn.model_selection import train_test_split from TextClassification import load_data import numpy as np # load data data = load_data(name='multiple') x = [i['fact'] for i in data] y = [i['accusation'] for i in data] # split train and test X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2) model = TextClassification() # train model.fit(x=X_train, y=y_train, method='CNN', model=None, x_need_preprocess=True, y_need_preprocess=True, epochs=10, batchsize=128, output_type='multiple') # predict label_set = model.label_set y_predict = model.predict(x=X_test, x_need_preprocess=True) y_predict_label = model.label2tag(predictions=y_predict, labelset=label_set) print(