Esempio n. 1
0
from TextClassification import TextClassification, DataPreprocess
from sklearn.model_selection import train_test_split
from TextClassification import load_data
import numpy as np

# Load the 'single' data set; every label is wrapped in a one-element list.
data = load_data(name='single')
x = data['evaluation']
y = [[label] for label in data['label']]

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
)

# Preprocess the training split
# ----------------------------------------
process = DataPreprocess()

# Step 1: cut the training texts.
X_train_cut = process.cut_texts(
    texts=X_train,
    need_cut=True,
    word_len=2,
    savepath=None,
)

# Step 2: convert the cut texts to sequences and store them as an array.
X_train_seq = np.array(
    process.text2seq(
        texts_cut=X_train_cut,
        tokenizer=None,
        tokenizer_savapah=None,
        num_words=500,
        maxlen=20,
        batchsize=10000,
    )
)

# Step 3: keep the tokenizer exposed by the preprocessor after text2seq ran.
tokenizer = process.tokenizer

# Step 4: build the label set from the training labels
# (first step of the label-to-one-hot conversion).
label_set = process.creat_label_set(y_train)
Esempio n. 2
0
from TextClassification import TextClassification, DataPreprocess
from sklearn.model_selection import train_test_split
from TextClassification import load_data
import numpy as np

# Load the 'single' data set: texts come from the 'evaluation' field and each
# entry of the 'label' field is wrapped in its own one-element list.
#-----------------------------------
data = load_data(name='single')
x = data['evaluation']
y = [[i] for i in data['label']]

# Data preprocessing
#-----------------------------------
process = DataPreprocess()
# Cut the raw texts.
x_cut = process.cut_texts(texts=x, need_cut=True, word_len=2, savepath=None)
# Convert the cut texts to sequences.
# NOTE(review): `tokenizer`, `num_words` and `maxlen` are not assigned anywhere
# in the visible snippet; they must be defined before this call runs.
x_seq = process.text2seq(texts_cut=x_cut, tokenizer=tokenizer, tokenizer_savapah=None,
                         num_words=num_words, maxlen=maxlen, batchsize=10000)
# Convert the resulting list to a NumPy array.
x_seq = np.array(x_seq)

# Convert texts to per-text word vectors.
# NOTE(review): `model` is not assigned in the visible snippet, and the raw
# texts `x` (not `x_cut`) are passed as `texts_cut` -- confirm both are intended.
x_word_vec = model.text2vec(texts_cut=x, sg=1, size=128, window=5, min_count=1)
# One vector per text: the mean of that text's word vectors.
x_vec = np.array([sum(i) / len(i) for i in x_word_vec])

# single target

# train model
#------------------------------------
Esempio n. 3
0
# Restrict CUDA to device index 1.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Let the GPU allocator grow on demand instead of pre-allocating all memory.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)


def _read_json(path):
    """Return the object stored in the UTF-8 JSON file at *path*."""
    with open(path, "r", encoding="utf8") as handle:
        return json.load(handle)


def _write_json(obj, path):
    """Write *obj* as JSON to *path*; the file is closed (and flushed) on exit."""
    with open(path, "w", encoding="utf8") as handle:
        json.dump(obj, handle)


# All four files must be present for the cached split to be usable.
_SPLIT_FILES = ("x_train.json", "y_train.json", "x_test.json", "y_test.json")

# Load the data and split it into training and test sets, reusing the cached
# split when every cache file exists. A partial cache is regenerated rather
# than failing on the first missing file.
if all(os.path.exists(path) for path in _SPLIT_FILES):
    print("data exists.")
    x_train = _read_json("x_train.json")
    y_train = _read_json("y_train.json")
    x_test = _read_json("x_test.json")
    y_test = _read_json("y_test.json")
else:
    data = load_data()
    x = [i['fact'] for i in data]
    y = [i['accusation'] for i in data]
    # Fixed random_state keeps the split identical across runs.
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1)
    _write_json(x_train, "x_train.json")
    _write_json(x_test, "x_test.json")
    _write_json(y_train, "y_train.json")
    _write_json(y_test, "y_test.json")

##### Training process below #####

from TextClassification import TextClassification
Esempio n. 4
0
from TextClassification import load_data
from sklearn.model_selection import train_test_split
import tensorflow as tf
import pickle
import numpy as np

sess = tf.InteractiveSession()

# Load the data set
data_type = 'multiple'
data = load_data(data_type)
x = [sample['fact'] for sample in data]
y = [sample['accusation'] for sample in data]

# Split into training and test sets (20% test, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1)

##### Training process below #####

from TextClassification import TextClassification

clf = TextClassification()

# Preprocess the training texts and labels into model inputs.
texts_seq, texts_labels = clf.get_preprocess(
    x_train, y_train, word_len=1, num_words=2000, sentence_len=50)

# The trailing positional arguments 3 and 64 are forwarded unchanged
# (presumably epochs and batch size -- confirm against TextClassification.fit).
clf.fit(texts_seq, texts_labels, data_type, 3, 64)
Esempio n. 5
0
from TextClassification import TextClassification, DataPreprocess
from sklearn.model_selection import train_test_split
from TextClassification import load_data
import numpy as np

# Load the 'multiple' data set: 'fact' texts with their 'accusation' labels.
data = load_data(name='multiple')
x = [record['fact'] for record in data]
y = [record['accusation'] for record in data]

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2)

model = TextClassification()

# Train with method='CNN', asking fit() to preprocess both texts and labels.
model.fit(
    x=X_train, y=y_train,
    method='CNN', model=None,
    x_need_preprocess=True, y_need_preprocess=True,
    epochs=10, batchsize=128,
    output_type='multiple',
)

# Predict on the held-out texts, then map the predictions back to tags
# using the label set stored on the model.
label_set = model.label_set
y_predict = model.predict(x=X_test, x_need_preprocess=True)
y_predict_label = model.label2tag(predictions=y_predict, labelset=label_set)
print(