X2.append(iclasses[label]) labels.append(0) categoricals = {"匹配":1, "不匹配":0} return X1, X2, labels, categoricals X1, X2, y, classes = convert_to_pairs(X, y, classes) X1_train = X1[:-1000] X2_train = X2[:-1000] y_train = y[:-1000] X1_test = X1[-1000:] X2_test = X2[-1000:] y_test = y[-1000:] num_classes = len(classes) tokenizer = SimpleTokenizer() tokenizer.fit(X1 + X2) X1_train = tokenizer.transform(X1_train) X2_train = tokenizer.transform(X2_train) maxlen = 48 hdims = 128 epochs = 2 X1_train = sequence.pad_sequences( X1_train, maxlen=maxlen, dtype="int32", padding="post", truncating="post", value=0
from dataset import load_THUCNews_title_label
from dataset import load_weibo_senti_100k
from dataset import load_simplifyweibo_4_moods
from dataset import load_simplifyweibo_3_moods
from dataset import load_hotel_comment


# Activation function from the Transformer paper; gives a slight quality gain.
def gelu(x):
    """Exact GELU: 0.5 * x * (1 + erf(x / sqrt(2)))."""
    return 0.5 * x * (1.0 + tf.math.erf(x / tf.sqrt(2.0)))


# Load the hotel-comment dataset and make an 80/20 train/test split.
X, y, classes = load_hotel_comment()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=73672
)
# Per-class weights to counter label imbalance in the training split.
class_weight = balance_class_weight(y_train)
num_classes = len(classes)

# Vocabulary is fit on the training split only; rare tokens (<32) are dropped.
tokenizer = SimpleTokenizer(min_freq=32)
tokenizer.fit(X_train)
X_train = tokenizer.transform(X_train)

# NOTE(review): the literal 48 is immediately overwritten — find_best_maxlen
# decides the effective sequence length; confirm the constant is intentional
# as a fallback/default.
maxlen = 48
maxlen = find_best_maxlen(X_train)

# Pad/truncate at the tail to the chosen length.
X_train = sequence.pad_sequences(
    X_train, maxlen=maxlen, dtype="int32", padding="post", truncating="post", value=0
)
y_train = tf.keras.utils.to_categorical(y_train, num_classes)