n = 0
colors = cm.get_cmap('tab20')
means = []
comps = []
whitegames = []
for game in gamelist:
    # white = ' '.join(i for i in game.split(' ')[:openinglength*2:2])
    white = ' '.join('white' + i.replace('x', '') if j % 2 == 0
                     else 'black' + i.replace('x', '')
                     for j, i in enumerate(game.split(' ')[:40]))
    whitegames.append(white)

tokeniser = text.Tokenizer(filters='!"#$%&()*+,./:;<>?@[\\]^_`{|}~\t\n')
tokeniser.fit_on_texts(whitegames)
labels = y
idx_word = tokeniser.index_word
idx_word[0] = ''

for openinglength in range(8, 42, 2):
    n += 1
    whitegames = []
    for game in gamelist:
        # white = ' '.join(i for i in game.split(' ')[:openinglength*2:2])
        white = ' '.join('white' + i.replace('x', '') if j % 2 == 0
                         else 'black' + i.replace('x', '')
                         for j, i in enumerate(game.split(' ')[:openinglength]))
max_features = 20000
maxlen = 100

train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
train = train.sample(frac=1)

list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_classes = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

def get_model():
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(50, activation="relu")(x)
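    # assumed continuation (a sketch, not necessarily the author's original):
    # a typical head for this kind of kernel is one sigmoid unit per label
    x = Dropout(0.1)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model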
def text_preprocess():
    print('read data...')
    train_ori, test_ori = _read_data('train.pd', 'test.pd')

    print('remove patterns...')
    train_ori['text_list'] = train_ori['text_list'].apply(
        lambda lst: _remove_pattern_2(lst))
    test_ori['text_list'] = test_ori['text_list'].apply(
        lambda lst: _remove_pattern_2(lst))

    print('shuffle...')
    # train_ori.sample(frac=1).reset_index(drop=True)
    # test_ori.sample(frac=1).reset_index(drop=True)
    train_ori = train_ori.iloc[np.random.permutation(len(train_ori))]  # shuffle manually
    test_ori = test_ori.iloc[np.random.permutation(len(test_ori))]

    print('join text list...')
    train_text = train_ori['text_list'].apply(lambda lst: " ".join(lst))
    test_text = test_ori['text_list'].apply(lambda lst: " ".join(lst))
    # train_text = train_ori['text_list']
    # test_text = test_ori['text_list']

    print('prepare labels...')
    Y_train = train_ori['label'].apply(lambda gender: 1 if gender == 'male' else 0)
    Y_test = test_ori['label'].apply(lambda gender: 1 if gender == 'male' else 0)

    print('prepare tokenizer')
    tokenizer = text.Tokenizer(num_words=MAX_VOCAB_SIZE)  # maximum vocabulary size
    tokenizer.fit_on_texts(
        list(train_text) + list(test_text)
    )  # updates the internal vocabulary from a list of texts

    # train_seq = tokenizer.texts_to_sequences(train_text)  # transforms each text into a sequence of integers
    # test_seq = tokenizer.texts_to_sequences(test_text)
    # X_train = sequence.pad_sequences(train_seq, maxlen=MAXLEN)  # pads each sequence to the same length
    # X_test = sequence.pad_sequences(test_seq, maxlen=MAXLEN)
    # print(X_train.shape)
    # print(X_test.shape)

    print('texts to sequences...')
    train_ori['seq'] = train_ori['text_list'].apply(
        lambda lst: tokenizer.texts_to_sequences(lst))
    test_ori['seq'] = test_ori['text_list'].apply(
        lambda lst: tokenizer.texts_to_sequences(lst))

    print('pad sequences...')
    train_ori['seq'] = train_ori['seq'].apply(
        lambda lst: sequence.pad_sequences(lst, maxlen=MAXLEN))
    test_ori['seq'] = test_ori['seq'].apply(
        lambda lst: sequence.pad_sequences(lst, maxlen=MAXLEN))
    # X_train = sequence.pad_sequences(train_ori['seq'], maxlen=MAXLEN)
    # X_test = sequence.pad_sequences(test_ori['seq'], maxlen=MAXLEN)

    print('fit to numpy...')
    X_train = np.array(list(train_ori['seq']))
    X_test = np.array(list(test_ori['seq']))
    print(X_train.shape)
    print(X_test.shape)

    return X_train, Y_train, X_test, Y_test, tokenizer
x_train = generate_new_comments(comments_train)
y_train = traindata['target']
y_train = np.rint(np.array(y_train))
y_aux_train = traindata[[
    'target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat'
]]
x_test = generate_new_comments(comments_test)
y_test = testdata['toxicity']
y_test = np.rint(np.array(y_test))

# Tokenize sentences and index the words, padding each sequence to the same length 220
MAX_LEN = 220
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(x_train + x_test)
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)
print(len(tokenizer.word_index))

# build GloVe dictionary
def load_emb_data(emb_path):
    emb = {}
    with open(emb_path, 'r', encoding='UTF-8') as fin:
        while True:
            line = fin.readline()
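            # assumed continuation (sketch; the original breaks off here):
            # parse "word v1 ... vN" lines until EOF, skipping header or
            # malformed lines, as the other loaders in this section do
            if not line:
                break
            values = line.rstrip().split(' ')
            if len(values) > 2:
                emb[values[0]] = np.asarray(values[1:], dtype='float32')
    return emb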
"review/time", "review/summary", "review/text" ] df = pd.read_csv("data/finemuged.csv", encoding='latin1', header=None, names=colnames, quotechar="\"").sample(100000) def one_hot(x, maxi=5): arr = [0] * maxi arr[x - 1] = 1 return arr t = text.Tokenizer(10000) X = df["review/text"].values t.fit_on_texts(X) X = t.texts_to_sequences(X) X = sequence.pad_sequences(X, value=0, padding='post', maxlen=256) y = df["review/score"].astype(int).values.reshape(-1) - 1 y = np.eye(5)[y] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3) # print(X_train.shape, y_train.shape) # print(X_test.shape, y_test.shape) # exit(0) sequence_length = 256 vocabulary_size = 10000 embedding_dim = 300
def preprocess_data(max_features=100000, maxlen=200, embed_size=300):
    # load and clean data
    train = pd.read_csv(TRAIN_FILE)
    train = train.drop(['id'], axis=1)
    train['tags'] = train['tags'].astype(str)
    train['article'] = train['article'].str.replace(
        '</p>|<p>|\r|\n|<br>|</p>|<pre>|</pre>|<code>|</code>', '')
    train['combined'] = train['title'] + ' ' + train['article']
    train.drop(['title', 'article'], axis=1, inplace=True)
    lst = [x.split(',') for x in train['tags'].str.replace('|', ',').tolist()]

    # one-hot encode the labels (multi-label)
    mlb = MultiLabelBinarizer(sparse_output=True)
    y = mlb.fit_transform(lst)
    with open('multi_label_binarizer.pickle', 'wb') as handle:
        pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("binarizer saved")
    del lst

    test = pd.read_csv(TEST_FILE)
    test = test.drop(['id'], axis=1)
    test['article'] = test['article'].str.replace(
        '</p>|<p>|\r|\n|<br>|</p>|<pre>|</pre>|<code>|</code>', '')
    test['combined'] = test['title'] + ' ' + test['article']
    test.drop(['title', 'article'], axis=1, inplace=True)

    X_train = train["combined"].fillna("fillna").values
    y_train = y  # train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
    del y
    X_test = test["combined"].fillna("fillna").values

    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("tokenizer saved")
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)
    x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    gc.collect()
    del X_train, X_test

    embeddings_index = dict(
        get_coefs(*o.rstrip().rsplit(' '))
        for o in open(EMBEDDING_FILE, encoding="utf8"))
    all_embs = np.stack(embeddings_index.values())
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    del all_embs
    gc.collect()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    print('preprocessing done')
    return embedding_matrix, x_train, y_train, x_test, mlb
from keras.layers.recurrent import LSTM, GRU
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import Merge
from keras.layers import TimeDistributed, Lambda
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.layers.advanced_activations import PReLU
from keras.preprocessing import sequence, text
from keras.layers import SpatialDropout1D

data = pd.read_csv('train.csv')
y = data.is_duplicate.values

tk = text.Tokenizer(num_words=200000)
max_len = 40
tk.fit_on_texts(
    list(data.question1.values) + list(data.question2.values.astype(str)))
x1 = tk.texts_to_sequences(data.question1.values)
x1 = sequence.pad_sequences(x1, maxlen=max_len)
x2 = tk.texts_to_sequences(data.question2.values.astype(str))
x2 = sequence.pad_sequences(x2, maxlen=max_len)

# Dividing the dataset into train and test splits; both calls must use the
# same seed, otherwise the question pairs and labels end up misaligned
from sklearn.model_selection import train_test_split
x1_train, x1_test, y_train, y_test = train_test_split(x1, y, test_size=0.2,
                                                      random_state=42)
x2_train, x2_test, _, _ = train_test_split(x2, y, test_size=0.2,
                                           random_state=42)
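# Design-note sketch: sklearn's train_test_split accepts several arrays and
# shuffles them with one shared permutation, so the question pairs and labels
# stay aligned by construction (an equivalent alternative to fixing the seed):
x1_train, x1_test, x2_train, x2_test, y_train, y_test = train_test_split(
    x1, x2, y, test_size=0.2)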
def index():
    if request.method == 'GET':
        return render_template("index.html")
    elif request.method == 'POST':
        model = load_model(modelPath, custom_objects={'Attention': Attention})
        search_text = request.form['query']
        try:
            response = es_client.search(index=ES_settings['ES_index'],
                                        body={
                                            "query": {
                                                "match": {
                                                    "doc": search_text
                                                }
                                            },
                                            "size": 100
                                        })
        except Exception:
            jsonresponse = []
            return render_template("index.html")
        if response['timed_out']:
            jsonresponse = []
            return render_template("index.html")
        else:
            search_results = response['hits']['hits']
            jsonresponse = []
            urllist = []
            for X in search_results:
                plaintext = X['_source']['doc']
                url = X['_source']['url']
                tok_sentence = sent_tokenize(plaintext)
                jsonresponse += tok_sentence
                tok_sentence_length = len(tok_sentence)
                urllist += tok_sentence_length * [url]

            max_features = 100000
            maxlen = 150
            embed_size = 300
            tok = text.Tokenizer(num_words=max_features, lower=True)
            total_sentence = len(jsonresponse)
            tok.fit_on_texts(jsonresponse + [search_text] * total_sentence)
            word_index = tok.word_index
            X_test = tok.texts_to_sequences(jsonresponse)
            topic_test = tok.texts_to_sequences([search_text] * total_sentence)
            X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
            topic_test = sequence.pad_sequences(topic_test, maxlen=maxlen)
            del tok
            gc.collect()

            embedding_matrix = build_matrix(word_index, embeddings_index)
            y_pred = model.predict([X_test, topic_test], verbose=1, batch_size=512)
            K.clear_session()

            idx = np.arange(len(y_pred))
            y_pred = np.c_[idx, y_pred]
            ratings = pd.DataFrame({
                'index': y_pred[:, 0],
                'no_Argument': y_pred[:, 1],
                'Argument_for': y_pred[:, 2],
                'Argument_against': y_pred[:, 3]
            })
            ratings['res'] = ratings.apply(lambda x: compare(
                x['no_Argument'], x['Argument_for'], x['Argument_against']),
                                           axis=1)

            sortedforratings = ratings[ratings['res'] == 'Argument_for'].sort_values(
                by='Argument_for', ascending=False)
            sortedforratings['text'] = sortedforratings.apply(
                lambda x: jsonresponse[int(x['index'])], axis=1)
            sortedforratings['link'] = sortedforratings.apply(
                lambda x: urllist[int(x['index'])], axis=1)
            sortedforratings['Argument_for'] = sortedforratings[
                'Argument_for'].apply(lambda x: truncted_float(x))

            sortedAgainstRatings = ratings[
                ratings['res'] == 'Argument_against'].sort_values(
                    by='Argument_against', ascending=False)
            sortedAgainstRatings['Argument_against'] = sortedAgainstRatings[
                'Argument_against'].apply(lambda x: truncted_float(x))
            sortedAgainstRatings['text'] = sortedAgainstRatings.apply(
                lambda x: jsonresponse[int(x['index'])], axis=1)
            sortedAgainstRatings['link'] = sortedAgainstRatings.apply(
                lambda x: urllist[int(x['index'])], axis=1)

            return render_template("result.html",
                                   for_output=sortedforratings,
                                   against_output=sortedAgainstRatings)
def create_train_data(self):
    # read inputs and labels
    train_comments = self.train_df["comment_text"].astype(str)
    train_label = self.train_df["target"].values
    train_type_labels = self.train_df[self.toxicity_type_list].values

    # raw identity values
    train_identity_values = self.train_df[self.identity_list].fillna(0.).values
    # sum of all raw identity values
    train_identity_sum = train_identity_values.sum(axis=1)
    # cap the identity sum at 1 (sigmoid-style target)
    train_identity_sum_label = np.where(train_identity_sum > 1, 1,
                                        train_identity_sum)
    # binarized identity values
    train_identity_binary = copy.deepcopy(self.train_df[self.identity_list])
    for column in self.identity_list:
        train_identity_binary[column] = np.where(
            train_identity_binary[column] > 0.5, 1, 0)
    # 1 if any single identity is flagged
    train_identity_binary_sum = train_identity_binary.sum(axis=1)
    train_identity_or_binary = np.where(train_identity_binary_sum >= 1, 1, 0)
    # all identity labels
    train_identity_type_labels = train_identity_values
    train_identity_type_binary_lables = train_identity_binary
    train_identity_binary_label = train_identity_or_binary

    test_comments = self.test_df["comment_text"].astype(str)

    # fit the tokenizer; note that Tokenizer's `filters` is a string of
    # characters to strip, so self.stopwords is treated character-wise here
    tokenizer = text.Tokenizer(filters=self.stopwords)
    tokenizer.fit_on_texts(
        list(train_comments) + list(test_comments)
    )  # train_comments is a DataFrame column (a Series); list() turns it into a plain list

    # tokenization (texts_to_sequences accepts a Series as well as a list)
    train_tokens = tokenizer.texts_to_sequences(train_comments)
    test_tokens = tokenizer.texts_to_sequences(test_comments)
    # pad to fixed length with the sequence module
    train_tokens = sequence.pad_sequences(train_tokens, maxlen=self.max_len)
    test_tokens = sequence.pad_sequences(test_tokens, maxlen=self.max_len)

    # split into training and validation sets
    valid_tokens = train_tokens[self.train_len:]
    valid_label = train_label[self.train_len:]
    valid_type_labels = train_type_labels[self.train_len:]
    train_tokens = train_tokens[:self.train_len]
    train_label = train_label[:self.train_len]
    train_type_labels = train_type_labels[:self.train_len]

    # split the identity labels
    valid_identity_type_labels = train_identity_type_labels[self.train_len:]
    train_identity_type_labels = train_identity_type_labels[:self.train_len]
    valid_identity_type_binary_lables = train_identity_type_binary_lables[self.train_len:]
    train_identity_type_binary_lables = train_identity_type_binary_lables[:self.train_len]
    valid_identity_sum_label = train_identity_sum_label[self.train_len:]
    train_identity_sum_label = train_identity_sum_label[:self.train_len]
    valid_identity_binary_label = train_identity_binary_label[self.train_len:]
    train_identity_binary_label = train_identity_binary_label[:self.train_len]

    # assemble the dataset
    dataset = {
        "train_tokens": train_tokens,
        "train_label": train_label,
        "train_type_labels": train_type_labels,
        "valid_tokens": valid_tokens,
        "valid_label": valid_label,
        "valid_type_labels": valid_type_labels,
        "test_tokens": test_tokens,
        "tokenizer": tokenizer,
        "valid_identity_type_labels": valid_identity_type_labels,
        "train_identity_type_labels": train_identity_type_labels,
        "valid_identity_type_binary_lables": valid_identity_type_binary_lables,
        "train_identity_type_binary_lables": train_identity_type_binary_lables,
        "valid_identity_sum_label": valid_identity_sum_label,
        "train_identity_sum_label": train_identity_sum_label,
        "valid_identity_binary_label": valid_identity_binary_label,
        "train_identity_binary_label": train_identity_binary_label
    }
    return dataset
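# Caveat sketch (illustrative, not part of the class above): Tokenizer's
# `filters` argument names characters to strip, not stopwords, so joining
# stopwords into a filter string deletes every character they contain.
from keras.preprocessing import text

demo = text.Tokenizer(filters='aeiou')  # strips the characters a, e, i, o, u
demo.fit_on_texts(["the cat and the hat"])
print(demo.word_index)  # roughly {'th': 1, 't': 2, 'c': 3, 'nd': 4, 'h': 5}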
def trainModel(model,
               inputs,
               vocabSize,
               ansLen=100,
               batch_size=32,
               epochs=200,
               validation_split=0.2,
               fileName="summarizer",
               cv=False,
               es=False,
               halfInpCV=False):
    """
    Trains the model
    * inputs should be a list of answer groups. Each answer group should be
      represented by a list of strings where the first string is the accepted
      answer

    returns: model and tokenizer
    """
    print("Creating Tokenizer")
    tok = text.Tokenizer(vocabSize - 1, lower=False, oov_token="UNK")
    tok.fit_on_texts(ans for ansGroup in inputs for ans in ansGroup)
    fout = open(fileName + '.tok', 'wb')
    pickle.dump(tok.to_json(), fout)
    fout.close()

    print("Preparing training data")
    inputAns = []
    outputAns = []
    for ansGroup in tqdm(inputs):
        numAnswers = len(ansGroup) - 1
        tokAns = tok.texts_to_sequences(ansGroup)
        # restrict to 100 tokens from each answer and concatenate input answers together
        inp = [w for seq in tokAns[1:] for w in seq[:ansLen]]
        outp = tokAns[0][:ansLen]
        inputAns.append(inp)
        outputAns.append(outp)

    print("Padding/trimming inputs")
    # note: numAnswers holds the value from the last loop iteration, so this
    # implicitly assumes every answer group has the same number of answers
    inputAns = sequence.pad_sequences(inputAns,
                                      maxlen=ansLen * numAnswers,
                                      padding="post",
                                      truncating="post")
    outputAns = sequence.pad_sequences(outputAns,
                                       maxlen=ansLen,
                                       padding="post",
                                       truncating="post")

    def f(i):
        x = [0] * vocabSize
        x[i] = 1
        return x

    print("Finalizing training output")
    outNP = zeros((len(outputAns), ansLen, vocabSize))
    for i, doc in enumerate(tqdm(outputAns)):
        for j, word in enumerate(doc):
            outNP[i][j][word] = 1
    outputAns = outNP

    if (cv):
        print("Performing Cross-Validation")
        factor = 2 if halfInpCV else 1
        scores = crossValidate(model.to_json(),
                               inputAns[:int(len(inputAns) / factor)],
                               outputAns[:int(len(outputAns) / factor)])
        print(scores)
        scoreFile = open(fileName + ".cvscores", "wb")
        pickle.dump(scores, scoreFile)
        scoreFile.close()

    print("Training Model")
    callbacks = [
        ModelCheckpoint(fileName + "{epoch:02d}_{loss:.2f}_{val_loss:.2f}.model",
                        verbose=1,
                        period=5)
    ]
    if (es):
        callbacks.append(
            EarlyStopping(monitor="val_loss",
                          patience=2,
                          verbose=1,
                          mode="min",
                          restore_best_weights=True))
    model.fit(inputAns,
              outputAns,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_split=validation_split,
              callbacks=callbacks)
    return model, tok
    return model

train_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

x_train = train_df[TEXT_COLUMN].astype(str)
y_train = train_df[TARGET_COLUMN].values
y_aux_train = train_df[AUX_COLUMNS].values
x_test = test_df[TEXT_COLUMN].astype(str)

for column in IDENTITY_COLUMNS + [TARGET_COLUMN]:
    train_df[column] = np.where(train_df[column] >= 0.5, True, False)

tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE, lower=False)
tokenizer.fit_on_texts(list(x_train) + list(x_test))

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

sample_weights = np.ones(len(x_train), dtype=np.float32)
sample_weights += train_df[IDENTITY_COLUMNS].sum(axis=1)
sample_weights += train_df[TARGET_COLUMN] * (~train_df[IDENTITY_COLUMNS]).sum(axis=1)
sample_weights += (~train_df[TARGET_COLUMN]) * train_df[IDENTITY_COLUMNS].sum(axis=1) * 5
sample_weights /= sample_weights.mean()

embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)
def create_tokenizer(lines):
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
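# Minimal usage sketch for the helper above (toy corpus, illustrative names):
from keras.preprocessing import sequence

lines = ["deep learning is fun", "learning keras tokenizers"]
tok = create_tokenizer(lines)
print(tok.word_index)                 # e.g. {'learning': 1, 'deep': 2, ...}
seqs = tok.texts_to_sequences(lines)  # words become integer indices
padded = sequence.pad_sequences(seqs, maxlen=6)  # zero-padded on the left to length 6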
(X_train, y_train), (X_test, y_test) = imdb.load_imdb()

# set parameters:
vocab_size = 1000
maxlen = 300
batch_size = 32
embedding_dims = 50
filters = 10
kernel_size = 3
hidden_dims = 10
epochs = 10

# Tokenize, i.e. convert each review into a sequence of word indices
# (texts_to_sequences rather than texts_to_matrix, since the Embedding
# layer below expects integer index sequences, not bag-of-words rows)
tokenizer = text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Padding
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

# Building model (Embedding + CNN 1D + LSTM)
model = Sequential()
model.add(Embedding(vocab_size, embedding_dims, input_length=maxlen))
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu'))
model.add(MaxPooling1D())  # to down-sample an input representation
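# Side-by-side sketch of the two Tokenizer output shapes (toy data, assumed
# names): texts_to_sequences yields variable-length index lists suitable for
# an Embedding layer; texts_to_matrix yields fixed-width bag-of-words rows.
toy = text.Tokenizer(num_words=10)
toy.fit_on_texts(["good movie", "bad movie"])
print(toy.texts_to_sequences(["good movie"]))     # [[2, 1]], i.e. word indices
print(toy.texts_to_matrix(["good movie"]).shape)  # (1, 10), a 0/1 bag-of-words row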
train_text = []
for t in train['comment_text'].fillna('foobar'):
    t = pre_process(t)
    train_text.append(t)
train_text = pd.Series(train_text).astype(str)

test_text = []
for t in test['comment_text'].fillna('foobar'):
    t = pre_process(t)
    test_text.append(t)
test_text = pd.Series(test_text).astype(str)

dict_size = 50000
max_len = 200

tokenizer = text.Tokenizer(num_words=dict_size)
tokenizer.fit_on_texts(list(train_text) + list(test_text))
train_seq = tokenizer.texts_to_sequences(train_text)
test_seq = tokenizer.texts_to_sequences(test_text)

train_X = sequence.pad_sequences(train_seq, maxlen=max_len)
train_Y = train[list(train)[2:]].values  # list to get column names
test_X = sequence.pad_sequences(test_seq, maxlen=max_len)

# Persistence
pd.DataFrame(train_X).to_csv(path_prefix + 'train_X.csv', header=False, index=False)
pd.DataFrame(test_X).to_csv(path_prefix + 'test_X.csv', header=False, index=False)
print('Finished!')

# The model
print("Building and Training a model ...", end=' ')
embed_size = 100
torch.backends.cudnn.deterministic = True

corpus = []
y_train = []
with open('OLID/olid-training-v1.0.tsv', 'r', encoding='UTF-8') as f:
    for i in f:
        if len(corpus) == 10:
            print(corpus[-1])  # debug peek at one example
        a = i.split('\t')
        corpus.append(a[1].lower())
        if a[2] == "OFF":
            y_train.append(1)
        else:
            y_train.append(0)
# print(corpus[1:4])

tokenizer = text.Tokenizer(num_words=1000)
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
x_train = tokenizer.texts_to_sequences(corpus)
x_train = sequence.pad_sequences(x_train, maxlen=30)

EMBEDDING_FILE = glove_para
embeddings_index = {}
for i, line in enumerate(open(EMBEDDING_FILE, encoding="utf-8")):
    val = line.split()
    embeddings_index[val[0]] = np.asarray(val[1:], dtype='float32')
def generateOOVEmbeddings():
    # read the (DL cleaned) dataset and build the vocabulary
    print('loading dataframes...')
    train_df = pd.read_csv('../data/training/train2.cleaned.dl.csv')
    test_df = pd.read_csv('../data/eval/test2.cleaned.dl.csv')

    # ps: forget memory and runtime, it's python here :D
    list_sentences_train = train_df["comment_text"].values
    list_sentences_test = test_df["comment_text"].values
    list_sentences_all = np.concatenate(
        [list_sentences_train, list_sentences_test])

    tokenizer = text.Tokenizer(num_words=400000)
    tokenizer.fit_on_texts(list(list_sentences_all))
    print('word_index size:', len(tokenizer.word_index), 'words')
    word_index = tokenizer.word_index

    # each embedding file: (name, path, has a header line to skip, OOV output)
    embedding_files = [
        ('fastText', '/mnt/data/wikipedia/embeddings/crawl-300d-2M.vec',
         True, '../data/training/oov-fastText.txt'),
        ('gloves', '/mnt/data/wikipedia/embeddings/glove.840B.300d.txt',
         False, '../data/training/oov-gloves.txt'),
        ('word2vec', '/mnt/data/wikipedia/embeddings/GoogleNews-vectors-negative300.vec',
         True, '../data/training/oov-w2v.txt'),
        ('numberbatch', '/mnt/data/wikipedia/embeddings/numberbatch-en-17.06.txt',
         True, '../data/training/oov-numberbatch.txt'),
    ]

    for name, path, skip_header, oov_path in embedding_files:
        # load the embedding file, keeping only the words
        print('loading', name, 'embeddings...')
        voc = set()
        with open(path) as f:
            if skip_header:
                next(f)  # skip the "<count> <dim>" header line
            for line in f:
                values = line.split()
                word = ' '.join(values[:-300])
                voc.add(word)
        print(name, 'embeddings:', len(voc), 'words')

        oov = [tokenStr for tokenStr in word_index if tokenStr not in voc]
        print(name, 'embeddings:', len(oov), 'out-of-vocabulary')

        with open(oov_path, "w") as oovFile:
            for w in oov:
                oovFile.write(w)
                oovFile.write('\n')
def train():
    all_data = np.load('./temp/train_data.npy')
    all_label = np.load('./temp/label_v.npy')

    ##### generate labels #####
    print('label generate start')
    tokenizer = text.Tokenizer(filters='\n')
    tokenizer.fit_on_texts(all_label)
    all_label = tokenizer.texts_to_sequences(all_label)
    all_label = np.array(all_label, dtype='int')
    all_label = all_label - 1
    all_label = np_utils.to_categorical(all_label)
    print('label generate end')
    ##### generate labels #####

    ##### shuffle #####
    all_data = all_data.reshape(len(all_data), width * 2, n_fil, 1)
    index = list(range(0, len(all_data)))
    np.random.seed(1024)
    np.random.shuffle(index)
    all_data = all_data[index]
    all_label = all_label[index]
    ##### shuffle #####

    ##### model structure #####
    ##### layer 1 #####
    model = Sequential()
    model.add(
        Conv2D(filters=48,
               kernel_size=(3, 3),
               input_shape=(width * 2, n_fil, 1),
               padding='same'))
    model.add(Activation('relu'))
    model.add(Conv2D(filters=48, kernel_size=(3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Conv2D(filters=48, kernel_size=(3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    ##### layer 1 #####

    ##### layer 2 #####
    model.add(Conv2D(filters=96, kernel_size=(3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Conv2D(filters=96, kernel_size=(3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Dropout(rate=0.3))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    ##### layer 2 #####

    ##### layer 3 #####
    model.add(Conv2D(filters=192, kernel_size=(3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Conv2D(filters=192, kernel_size=(3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Dropout(rate=0.3))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    ##### layer 3 #####

    model.add(Flatten())
    model.add(Dense(1024))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(512))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(41, activation='softmax'))
    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
    ##### model structure #####

    history = model.fit(all_data,
                        all_label,
                        batch_size=200,
                        epochs=15,
                        verbose=1,
                        validation_split=0.05,
                        shuffle=True,
                        initial_epoch=0)
    model.save('reproduce.h5')

    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
def create_dataloader(self):
    # read inputs and labels
    train_comments = self.train_df["comment_text"].astype(str)
    train_label = self.train_df["target"].values
    train_type_labels = self.train_df[self.toxicity_type_list].values

    # new negative/positive (np) auxiliary tasks
    train_np_labels = np.zeros((len(self.train_df), 4))
    train_np_identity_labels = np.zeros(
        (len(self.train_df), len(self.identity_list) * 4))
    train_df_copy = self.train_df[self.identity_list + ["target"]]
    for column in self.identity_list + ["target"]:
        train_df_copy[column] = np.where(train_df_copy[column] > 0.5, True, False)
    pp_label_bool = train_df_copy["target"] & np.where(
        train_df_copy[self.identity_list].sum(axis=1) > 0, True, False)
    np_label_bool = ~train_df_copy["target"] & np.where(
        train_df_copy[self.identity_list].sum(axis=1) > 0, True, False)
    pn_label_bool = train_df_copy["target"] & np.where(
        (train_df_copy[self.identity_list]).sum(axis=1) == 0, True, False)
    nn_label_bool = ~train_df_copy["target"] & np.where(
        (train_df_copy[self.identity_list]).sum(axis=1) == 0, True, False)
    train_np_labels[:, 0] = np.where(pp_label_bool > 0, 1, 0)
    train_np_labels[:, 1] = np.where(np_label_bool > 0, 1, 0)
    train_np_labels[:, 2] = np.where(pn_label_bool > 0, 1, 0)
    train_np_labels[:, 3] = np.where(nn_label_bool > 0, 1, 0)
    for i, column in enumerate(self.identity_list):
        pp_label_bool = train_df_copy["target"] & train_df_copy[column]
        np_label_bool = ~train_df_copy["target"] & train_df_copy[column]
        pn_label_bool = train_df_copy["target"] & (~train_df_copy[column])
        nn_label_bool = ~train_df_copy["target"] & (~train_df_copy[column])
        train_np_identity_labels[:, i * 4 + 0] = np.where(pp_label_bool > 0, 1, 0)
        train_np_identity_labels[:, i * 4 + 1] = np.where(np_label_bool > 0, 1, 0)
        train_np_identity_labels[:, i * 4 + 2] = np.where(pn_label_bool > 0, 1, 0)
        train_np_identity_labels[:, i * 4 + 3] = np.where(nn_label_bool > 0, 1, 0)

    # raw identity values
    train_identity_values = self.train_df[self.identity_list].fillna(0.).values
    # sum of all raw identity values
    train_identity_sum = train_identity_values.sum(axis=1)
    # cap the identity sum at 1 (sigmoid-style target)
    train_identity_sum_label = np.where(train_identity_sum > 1, 1,
                                        train_identity_sum)
    # binarized identity values
    train_identity_binary = copy.deepcopy(self.train_df[self.identity_list])
    for column in self.identity_list:
        train_identity_binary[column] = np.where(
            train_identity_binary[column] > 0.5, 1, 0)
    # 1 if any single identity is flagged
    train_identity_binary_sum = train_identity_binary.sum(axis=1)
    train_identity_or_binary = np.where(train_identity_binary_sum >= 1, 1, 0)
    # all identity labels
    train_identity_type_labels = train_identity_values
    train_identity_type_binary_lables = train_identity_binary
    train_identity_binary_label = train_identity_or_binary

    # fit the tokenizer (see the note on `filters` above: it strips characters)
    test_comments = self.test_df["comment_text"].astype(str)
    tokenizer = text.Tokenizer(filters=self.stopwords)
    tokenizer.fit_on_texts(
        list(train_comments) + list(test_comments)
    )  # train_comments is a DataFrame column (a Series); list() turns it into a plain list

    # tokenization (texts_to_sequences accepts a Series as well as a list)
    train_tokens = tokenizer.texts_to_sequences(train_comments)
    test_tokens = tokenizer.texts_to_sequences(test_comments)
    # pad to fixed length with the sequence module
    train_tokens = sequence.pad_sequences(train_tokens, maxlen=self.max_len)
    test_tokens = sequence.pad_sequences(test_tokens, maxlen=self.max_len)

    # split into training and validation sets
    valid_tokens = train_tokens[self.train_len:]
    valid_label = train_label[self.train_len:]
    valid_type_labels = train_type_labels[self.train_len:]
    train_tokens = train_tokens[:self.train_len]
    train_label = train_label[:self.train_len]
    train_type_labels = train_type_labels[:self.train_len]
    valid_identity_type_labels = train_identity_type_labels[self.train_len:]
    train_identity_type_labels = train_identity_type_labels[:self.train_len]
    valid_identity_type_binary_lables = train_identity_type_binary_lables[self.train_len:]
    train_identity_type_binary_lables = train_identity_type_binary_lables[:self.train_len]
    valid_identity_sum_label = train_identity_sum_label[self.train_len:]
    train_identity_sum_label = train_identity_sum_label[:self.train_len]
    valid_identity_binary_label = train_identity_binary_label[self.train_len:]
    train_identity_binary_label = train_identity_binary_label[:self.train_len]
    valid_np_labels = train_np_labels[self.train_len:]
    train_np_labels = train_np_labels[:self.train_len]
    valid_np_identity_labels = train_np_identity_labels[self.train_len:]
    train_np_identity_labels = train_np_identity_labels[:self.train_len]

    # compute sample weights
    target_weight, aux_weight, identity_weight, np_weight, np_identity_weight = \
        self.cal_sample_weights()  # train_np_labels  # train_np_identity_labels

    # convert the tokenized data to tensors
    train_x_tensor = torch.tensor(train_tokens, dtype=torch.long)
    valid_x_tensor = torch.tensor(valid_tokens, dtype=torch.long)
    train_y_tensor = torch.tensor(np.hstack([
        train_label[:, np.newaxis], train_type_labels,
        train_identity_type_labels, train_np_labels
    ]), dtype=torch.float32)
    valid_y_tensor = torch.tensor(np.hstack([
        valid_label[:, np.newaxis], valid_type_labels,
        valid_identity_type_labels, valid_np_labels
    ]), dtype=torch.float32)
    target_weight_tensor = torch.tensor(target_weight, dtype=torch.float32)
    aux_weight_tensor = torch.tensor(aux_weight, dtype=torch.float32)
    identity_weight_tensor = torch.tensor(identity_weight, dtype=torch.float32)
    np_weight_tensor = torch.tensor(np_weight, dtype=torch.float32)
    np_identity_weight_tensor = torch.tensor(np_identity_weight, dtype=torch.float32)
    if torch.cuda.is_available():
        train_x_tensor = train_x_tensor.cuda()
        valid_x_tensor = valid_x_tensor.cuda()
        train_y_tensor = train_y_tensor.cuda()
        valid_y_tensor = valid_y_tensor.cuda()
        target_weight_tensor = target_weight_tensor.cuda()
        aux_weight_tensor = aux_weight_tensor.cuda()
        identity_weight_tensor = identity_weight_tensor.cuda()
        np_weight_tensor = np_weight_tensor.cuda()
        np_identity_weight_tensor = np_identity_weight_tensor.cuda()

    # wrap the tensors in a dataset; data and labels stay aligned, so when the
    # dataloader yields a batch, dataset[:-1] is x and dataset[-1] is y
    train_dataset = data.TensorDataset(train_x_tensor, train_y_tensor,
                                       target_weight_tensor, aux_weight_tensor,
                                       identity_weight_tensor, np_weight_tensor,
                                       np_identity_weight_tensor)
    valid_dataset = data.TensorDataset(valid_x_tensor, valid_y_tensor)

    # wrap the datasets in dataloaders
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=self.batch_size,
                                               shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=self.batch_size,
                                               shuffle=False)

    # return the training data
    return train_loader, valid_loader, tokenizer
def training():
    # one epoch is one full forward and backward pass of the entire dataset
    epochs = 1000
    # total number of training examples present in a single batch
    batch_size = 32
    training_percentage = 0.8

    # file paths
    file_path = os.path.dirname(os.path.abspath(inspect.stack()[0][1]))
    savings = file_path + '\\savings\\'
    dataset = file_path + '\\dataset\\'
    tokenizer_file = os.path.join(savings, 't.pickle')
    encoder_file = os.path.join(savings, 'e.pickle')
    class_file = os.path.join(savings, 'c.pickle')
    model_file = os.path.join(savings, 'm.h5')
    dataset_file = os.path.join(dataset, 'Youtube01-Psy.csv')

    # create the savings folder if it does not exist
    os.makedirs(os.path.dirname(tokenizer_file), exist_ok=True)
    removeFile(tokenizer_file)
    removeFile(encoder_file)
    removeFile(class_file)
    removeFile(model_file)

    data = pd.read_csv(dataset_file)
    print(data.head())
    training_size = int(len(data) * training_percentage)
    train_content = data['CONTENT'][:training_size]
    train_class = data['CLASS'][:training_size]
    test_content = data['CONTENT'][training_size:]
    test_class = data['CLASS'][training_size:]

    number_words_dataset = countingWords(train_content)
    tokenize = text.Tokenizer(num_words=number_words_dataset, char_level=False)
    tokenize.fit_on_texts(train_content)

    # tf-idf
    x_train = tokenize.texts_to_matrix(train_content, mode='tfidf')
    x_test = tokenize.texts_to_matrix(test_content, mode='tfidf')
    with open(tokenizer_file, 'wb') as handle:
        pickle.dump(tokenize, handle, protocol=pickle.HIGHEST_PROTOCOL)

    encoder = LabelEncoder()
    encoder.fit(train_class)
    y_train = encoder.transform(train_class)
    y_test = encoder.transform(test_class)
    with open(encoder_file, 'wb') as handle:
        pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

    num_classes = np.max(y_train) + 1
    with open(class_file, 'wb') as handle:
        pickle.dump(num_classes, handle)
    y_train = utils.to_categorical(y_train, num_classes)
    y_test = utils.to_categorical(y_test, num_classes)

    #############################################################################################
    model = Sequential()
    model.add(
        Dense(num_classes * 8,
              input_shape=(number_words_dataset, ),
              activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes * 4, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(num_classes * 2, activation='relu'))
    model.add(Dropout(0.2))
    # output layer
    model.add(Dense(num_classes, activation='softmax'))
    #############################################################################################

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])
    # model.compile(loss='categorical_crossentropy',
    #               optimizer='adam',
    #               metrics=['accuracy'])

    stopper = keras.callbacks.EarlyStopping(monitor='val_loss',
                                            min_delta=0,
                                            patience=2,
                                            verbose=1,
                                            mode='auto',
                                            baseline=None)
    model_history = model.fit(x_train,
                              y_train,
                              batch_size=batch_size,
                              epochs=epochs,
                              verbose=1,
                              validation_split=0.1,
                              callbacks=[stopper])
    score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
    print("\n\n Test score: ", score[0])
    print("\n\n Test accuracy: ", score[1])
    model.save(model_file)

    # plot the losses
    loss = model_history.history['loss']
    val_loss = model_history.history['val_loss']
    plt.plot(loss)
    plt.plot(val_loss)
    plt.legend(['loss', 'val_loss'])
    plt.ylabel('Loss', fontsize=15)
    plt.xlabel('Epochs', fontsize=15)
    plt.show()

    text_labels = encoder.classes_
    y_softmax = model.predict(x_test)
    y_test_1d = []
    y_pred_1d = []
    for i in range(len(y_test)):
        probs = y_test[i]
        index_arr = np.nonzero(probs)
        one_hot_index = index_arr[0].item(0)
        y_test_1d.append(one_hot_index)
    for i in range(0, len(y_softmax)):
        probs = y_softmax[i]
        predicted_index = np.argmax(probs)
        y_pred_1d.append(predicted_index)

    def plot_confusion_matrix(cm,
                              classes,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues):
        """
        Prints and plots the confusion matrix, row-normalized so that each
        row sums to 1.
        """
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title, fontsize=20)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45, fontsize=15)
        plt.yticks(tick_marks, classes, fontsize=15)
        fmt = '.2f'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j,
                     i,
                     format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        plt.ylabel('True label', fontsize=20)
        plt.xlabel('Predicted label', fontsize=20)

    cnf_matrix = confusion_matrix(y_test_1d, y_pred_1d)
    plt.figure(figsize=(44, 37))
    plot_confusion_matrix(cnf_matrix, classes=text_labels, title="Confusion matrix")
    plt.show()
batch_size = 1000
num_samples = len(Y)
# input_shape = X[0].size
epochs = 100
num_layers = 2

train_sent, test_sent, y_train, y_test = train_test_split(X, Y,
                                                          shuffle=True,
                                                          train_size=0.9)
print(len(X), len(train_sent))

max_words = 3000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_sent)  # only fit on train
# pickle.dump(tokenize, open('models/CNN/modelTokenizer.h5', 'wb+'))
x_train = tokenize.texts_to_matrix(train_sent)
x_test = tokenize.texts_to_matrix(test_sent)
print(len(x_train), len(x_train[0]))

# encoder = LabelEncoder()
# encoder.fit(train_tags)
# y_train = encoder.transform(train_tags)
# y_test = encoder.transform(test_tags)
#
# num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)  # num_classes must be defined elsewhere
embed_size = 300

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(RocAucEvaluation, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(
                epoch + 1, score))

tok = text.Tokenizer(num_words=max_features, lower=True)
tok.fit_on_texts(list(X_train))
X_train = tok.texts_to_sequences(X_train)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)

embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tok.word_index
# prepare embedding matrix
num_words = min(max_features, len(word_index) + 1)
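# Usage sketch for the RocAucEvaluation callback above, on a tiny stand-in
# model with random data (every name below is illustrative, not from the
# original script):
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

Xd = np.random.rand(64, 8)
yd = (np.random.rand(64) > 0.5).astype('float32')
demo_model = Sequential([Dense(1, activation='sigmoid', input_shape=(8,))])
demo_model.compile(loss='binary_crossentropy', optimizer='adam')
demo_model.fit(Xd, yd, epochs=2, verbose=0,
               callbacks=[RocAucEvaluation(validation_data=(Xd, yd), interval=1)])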
def experiment(dev_id, model_dir):
    print(80 * "=")
    print("INITIALIZING")
    print(80 * "=")
    df_train = []
    df_train_idx = [x for x in range(1, 11) if x != dev_id]
    for i in df_train_idx:
        df_train.append(
            pd.read_csv(os.path.join('split', 'train-' + str(i) + '.csv')))
    df_train = pd.concat(df_train)
    # df_train = df_train[:80]
    X_train_raw = [normalize(t) for t in df_train["comment_text"].fillna('').values]
    y_train = df_train[[
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ]].values
    print("Finish loading training data")

    df_dev = pd.read_csv(os.path.join('split', 'train-' + str(dev_id) + '.csv'))
    # df_dev = df_dev[:200]
    X_dev_raw = [normalize(t) for t in df_dev["comment_text"].fillna('').values]
    y_dev = df_dev[[
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ]].values
    print("Finish loading dev data")

    df_test = pd.read_csv(os.path.join('test.csv'))
    # df_test = df_test[:200]
    X_test_raw = [normalize(t) for t in df_test["comment_text"].fillna('').values]
    print("Finish loading test data")

    tokenizer = text.Tokenizer(num_words=MAX_FEATURES, char_level=True)
    tokenizer.fit_on_texts(
        list(X_train_raw) + list(X_dev_raw) + list(X_test_raw))
    X_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train_raw),
                                     maxlen=MAX_SEQUENCE_LENGTH)
    X_dev = sequence.pad_sequences(tokenizer.texts_to_sequences(X_dev_raw),
                                   maxlen=MAX_SEQUENCE_LENGTH)
    X_test = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test_raw),
                                    maxlen=MAX_SEQUENCE_LENGTH)
    word_index = tokenizer.word_index
    print(word_index)

    valid_features = min(MAX_FEATURES, len(word_index)) + 1
    print(valid_features)
    # one-hot character embeddings: row 0 is the padding vector
    embeddings_matrix = np.r_[np.zeros((1, valid_features - 1)),
                              np.eye(valid_features - 1, dtype=int)]
    print(embeddings_matrix)

    def get_model():
        inp = Input(shape=(MAX_SEQUENCE_LENGTH, ))
        x = Embedding(valid_features,
                      valid_features - 1,
                      weights=[embeddings_matrix],
                      trainable=False)(inp)
        x_in = SpatialDropout1D(0.5)(x)
        x = Bidirectional(
            GRU(int(sys.argv[1].split('_')[2]),
                return_sequences=True,
                recurrent_dropout=float(sys.argv[1].split('_')[0])))(x_in)
        x = Bidirectional(
            GRU(int(sys.argv[1].split('_')[2]),
                return_sequences=True,
                recurrent_dropout=float(sys.argv[1].split('_')[1])))(x)
        x_raw = Dense(int(sys.argv[1].split('_')[2]) * 2,
                      activation='relu',
                      kernel_initializer='glorot_normal')(x_in)
        x = Add()([x, x_raw])
        x = Activation('relu')(x)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        conc = concatenate([avg_pool, max_pool])
        outp = Dense(6, activation="sigmoid")(conc)
        model = Model(inputs=inp, outputs=outp)
        model.compile(loss='binary_crossentropy',
                      optimizer=Adam(lr=1e-3),
                      metrics=['accuracy'])
        return model

    model = get_model()
    ra_val = RocAucMetricCallback()  # include it before EarlyStopping!
    filepath = os.path.join(model_dir, "weights_base.best.hdf5")
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='roc_auc_val',
                                 verbose=2,
                                 save_best_only=True,
                                 mode='max')
    early = EarlyStopping(monitor="roc_auc_val", mode="max", patience=5)
    callbacks_list = [ra_val, checkpoint, early]
    model.fit(X_train,
              y_train,
              batch_size=128,
              epochs=8,
              validation_data=(X_dev, y_dev),
              callbacks=callbacks_list,
              verbose=2)
    # note: reload the best saved weights before predicting
    model.load_weights(filepath)

    class_names = {
        0: 'toxic',
        1: 'severe_toxic',
        2: 'obscene',
        3: 'threat',
        4: 'insult',
        5: 'identity_hate'
    }

    y_train_predict = model.predict(X_train,
                                    batch_size=INFERENCE_BATCH_SIZE,
                                    verbose=2)
    submission = pd.DataFrame.from_dict({'id': df_train['id']})
    submission['comment_text'] = X_train_raw
    for (id, class_name) in class_names.items():
        submission[class_name] = y_train_predict[:, id]
    submission.to_csv(os.path.join(model_dir, 'predict-keras-train.csv'),
                      index=True)
    print("- AUC: ", roc_auc_score(y_train, y_train_predict))
    print("Finish train set prediction")

    y_dev_predict = model.predict(X_dev,
                                  batch_size=INFERENCE_BATCH_SIZE,
                                  verbose=2)
    submission = pd.DataFrame.from_dict({'id': df_dev['id']})
    submission['comment_text'] = X_dev_raw
    for (id, class_name) in class_names.items():
        submission[class_name] = y_dev_predict[:, id]
    submission.to_csv(os.path.join(model_dir, 'predict-keras-dev.csv'),
                      index=True)
    print("- AUC: ", roc_auc_score(y_dev, y_dev_predict))
    print("Finish dev set prediction")

    y_test_predict = model.predict(X_test,
                                   batch_size=INFERENCE_BATCH_SIZE,
                                   verbose=2)
    submission = pd.DataFrame.from_dict({'id': df_test['id']})
    for (id, class_name) in class_names.items():
        submission[class_name] = y_test_predict[:, id]
    submission.to_csv(os.path.join(model_dir, 'submit.csv'), index=False)
    print("Finish test set prediction")
    return 0
x_train = train_df[TEXT_COLUMN].astype(str)
y_train = train_df[TARGET_COLUMN].values
y_aux_train = train_df[AUX_COLUMNS].values
x_test = test_df[TEXT_COLUMN].astype(str)

# =============================================================================
# Data processing
# =============================================================================
for column in IDENTITY_COLUMNS + [TARGET_COLUMN]:
    train_df[column] = np.where(train_df[column] >= 0.5, True, False)

tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE)
tokenizer.fit_on_texts(list(x_train) + list(x_test))

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

print("---- Weights")
sample_weights = np.ones(len(x_train), dtype=np.float32)
sample_weights += train_df[IDENTITY_COLUMNS].sum(axis=1)
sample_weights += train_df[TARGET_COLUMN] * (~train_df[IDENTITY_COLUMNS]).sum(axis=1)
sample_weights += (~train_df[TARGET_COLUMN]) * train_df[IDENTITY_COLUMNS].sum(axis=1) * 5
sample_weights /= sample_weights.mean()
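# Worked toy example of the weighting scheme above (columns are illustrative):
# a toxic row mentioning no identity gets 1 + 0 + 2 + 0 = 3, while a non-toxic
# row flagged for both identities gets 1 + 2 + 0 + 2*5 = 13, before dividing
# by the mean.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'target': [True, False],
                    'idA': [False, True],
                    'idB': [False, True]})
w = np.ones(2, dtype=np.float32)
w += toy[['idA', 'idB']].sum(axis=1)
w += toy['target'] * (~toy[['idA', 'idB']]).sum(axis=1)
w += (~toy['target']) * toy[['idA', 'idB']].sum(axis=1) * 5
print(w)             # [ 3. 13.]
print(w / w.mean())  # [0.375 1.625]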
raw_data = raw_data.as_matrix()
y_train = raw_data[:, 0].tolist()
num = len(y_train)
x_train = raw_data[:, 1].tolist()

# read test data
x_test = [
    line.rstrip('\n')
    for line in open('testing_data.txt', 'r', encoding='UTF-8')
]
x_test = [line.split(',', 1)[1] for line in x_test]
del x_test[0]
# x_unlabeled = [line.rstrip('\n') for line in open('training_nolabel.txt', 'r', encoding='UTF-8')]

# tokenized
t = text.Tokenizer(num_words=max_word_idx, filters='\t\n')
t.fit_on_texts(x_train + x_test)
np.save('t.npy', t)
x_train = t.texts_to_sequences(x_train)
# x_unlabeled = t.texts_to_sequences(x_unlabeled)
x_test = t.texts_to_sequences(x_test)

# preprocess
x_train = sequence.pad_sequences(x_train, maxlen=max_sequence_len)
# x_unlabeled = sequence.pad_sequences(x_unlabeled, maxlen=max_sequence_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_sequence_len)
x_train = np.asarray(x_train)
y_train = np.asarray(y_train).reshape(-1, 1)
# x_unlabeled = np.asarray(x_unlabeled)
x_test = np.asarray(x_test)
wandb.init()
config = wandb.config

# set parameters:
config.vocab_size = 1000
config.maxlen = 300
config.batch_size = 32
config.embedding_dims = 50
config.filters = 250
config.kernel_size = 3
config.hidden_dims = 100
config.epochs = 10

(X_train, y_train), (X_test, y_test) = imdb.load_imdb()

tokenizer = text.Tokenizer(num_words=config.vocab_size)
tokenizer.fit_on_texts(X_train)
# texts_to_sequences (not texts_to_matrix) so that padding and the
# downstream embedding layer receive integer word-index sequences
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = sequence.pad_sequences(X_train, maxlen=config.maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=config.maxlen)

embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
lambda x: " ".join(i for i in jieba.cut(x))) fw = open(save_feature, 'wb') pickle.dump(df, fw) fw.close() else: print("特征存在,直接加载...") fw = open(save_feature, 'rb') df = pickle.load(fw) fw.close() ###取出第三列的所有行 X_train = df.iloc[:train_row, 3] X_test = df.iloc[train_row:, 3] ###Tokenizer进行词法分析 tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS) tokenizer.fit_on_texts(list(X_train) + list(X_test)) ###转换word下标的向量形式 X_train = tokenizer.texts_to_sequences(X_train) X_test = tokenizer.texts_to_sequences(X_test) ###将序列填充到maxlen长度 x_train = sequence.pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH) test = sequence.pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH) ###训练集结果集 y = df_train['COMMLEVEL'].values ###找出lstm权重 word_index = tokenizer.word_index
    return

data = pd.read_csv("preprocess/train_char.csv")
data["content"] = data.apply(lambda x: eval(x[1]), axis=1)
validation = pd.read_csv("preprocess/validation_char.csv")
validation["content"] = validation.apply(lambda x: eval(x[1]), axis=1)

model_dir = "model_bigru_char/"
maxlen = 1200
max_features = 20000
batch_size = 128
epochs = 15

tokenizer = text.Tokenizer(num_words=None)
tokenizer.fit_on_texts(data["content"].values)
with open('tokenizer_char.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
# is word_index around 60k+ entries?
word_index = tokenizer.word_index

# word2vec: 7983 tokens x 100 dims, word2vec/chars.vector
w2_model = KeyedVectors.load_word2vec_format("word2vec/chars.vector",
                                             binary=True,
                                             encoding='utf8',
                                             unicode_errors='ignore')
embeddings_index = {}
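# assumed continuation (sketch; the snippet breaks off right here): copy each
# character vector out of the gensim model into the plain dict, using the
# pre-4.0 gensim API that matches the KeyedVectors call above
for char in w2_model.vocab:
    embeddings_index[char] = w2_model.word_vec(char)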
data = pd.read_csv("stack-overflow.csv") # Split dataset into 80% training data and 20% test data EIGHTY_PERCENT = 0.80 train_size = int(len(data) * EIGHTY_PERCENT) train_posts = data['post'][:train_size] train_tags = data['tags'][:train_size] # which means 20% will be test data test_posts = data['post'][train_size:] test_tags = data['tags'][train_size:] # the vocabulary size for our model - the top 1000 most commonly used words vocabulary_size = 1000 # Use Kera's Tokenizer class tokenize = text.Tokenizer(num_words=vocabulary_size) tokenize.fit_on_texts(train_posts) # Create the training data from the collection of posts to pass to the model # Creates a vocabulary_size “bag” array, with 1s indicating the indices # where words in a question are present in the vocabulary x_train = tokenize.texts_to_matrix(train_posts) # training dataset: 1s and 0s representation of the tokens in the StackOverflow posts data # # [[0. 1. 1. ... 0. 0. 0.] # [0. 1. 1. ... 0. 0. 0.] # [0. 1. 1. ... 0. 0. 0.] # ... # [0. 1. 1. ... 0. 0. 0.] # [0. 1. 1. ... 0. 0. 1.]
def senti_preprocess():
    train_ori, test_ori = _read_data('train.pd', 'test.pd')
    train_ori['text_list'] = train_ori['text_list'].apply(
        lambda lst: _remove_pattern_2(lst))
    test_ori['text_list'] = test_ori['text_list'].apply(
        lambda lst: _remove_pattern_2(lst))

    # save the random shuffle order so the senti data below gets the same shuffle
    train_random = np.random.permutation(len(train_ori))
    test_random = np.random.permutation(len(test_ori))
    train_ori = train_ori.iloc[train_random]  # shuffle manually
    test_ori = test_ori.iloc[test_random]

    # concatenate each user's tweets, mainly for tokenizer.fit_on_texts below
    train_text = train_ori['text_list'].apply(lambda lst: " ".join(lst))
    test_text = test_ori['text_list'].apply(lambda lst: " ".join(lst))

    # transform the labels to build the Y datasets
    Y_train = train_ori['label'].apply(lambda gender: 1 if gender == 'male' else 0)
    Y_test = test_ori['label'].apply(lambda gender: 1 if gender == 'male' else 0)

    # fit tokenizer
    tokenizer = text.Tokenizer(num_words=MAX_VOCAB_SIZE)  # maximum vocabulary size
    tokenizer.fit_on_texts(
        list(train_text) + list(test_text)
    )  # updates the internal vocabulary from a list of texts

    # turn each user's text_list into sequences, then into equal-length sequences
    train_ori['seq'] = train_ori['text_list'].apply(
        lambda lst: tokenizer.texts_to_sequences(lst))
    test_ori['seq'] = test_ori['text_list'].apply(
        lambda lst: tokenizer.texts_to_sequences(lst))
    train_ori['seq'] = train_ori['seq'].apply(
        lambda lst: sequence.pad_sequences(lst, maxlen=MAXLEN))
    test_ori['seq'] = test_ori['seq'].apply(
        lambda lst: sequence.pad_sequences(lst, maxlen=MAXLEN))

    # convert the equal-length sequences to numpy arrays
    X_train = np.array(list(train_ori['seq']))
    X_test = np.array(list(test_ori['seq']))
    print(X_train.shape)
    print(X_test.shape)

    #################### prepare the senti data below
    senti = pd.read_pickle(
        os.path.join(os.path.dirname(__file__), '..', 'output', 'senti_ori.pd'))
    senti['text'] = _remove_pattern_2(list(senti['text']))
    senti = senti.iloc[np.random.permutation(len(senti))]  # shuffle manually
    senti_text_list = list(senti['text'])

    senti_tokenizer = text.Tokenizer(num_words=MAX_VOCAB_SIZE)  # maximum vocabulary size
    senti_tokenizer.fit_on_texts(
        senti_text_list + list(train_text) + list(test_text)
    )  # updates the internal vocabulary from a list of texts
    senti_text_seq = senti_tokenizer.texts_to_sequences(
        senti_text_list)  # transforms each text into a sequence of integers
    senti_train_seq = senti_tokenizer.texts_to_sequences(list(train_text))
    senti_test_seq = senti_tokenizer.texts_to_sequences(list(test_text))

    X_senti = sequence.pad_sequences(
        senti_text_seq, maxlen=LONG_MAXLEN)  # pads each sequence to the same length
    X_senti = np.array(list(X_senti))
    Y_senti = senti['label']
    print(X_senti.shape)
    print(Y_senti.shape)

    senti_train = sequence.pad_sequences(senti_train_seq, maxlen=LONG_MAXLEN)
    senti_test = sequence.pad_sequences(senti_test_seq, maxlen=LONG_MAXLEN)
    senti_train = np.array(list(senti_train))
    senti_test = np.array(list(senti_test))
    print(senti_train.shape)
    print(senti_test.shape)

    return X_train, Y_train, X_test, Y_test, tokenizer, X_senti, Y_senti, senti_train, senti_test
stop_words = stopwords.words("english") def lemmatize(x): lemmatized = [] for post in x: temp = post.lower() for type_ in types: temp = temp.replace(' ' + type_, '') temp = ' '.join([ lemmatizer.lemmatize(word) for word in temp.split(' ') if (word not in stop_words) ]) lemmatized.append(temp) return np.array(lemmatized) tokenizer = text.Tokenizer(num_words=TOP_WORDS, split=' ') tokenizer.fit_on_texts(lemmatize(x_train)) def preprocess(x): lemmatized = lemmatize(x) tokenized = tokenizer.texts_to_sequences(lemmatized) return sequence.pad_sequences(tokenized, maxlen=MAX_POST_LENGTH) ### Assign to dataframe and shuffle rows df = pd.DataFrame(data={'x': x_train, 'y': y_train}) df = df.sample(frac=1).reset_index(drop=True) ### Shuffle rows if SAMPLE: df = df.head(10000) ### Small sample for quick runs ### Load glove into memory for embedding embeddings_index = dict()