def sequence_vectorize(self, train_texts, val_texts):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val, word_index: vectorized training and validation
            texts and word index dictionary.
    """
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=self.TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > self.MAX_SEQUENCE_LENGTH:
        max_length = self.MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index
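# A minimal usage sketch for sequence_vectorize above (not from the original
# source): it assumes the containing class defines TOP_K and MAX_SEQUENCE_LENGTH
# and that `vectorizer` is an instance of that class; the toy texts are illustrative.
train_texts = ["the cat sat on the mat", "dogs chase cats"]
val_texts = ["the dog sat"]
x_train, x_val, word_index = vectorizer.sequence_vectorize(train_texts, val_texts)
# x_train has shape (2, max_length) and x_val has shape (1, max_length);
# word_index maps each token to its integer id, e.g. word_index["the"] == 1.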
def build_embedding_matrix(x_train, x_test, maxlen, first_time, file_path):
    # Tokenizer
    if first_time:
        tokenizer = text.Tokenizer()
        tokenizer.fit_on_texts(x_train)
        dump(tokenizer, 'tokenizer.joblib', compress=3)
    else:
        tokenizer = load('tokenizer.joblib')

    # Word index
    word_index = tokenizer.word_index

    # Embedding matrix
    if first_time:
        print('Loading embedding index')
        embedding_index = load_embedding_index(file_path)
        print('Building our embedding matrix')
        embedding_matrix = np.zeros((len(word_index) + 1, 300))
        for word, i in word_index.items():
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        dump(embedding_matrix, 'embedding_matrix.joblib', compress=3)
    else:
        embedding_matrix = load('embedding_matrix.joblib')

    # Tokenizing + padding
    seq_x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=maxlen)
    seq_x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=maxlen)

    return seq_x_train, seq_x_test, embedding_matrix, word_index
def preprocess(data):
    tokenizer = text.Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(data)
    data_x = tokenizer.texts_to_sequences(data)
    data_x = sequence.pad_sequences(data_x, maxlen=max_length)
    data_x = np.array(data_x)
    return data_x
def preprocess(data_x, data_y, train_test_ratio=0.9):
    # Build vocabulary
    max_length = max([len(text.split(" ")) for text in data_x])
    vectorizer = CountVectorizer(lowercase=True, max_df=100)
    vectorizer.fit(data_x)
    vocab_size = len(vectorizer.vocabulary_)

    tokenizer = text.Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(data_x)
    data_x = tokenizer.texts_to_sequences(data_x)
    data_x = sequence.pad_sequences(data_x, maxlen=max_length)
    data_x = np.array(data_x)
    # vocab_processor = learn.preprocessing.VocabularyProcessor(max_length)
    # data_x = np.array(list(vocab_processor.fit_transform(data_x)))
    data_y = np.array(data_y)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(data_y.shape[0]))
    x_shuffled = data_x[shuffle_indices]
    y_shuffled = data_y[shuffle_indices]

    # Divide data
    data_length = len(x_shuffled)
    dividing_index = int(train_test_ratio * data_length)
    train_x = x_shuffled[:dividing_index]
    train_y = y_shuffled[:dividing_index]
    test_x = x_shuffled[dividing_index:]
    test_y = y_shuffled[dividing_index:]

    return train_x, train_y, test_x, test_y, vocab_size
def tokenize(sent_list):
    tokenizer = text.Tokenizer(filters='', oov_token='<unk>')
    tokenizer.fit_on_texts(sent_list)
    tensor_list = tokenizer.texts_to_sequences(sent_list)
    tensor_list = sequence.pad_sequences(tensor_list, padding='post')
    return {'Tensors': tensor_list, 'Tokenizer': tokenizer}
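# A brief usage sketch for tokenize() above (not from the original source): the
# returned dict bundles the post-padded integer tensors with the fitted Tokenizer
# so the same mapping can be reused later; the sentences below are illustrative.
result = tokenize(["<start> hello world <end>", "<start> hello <end>"])
padded = result['Tensors']               # shape (2, length_of_longest_sentence)
vocab = result['Tokenizer'].word_index   # includes '<unk>' as the OOV token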
def get_tokenizer():
    with open(PATH + '/config/ruvec/vocab.txt') as f:
        words = [i[:-1] for i in f.readlines()]
    csv = read_csv(PATH + '/assets/csv/names.csv')
    train = csv['text']
    token = text.Tokenizer(num_words=None)
    token.fit_on_texts(list(train) + words)
    return token
def _init_tokenizer(self):
    '''Initialise tokenizer for processing decay strings'''
    tokenize = text.Tokenizer(
        num_words=self.num_pdg_codes,
        filters='!"#$%&*+,./:;=?@[\\]^_`{|}~',
    )
    tokenize.fit_on_texts(evtPdl.pdgTokens)
    return tokenize
def create_tokenizer(self, text_list):
    """
    The Tokenizer class vectorizes a text corpus by turning each text into
    either a sequence of integers (each integer being the index of a token in
    a dictionary) or into a vector where the coefficient for each token can be
    binary, based on word count, or based on tf-idf.
    """
    tokenizer = text.Tokenizer(num_words=self._vocab_size)
    tokenizer.fit_on_texts(text_list)
    self._tokenizer = tokenizer
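# A hedged sketch (not from the original source) contrasting the two vectorization
# modes the docstring above describes; the sample texts and num_words are illustrative.
docs = ["the cat sat", "the cat sat on the mat"]
tok = text.Tokenizer(num_words=20)
tok.fit_on_texts(docs)
tok.texts_to_sequences(docs)             # e.g. [[1, 2, 3], [1, 2, 3, 4, 1, 5]]
tok.texts_to_matrix(docs, mode='count')  # shape (2, 20): one count per vocabulary word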
def sequence_vectorize(train_texts, val_texts):
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    max_len = len(max(x_train, key=len))
    if max_len > MAX_SEQUENCE_LENGTH:
        max_len = MAX_SEQUENCE_LENGTH

    x_train = sequence.pad_sequences(x_train, maxlen=max_len)
    x_val = sequence.pad_sequences(x_val, maxlen=max_len)
    return x_train, x_val, tokenizer.word_index
def create__inputs_outputs(train_dataframe, test_dataframe, num_of_tokenizer):
    number_of_words = num_of_tokenizer
    tokenizer = text.Tokenizer(num_words=number_of_words)
    tokenizer.fit_on_texts(train_dataframe['sentence'])

    (train_tokens_left, train_tokens_right, train_tokens_all,
     test_tokens_left, test_tokens_right, test_tokens_all) = create_sequences(
        tokenizer, train_dataframe, test_dataframe)

    labels_train = np.asanyarray(
        pd.get_dummies(train_dataframe['label'], prefix=['label']))
    labels_test = np.asanyarray(
        pd.get_dummies(test_dataframe['label'], prefix=['label']))

    return (tokenizer, train_tokens_left, train_tokens_right, train_tokens_all,
            test_tokens_left, test_tokens_right, test_tokens_all,
            labels_train, labels_test)
def __init__(self, args):
    print('init processor')
    self._vocab_size = args.NB_WORDS
    self._max_sequence_length = args.MAX_SEQUENCE_LENGTH
    self._tokenizer = text.Tokenizer(
        num_words=args.NB_WORDS,
        filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
        lower=True,
        char_level=False)  # save all vocabularies
    self._pad_sequences = pad_sequences
    self._re_sub = re.sub
    self._head_word = args.START_WORD
    self._end_word = args.END_WORD
def get_tokenizer_and_train(divided_code_path: str, save_tokenizer_path: str,
                            save_file_path: str, min_count: int, max_len: int) -> None:
    with open(divided_code_path, "rb") as f:
        all_word = pickle.load(f)

    tokenizer = text.Tokenizer(filters="", lower=False, char_level=False)
    tokenizer.fit_on_texts(all_word)

    treated_str_train = []
    count = 0
    dir_file_list = []
    final_train = []
    for s in all_word:
        sentence = []
        for w in s:
            if tokenizer.word_counts[w] >= min_count:
                sentence.append(w)
            else:
                sentence.append("@OTHER")
        treated_str_train.append(sentence)
        dir_file_list.append(sentence)
        count += 1
        if count == 500:
            final_train.append(dir_file_list)
            dir_file_list = []
            count = 0

    save_tokenizer = text.Tokenizer(filters="", lower=False, char_level=False)
    save_tokenizer.fit_on_texts(treated_str_train)

    for i in range(len(final_train)):
        term1 = save_tokenizer.texts_to_sequences(final_train[i])
        final_train[i] = pad_sequences(term1, maxlen=max_len)

    with open(save_tokenizer_path, "wb") as f:
        pickle.dump(save_tokenizer, f)
    with open(save_file_path, "wb") as f:
        pickle.dump(final_train, f)
    return
def prepare_text_for_cbow(all_words):
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(all_words[0])
    word2id = tokenizer.word_index  # build vocabulary of unique words
    word2id['PAD'] = 0
    id2word = {v: k for k, v in word2id.items()}
    wids = [word2id[w] for w in text.text_to_word_sequence(all_words[0][0])]

    vocab_size = len(word2id)
    embed_size = 10
    window_size = 2  # context window size

    print('Vocabulary Size:', vocab_size)
    print('Vocabulary Sample:', list(word2id.items())[:10])

    return wids, vocab_size, embed_size, window_size, id2word, word2id
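# A hedged follow-up sketch (not part of the original code) showing one way the
# values returned by prepare_text_for_cbow could be turned into CBOW
# (context -> target) training pairs; cbow_pairs is an illustrative helper name.
def cbow_pairs(wids, window_size):
    pairs = []
    for i, target in enumerate(wids):
        start = max(0, i - window_size)
        end = min(len(wids), i + window_size + 1)
        context = [wids[j] for j in range(start, end) if j != i]
        pairs.append((context, target))  # pad contexts to 2*window_size before batching
    return pairs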
def fit(self, corpus):
    # load train and test data from the corpus
    train_data = corpus.train_data
    test_data = corpus.test_data

    # pre-processing by helper functions
    x_train, y_train = helper.get_text_and_labels(train_data)
    x_test, y_test = helper.get_text_and_labels(test_data)

    # tokenize, pad, convert to matrix
    tokenizer = text.Tokenizer(num_words=self.max_words, char_level=False)
    tokenizer.fit_on_texts(x_train)
    x_train = tokenizer.texts_to_matrix(x_train)
    x_test = tokenizer.texts_to_matrix(x_test)
    x_train = sequence.pad_sequences(x_train, maxlen=self.sequence_length)
    x_test = sequence.pad_sequences(x_test, maxlen=self.sequence_length)

    # one hot encode labels
    encoder = LabelEncoder()
    encoder.fit(y_train)
    # update class names
    self.class_names = encoder.classes_
    y_train_encoded = encoder.transform(y_train)
    y_test_encoded = encoder.transform(y_test)
    # update
    self.n_classes = np.max(y_train_encoded) + 1
    y_train_encoded = utils.to_categorical(y_train_encoded, self.n_classes)
    y_test_encoded = utils.to_categorical(y_test_encoded, self.n_classes)

    self.x_train = x_train
    self.x_test = x_test
    self.y_train = y_train_encoded
    self.y_test = y_test_encoded

    # update weights
    self.__load_weights(vocabulary=tokenizer.word_index)
def __init__(self, train_texts, calculate_vocab):
    '''
    calculate_vocab: Boolean, if True we calculate the vocabulary from the dataset.
    '''
    # Get vocabulary.
    if calculate_vocab == False:
        with open("PretrainedEmbedding/vocab.txt") as f:
            self.word_index = {}
            i = 1
            for line in f:
                (key, _) = line.split()
                self.word_index[key] = i
                i += 1
                if i == MAX_FEATURES + 1:
                    break
    elif calculate_vocab == True:
        tokenizer = text.Tokenizer()
        tokenizer.fit_on_texts(train_texts)
        self.word_index = dict(
            itertools.islice(tokenizer.word_index.items(), MAX_FEATURES))
def get_divided_code_with_min_count(divided_code_path: str, save_tokenizer_path: str,
                                    save_file_path: str, min_count: int) -> None:
    # First, replace every word that occurs fewer than min_count times with the
    # same placeholder token.
    with open(divided_code_path, "rb") as f:
        all_word = pickle.load(f)

    tokenizer = text.Tokenizer(filters="", lower=False, char_level=False)
    tokenizer.fit_on_texts(all_word)
    # i = tokenizer.word_counts

    with open(save_tokenizer_path, "wb") as f:
        pickle.dump(tokenizer, f)

    with open(save_file_path, "w") as f:
        for s in all_word:
            for w in s:
                if tokenizer.word_counts[w] >= min_count:
                    f.write(w + " ")
                else:
                    f.write("@OTHER ")
            f.write("\n")
    return
def sequence_vectorize(train_texts, val_texts, test_texts, model_dir):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.
        test_texts: list, testing text strings.
        model_dir: string, location where the vectorizer and selector will be saved.

    # Returns
        x_train, x_val, x_test, word_index: vectorized training, validation
            and testing texts and word index dictionary.
    """
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Vectorize training, validation and testing texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)
    x_test = tokenizer.texts_to_sequences(test_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    x_test = sequence.pad_sequences(x_test, maxlen=max_length)

    pickle.dump(tokenizer, open(os.path.join(model_dir, 'tokenizer.pickle'), "wb"))
    pickle.dump(max_length, open(os.path.join(model_dir, 'max_length.pickle'), "wb"))

    return x_train, x_val, x_test, tokenizer.word_index
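# A hedged inference-time sketch (not from the original source): reload the two
# pickled artifacts and vectorize unseen text the same way; model_dir and
# new_texts are illustrative placeholders.
with open(os.path.join(model_dir, 'tokenizer.pickle'), 'rb') as f:
    tokenizer = pickle.load(f)
with open(os.path.join(model_dir, 'max_length.pickle'), 'rb') as f:
    max_length = pickle.load(f)
x_new = sequence.pad_sequences(tokenizer.texts_to_sequences(new_texts), maxlen=max_length)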
def nn_setup(x, y, max_length=450, tokenizer_name='nn_model.pkl'):
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    tokenizer = text.Tokenizer(num_words=28331)
    tokenizer.fit_on_texts(x_train)
    train_sequences = tokenizer.texts_to_sequences(x_train)
    test_sequences = tokenizer.texts_to_sequences(x_test)

    tokenizer_pkl = open(tokenizer_name, 'wb')
    pickle.dump(tokenizer, tokenizer_pkl)
    files.download(tokenizer_name)
    tokenizer_pkl.close()

    x_train_seq = sequence.pad_sequences(train_sequences, maxlen=max_length)
    x_test_seq = sequence.pad_sequences(test_sequences, maxlen=max_length)
    y_train_seq = to_categorical(y_train)
    y_test_seq = to_categorical(y_test)

    weights = compute_class_weight('balanced', np.unique(y_train), y_train)
    weights_dict = dict(zip(np.unique(y_train), weights))

    return tokenizer, x_train_seq, x_test_seq, y_train_seq, y_test_seq, weights_dict
def get_divided_code(train_path: str, map_save_path: str, train_save_path: str,
                     save_tokenizer_path: str, word_num: int) -> None:
    # Insert spaces between the different components of the code.
    all_word = []
    all_word_in_one_line = []
    for root, dirs, files in os.walk(train_path):
        dir_test = []
        for name in files:
            with open(os.path.join(root, name), "r") as f:
                code = f.read()
            r = discrete_code(code)
            dir_test.append(r)
            all_word_in_one_line.extend(r)
        if dir_test:
            all_word.append(dir_test)

    tokenizer = text.Tokenizer(lower=False, char_level=False)
    tokenizer.fit_on_texts([all_word_in_one_line])
    with open(save_tokenizer_path, "wb") as f:
        pickle.dump(tokenizer, f)

    selected_word = sorted(tokenizer.word_counts.items(), key=lambda a: -a[1])
    selected_word = selected_word[:word_num - 1]
    word_map = dict()
    count = 0
    for w, _ in selected_word:
        word_map[w] = count
        count += 1
    with open(map_save_path, "wb") as f:
        pickle.dump(word_map, f)

    test_count = []
    for d in all_word:
        dir_test = []
        for s in d:
            dir_test.append(deal_with_s(word_map, s, word_num))
        test_count.append(dir_test)
    with open(train_save_path, "wb") as f:
        pickle.dump(test_count, f)
def fit(self, instances):
    tokenizer = text.Tokenizer(lower=False, filters=[], oov_token=None)
    tokenizer.fit_on_texts(instances)
    self._tokenizer = tokenizer
    self.number_words = len(tokenizer.word_index)
    print(self.number_words)
# then fill all NaN values with the empty string '' (redundant)
x_train = train['comment_text'].fillna('').values

# if true, y_train[i] = 1; if false, it is 0
y_train = np.where(train['target'] >= 0.5, 1, 0)
y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]

# Take the column 'comment_text' from test,
# then fill all NaN values with the empty string '' (redundant)
x_test = test['comment_text'].fillna('').values

# https://keras.io/preprocessing/text/
# tokenizer is a class with some methods
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)

# we apply the fit_on_texts method of tokenizer to x_train and x_test;
# it will initialize some parameters/attributes inside tokenizer
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py#L139
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py#L210
tokenizer.fit_on_texts(list(x_train) + list(x_test))

# for example, after fit_on_texts, we can inspect
# tokenizer.word_counts      # gives an OrderedDict
# tokenizer.document_count   # an int
# tokenizer.word_index is a dict of words with corresponding indices
# There are 410046 different words in all 'comment_text'
# len(tokenizer.word_index) == 410_046
def generateOOVEmbeddings():
    # read the (DL cleaned) dataset and build the vocabulary
    print('loading dataframes...')
    train_df = pd.read_csv('../data/training/train2.cleaned.dl.csv')
    test_df = pd.read_csv('../data/eval/test2.cleaned.dl.csv')

    # ps: forget memory and runtime, it's python here :D
    list_sentences_train = train_df["comment_text"].values
    list_sentences_test = test_df["comment_text"].values
    list_sentences_all = np.concatenate([list_sentences_train, list_sentences_test])

    tokenizer = text.Tokenizer(num_words=400000)
    tokenizer.fit_on_texts(list(list_sentences_all))
    print('word_index size:', len(tokenizer.word_index), 'words')
    word_index = tokenizer.word_index

    # load fastText - only the words
    print('loading fastText embeddings...')
    voc = set()
    f = open('/mnt/data/wikipedia/embeddings/crawl-300d-2M.vec')
    begin = True
    for line in f:
        if begin:
            begin = False
        else:
            values = line.split()
            word = ' '.join(values[:-300])
            voc.add(word)
    f.close()
    print('fastText embeddings:', len(voc), 'words')

    oov = []
    for tokenStr in word_index:
        if not tokenStr in voc:
            oov.append(tokenStr)
    print('fastText embeddings:', len(oov), 'out-of-vocabulary')

    with open("../data/training/oov-fastText.txt", "w") as oovFile:
        for w in oov:
            oovFile.write(w)
            oovFile.write('\n')
        oovFile.close()

    # load gloves - only the words
    print('loading gloves embeddings...')
    voc = set()
    f = open('/mnt/data/wikipedia/embeddings/glove.840B.300d.txt')
    for line in f:
        values = line.split()
        word = ' '.join(values[:-300])
        voc.add(word)
    f.close()
    print('gloves embeddings:', len(voc), 'words')

    oov = []
    for tokenStr in word_index:
        if not tokenStr in voc:
            oov.append(tokenStr)
    print('gloves embeddings:', len(oov), 'out-of-vocabulary')

    with open("../data/training/oov-gloves.txt", "w") as oovFile:
        for w in oov:
            oovFile.write(w)
            oovFile.write('\n')
        oovFile.close()

    # load word2vec - only the words
    print('loading word2vec embeddings...')
    voc = set()
    f = open('/mnt/data/wikipedia/embeddings/GoogleNews-vectors-negative300.vec')
    begin = True
    for line in f:
        if begin:
            begin = False
        else:
            values = line.split()
            word = ' '.join(values[:-300])
            voc.add(word)
    f.close()
    print('word2vec embeddings:', len(voc), 'words')

    oov = []
    for tokenStr in word_index:
        if not tokenStr in voc:
            oov.append(tokenStr)
    print('word2vec embeddings:', len(oov), 'out-of-vocabulary')

    with open("../data/training/oov-w2v.txt", "w") as oovFile:
        for w in oov:
            oovFile.write(w)
            oovFile.write('\n')
        oovFile.close()

    # load numberbatch - only the words
    print('loading numberbatch embeddings...')
    voc = set()
    f = open('/mnt/data/wikipedia/embeddings/numberbatch-en-17.06.txt')
    begin = True
    for line in f:
        if begin:
            begin = False
        else:
            values = line.split()
            word = ' '.join(values[:-300])
            voc.add(word)
    f.close()
    print('numberbatch embeddings:', len(voc), 'words')

    oov = []
    for tokenStr in word_index:
        if not tokenStr in voc:
            oov.append(tokenStr)
    print('numberbatch embeddings:', len(oov), 'out-of-vocabulary')

    with open("../data/training/oov-numberbatch.txt", "w") as oovFile:
        for w in oov:
            oovFile.write(w)
            oovFile.write('\n')
        oovFile.close()
        embedding_matrix[i] = word2vec.word_vec(word)
    return embedding_matrix


def f1_score(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1_val


MAX_LEN = 300

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_test))
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

embedding_models = [etnlp_word2vec, sonvx_word2vec]
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in embedding_models], axis=-1)

LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
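# Hedged note (not from the original source): because f1_score above is written
# with Keras backend ops, it can be passed as a custom metric at compile time,
# for example (the model name is illustrative):
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[f1_score])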
def get_token(text_list, num_words=VOCAB_SIZE):
    tokenizer = text.Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(text_list)
    return tokenizer
print(x.shape)
x[8712] = ' '

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM, Input, Dropout
from tensorflow.keras.layers import Conv1D, AveragePooling1D, MaxPooling1D
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.metrics import AUC
from tensorflow.keras.optimizers import Adam

maxlen = 1000
max_words = 20000
embedding_dim = 50

tokenizer = text.Tokenizer(
    filters='0123456789!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    num_words=max_words)
tokenizer.fit_on_texts(x)
x_train = tokenizer.texts_to_sequences(x)
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
print(x_train.shape)
y_train = y

inp = Input(shape=(maxlen,))
x = Embedding(max_words, embedding_dim, input_length=maxlen)(inp)
x = Conv1D(32, 3, strides=1, padding='same', activation='relu')(x)
x = MaxPooling1D(3)(x)
x = LSTM(32)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
def __init__(self, filename, name='data_source', input_name='input',
             output_Name='reference', vocab_size=100, batch_size=200, skip=500):
    with tf.name_scope(name):
        # Create tokenizer to map the strings onto sequences
        self.tokenizer = kpt.Tokenizer(num_words=vocab_size,
                                       filters='',
                                       lower=False,
                                       split='',
                                       char_level=True,
                                       oov_token=None)

        # Fit tokenizer on text
        with open(filename, 'r') as file:
            text = file.read()
        self.tokenizer.fit_on_texts([text])

        # Store reverse mapping
        self.reverse_mapping = {
            v: k
            for k, v in self.tokenizer.word_index.items()
        }

        # Create primary dataset
        # This dataset contains a rank-0 tensor (scalar) per line
        # Each such tensor is of type 'string'
        filename = "iliad.txt"
        dataset = tf.data.TextLineDataset(filename).skip(skip).repeat()

        # Convert string dataset into a binary category dataset
        # Each ()-tensor (containing one string) is turned into a (l, v) tensor where
        # - l is the length of the string
        # - v is the size of the vocabulary
        # where output[i, j] == 1 if tokenizer.word_index[input[i]] == tokenizer.tokens[j] else 0
        # This is done by wrapping the tokenizer's "texts_to_matrix" method in a "py_func" tensorflow op
        # (which maps a single entry) and calling dataset.map with a function that returns a single such op
        # (and sets the shape of the output tensor)
        def tokenize_op(x):
            def tokenize(y):
                # Split the string into single-character lists (using the 'list' constructor)
                # and call the texts_to_matrix method.
                matrix = self.tokenizer.texts_to_matrix(
                    list(y.decode('utf-8')))
                # Convert to float type
                return matrix.astype(np.float32)

            # Wrap a call to the tokenize function with a float32 result
            out = tf.py_func(
                tokenize,      # Target function
                [x],           # Arguments
                (tf.float32),  # Return type (must be specified in advance because the function is called on demand)
                False          # Whether this operation is stateful
            )
            # Add some shape information on the output for the tensorflow shape inference engine
            out.set_shape([tf.Dimension(None), vocab_size])
            return out

        matrix_dataset = dataset.map(tokenize_op)

        # Batch data together by padding all sequences to the longest one
        # In this case the datasets have shapes (l_i, vocab_size) and the operation produces
        # the shapes (batch_size, max(l_i), vocab_size)
        batched_dataset = matrix_dataset.padded_batch(
            batch_size, (-1, vocab_size))

        # Create input and output data by making datasets that take all but the last resp. the first element
        input_dataset = batched_dataset.map(lambda x: x[:, 0:-1, :])
        output_dataset = batched_dataset.map(lambda x: x[:, 1:, :])

        # Create iterators
        input_iterator = input_dataset.make_initializable_iterator()
        output_iterator = output_dataset.make_initializable_iterator()

        # Create action that just runs both initializer ops
        init_actions = [
            input_iterator.initializer, output_iterator.initializer
        ]
        with tf.control_dependencies(init_actions):
            self.initializer = tf.no_op('Initializer')  # See below

        input_raw = input_iterator.get_next()
        output_raw = output_iterator.get_next()

    # Create action that retrieves data
    # Do this outside of the name scope so that people see the nodes
    # nicely next to the data-source node coming out of it
    # The nodes are each control-dependency linked to each other's sources so that the
    # data-sets always step forward together, even if only one is used.
    with tf.control_dependencies([input_raw, output_raw]):
        self.data = (tf.identity(input_raw, name=name + '_input'),
                     tf.identity(output_raw, name=name + '_output'))
def get_lang_tokenize(texts):
    lang_tokenizer = text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(texts)
    return lang_tokenizer
from df.loader import load_polish_train, load_forum_hate_speech

# xlsx from https://github.com/ybalcanci/Hate-Speech-Detector
df = load_polish_train()
df = df.append(load_forum_hate_speech(), ignore_index=True)

from preprocessing.pl.text_preprocessing import TextPreprocessor

TextPreprocessor().clean_data_frame(df, lemmatize=True)

train_posts = df['tweet']
train_tags = df['label']

max_words = 1000

# keras.preprocessing
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts)  # only fit on train

with open(r"..\..\model\pl\rnn_tokenizer", 'wb') as handle:
    pickle.dump(tokenize, handle, protocol=pickle.HIGHEST_PROTOCOL)

x_train = tokenize.texts_to_matrix(train_posts)

encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)

num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)

# Build the model
    i = t.find('\n\n')  # skip header
    if 0 < i:
        t = t[i:]
    texts.append(t)
    labels.append(label_id)

print('Found %s texts.' % len(texts))

# ### Vectorization
#
# Vectorize the text samples into a 2D integer tensor.

MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 1000

tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# ### TF Datasets
#
# Let's now define our TF Datasets
from tensorflow.keras.preprocessing import text, sequence

data_set = pd.read_csv('data_clean.csv')
# print(data_set.head())
data_train = data_set['text'][0:2000].astype(str)
data_labels = data_set['isReal'][0:2000].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(data_train, data_labels, test_size=0.25)

max_features = 1000
maxlen = 300

start = time.time()
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
tokenized_train = tokenizer.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)
tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)
print('keras tokenize time: ', round(time.time() - start, 2), 's')

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau

batch_size = 64