def get_tokenizer_with_missing_words(text, tk_defined):
    rm_chars = '!"#$%&()*+,-.:;=?@[\\]^_`{|}~\t\n'
    missing_words = []
    for word in text.lower().translate(str.maketrans('', '', rm_chars)).split():
        if word not in tk_defined.word_index:
            missing_words.append(word)

    # Create a tokenizer for the missing words
    tk_missing = Tokenizer(filters=rm_chars)
    tk_missing.fit_on_texts(missing_words)

    # Increase all values of a dictionary by a given amount
    def increase_key_value(value, word_index):
        word_index.update(
            {key: word_index[key] + value for key in word_index.keys()})
        return word_index

    # Merge two dictionaries together
    def merge_dictionaries(d1, d2):
        return {**d1, **d2}

    # Shift the missing-word indices so they do not collide with tk_defined's
    tk_missing.word_index = increase_key_value(1379, tk_missing.word_index)

    # Create a tokenizer that is the combination of tk_defined and tk_missing
    tk = Tokenizer(filters=rm_chars)
    tk.word_index = merge_dictionaries(tk_defined.word_index,
                                       tk_missing.word_index)
    return tk

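# Usage sketch for get_tokenizer_with_missing_words. The base corpus and the
# sample text below are illustrative assumptions; the 1379 offset is the one
# hard-coded in the function above.
from keras.preprocessing.text import Tokenizer

tk_defined = Tokenizer()
tk_defined.fit_on_texts(["the cat sat on the mat"])

# "dog" and "ran" are absent from the base vocabulary, so they receive
# indices above the offset (1380 and 1381).
tk = get_tokenizer_with_missing_words("The dog ran on the mat!", tk_defined)
print(tk.word_index)
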
def predict_one_sentence(self, sentence):
    self.__setup_model()
    self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/en_word_index.npy')
    self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/de_word_index.npy')
    en_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_EN'])
    en_tokenizer.word_index = self.en_word_index
    en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3
    de_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_DE'])
    de_tokenizer.word_index = self.de_word_index
    de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

    print(sentence)
    sentence = en_tokenizer.texts_to_sequences([sentence], search_related_word=True)
    print(sentence)
    sentence = pad_sequences(sentence, maxlen=self.params['MAX_SEQ_LEN'],
                             padding='post', truncating='post')
    sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])
    print(sentence)

    prediction = self.M.predict(sentence)

    predicted_sentence = ""
    reverse_word_index = dict((i, word) for word, i in self.de_word_index.items())
    for sentence in prediction:
        for token in sentence:
            max_idx = np.argmax(token)
            if max_idx == 0:
                print("id of max token = 0")
                print("second best prediction is ",
                      reverse_word_index[np.argmax(np.delete(token, max_idx))])
            else:
                next_word = reverse_word_index[max_idx]
                if next_word == self.END_TOKEN:
                    break
                elif next_word == self.START_TOKEN:
                    continue
                predicted_sentence += next_word + " "
    return predicted_sentence

def shuffleData(datas, labels, tokenflag=True):
    if tokenflag:
        tokenizer = Tokenizer(num_words=num_words + 1, oov_token='UNK')
        tokenizer.fit_on_texts(datas)
        # Keep only the num_words most frequent words; UNK stays at index 1.
        tokenizer.word_index = {
            e: i for e, i in tokenizer.word_index.items() if i <= num_words
        }
        tokenizer.word_index[tokenizer.oov_token] = 1
        print(len(tokenizer.word_index))
        joblib.dump(tokenizer, 'dataFile.pkl')
    tokenizer = joblib.load('dataFile.pkl')
    all_text_seq = tokenizer.texts_to_sequences(datas)
    all_text_test = pad_sequences(all_text_seq, maxlen=sequence_length,
                                  padding='pre', value=0)

    np.random.seed(100)
    shuffle_indices = np.random.permutation(np.arange(len(labels)))
    x_shuffled = np.array(all_text_test)[shuffle_indices.astype(int)]
    y_shuffled = np.array(labels)[shuffle_indices.astype(int)]

    # Split train/test set
    dev_sample_index = -1 * int(dev_sample_percentage * len(labels))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del datas, labels, x_shuffled, y_shuffled, all_text_test
    return x_train, x_dev, y_train, y_dev, tokenizer

def get_multilabel_train_data(input_length, path="train_data_after_cut.xlsx"):
    df = pd.read_excel(path, encoding="utf-8")
    content = df["content"]
    filters = '!?"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n\r!@#¥%…&*():“”’‘;《》?,。'
    tokenizer = Tokenizer(filters=filters, lower=True, split=" ", oov_token="UNK")
    if os.path.exists("vocab.json"):
        with open("vocab.json", encoding="utf-8") as f:
            vocab = json.load(f)
        tokenizer.word_index = vocab
    else:
        tokenizer.fit_on_texts(content)
        vocab = tokenizer.word_index
        with open("vocab.json", encoding="utf-8", mode="w") as f:
            json.dump(vocab, f)
    content_list_seq = tokenizer.texts_to_sequences(content)
    # print(sum(len(c) for c in content_list_seq) / len(content_list_seq))
    content_list_seq_pad = pad_sequences(content_list_seq, maxlen=input_length)
    return df, content_list_seq_pad, 4, len(vocab)

def main():
    ### read training and testing data
    tag_list = pickle.load(open("label_mapping.p", "rb"))
    (_, X_test, _) = read_data(test_path, False)

    ### tokenizer for all data
    tokenizer = Tokenizer()
    word_index = pickle.load(open("word_index.p", "rb"))
    tokenizer.word_index = word_index

    ### convert word sequences to index sequences
    test_sequences = tokenizer.texts_to_sequences(X_test)

    ### padding to equal length
    test_sequences = pad_sequences(test_sequences, maxlen=306)

    ### load the trained model and predict
    model = load_model('best_model.hdf5', custom_objects={'f1_score': f1_score})
    Y_pred = model.predict(test_sequences)
    thresh = 0.4
    with open(output_path, 'w') as output:
        print('"id","tags"', file=output)
        Y_pred_thresh = (Y_pred > thresh).astype('int')
        for index, labels in enumerate(Y_pred_thresh):
            labels = [tag_list[i] for i, value in enumerate(labels) if value == 1]
            labels_original = ' '.join(labels)
            print('"%d","%s"' % (index, labels_original), file=output)

def calculate_hiddenstate_after_encoder(self, sentence):
    self.__setup_model()

    tokenizer = Tokenizer()
    self.word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/word_index.npy')
    self.word_index = self.word_index.item()
    tokenizer.word_index = self.word_index
    self.num_words = self.params['MAX_WORDS'] + 3
    tokenizer.num_words = self.num_words

    # The special tokens must be present in the loaded word index.
    try:
        self.word_index[self.START_TOKEN]
        self.word_index[self.END_TOKEN]
        self.word_index[self.UNK_TOKEN]
    except KeyError as e:
        print("special token missing from word_index:", e)
        exit()

    sentence = tokenizer.texts_to_sequences([sentence])
    sentence = ([self.word_index[self.START_TOKEN]] + sentence[0] +
                [self.word_index[self.END_TOKEN]])
    sentence = pad_sequences([sentence], maxlen=self.params['max_seq_length'],
                             padding='post')
    sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])

    encoder_name = 'encoder'
    encoder = Model(inputs=self.M.input,
                    outputs=self.M.get_layer(encoder_name).output)
    prediction = encoder.predict(sentence, batch_size=1)
    print(prediction.shape)
    return prediction

def tokenize_texts(self, corpus=None):
    print('[Preprocess] tokenize texts', flush=True)
    if corpus is None:
        corpus = self.corpus
    filters = '!"$%&()*+,-./:;<=>?@[\\]^_`{|}~'
    tokenizer = Tokenizer(filters=filters)

    wi_path = self.model_dir + 'word_index.json'
    if not osp.exists(wi_path):
        print('[Preprocess] construct word index', flush=True)
        tokenizer.fit_on_texts(corpus)
        word_index = tokenizer.word_index
        with open(wi_path, 'w') as f:
            print('[Preprocess] save word index: ' + wi_path, flush=True)
            json.dump(word_index, f)
    else:
        with open(wi_path, 'r') as f:
            print('[Preprocess] load word index: ' + wi_path, flush=True)
            word_index = json.load(f)
        tokenizer.word_index = word_index

    train_Xi = tokenizer.texts_to_sequences(self.train_X)
    test_Xi = tokenizer.texts_to_sequences(self.test_X)
    self.train_Xi = pad_sequences(train_Xi)
    self.maxlen = self.train_Xi.shape[1]
    self.test_Xi = pad_sequences(test_Xi, maxlen=self.maxlen)
    self.word_index = word_index

def main():
    print('==================================================================')
    print('Read test data and categories.')
    test_text = read_test(argv[1])
    categories = np.load('categories.npy')

    print('==================================================================')
    print('Load tokenizer.')
    tokenizer = Tokenizer()
    tokenizer.word_index = np.load(argv[3][:-3] + '_word_index.npy').item()
    test_sequences = tokenizer.texts_to_sequences(test_text)
    test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LEN)
    print('Shape of test data:', test_data.shape)

    print('==================================================================')
    print('Load model.')
    model = load_model(argv[3], custom_objects={'f1_score': f1_score})
    model.summary()
    print('Predict.')
    result = model.predict(test_data, verbose=1)

    print('==================================================================')
    print('Output result. threshold: %f' % THRESHOLD)
    output_result(argv[2], result, categories)

def tokenize_sequence(sentences, filters, max_num_words, word_index):
    """Tokenizes a given input sequence of words.

    Args:
        sentences: list of sentences
        filters: list of filters/punctuations to omit (for the Keras tokenizer)
        max_num_words: number of words to keep in the fixed-length sequence
        word_index: dictionary storing the word-to-index correspondence

    Returns:
        x: list of padded/truncated index sequences created from `sentences`
    """
    sentences = [' '.join(word_tokenize(s)[:max_num_words]) for s in sentences]

    tokenizer = Tokenizer(filters=filters, oov_token=True)
    tokenizer.word_index = word_index

    x = tokenizer.texts_to_sequences(list(sentences))
    # Out-of-vocabulary words come back as None; map them to the UNK index,
    # and terminate every sequence with EOS.
    for i, seq in enumerate(x):
        if any(t is None for t in seq):
            seq = [t if t is not None else word_index['UNK'] for t in seq]
        seq.append(word_index['EOS'])
        x[i] = seq

    x = pad_sequences(x, padding='post', truncating='post',
                      maxlen=max_num_words, value=word_index['PAD'])
    return x

def encode_textdata(df_X_text, tokenizer, mode, max_words, maxlen):
    ## encode text columns; encoded text features should not be normalized.
    print('Starting to encode text inputs...')
    texts = df_X_text.iloc[:, 0].values.astype('U')
    print('Found %s texts.' % len(texts))

    if mode == 'tfidf':
        if tokenizer is None:
            tokenizer = Tokenizer(num_words=max_words)
            tokenizer.fit_on_texts(texts)
        X_text = tokenizer.texts_to_matrix(texts, mode='tfidf')
        print('tfidf X_text shape: {}'.format(X_text.shape))
    elif mode == 'glove':
        # vectorize the text samples into a 2D integer tensor
        if tokenizer is None:
            tokenizer = Tokenizer(num_words=max_words, oov_token='<UNK>')
            tokenizer.fit_on_texts(texts)
            # keep only the max_words most frequent words
            tokenizer.word_index = {e: i for e, i in tokenizer.word_index.items()
                                    if i <= max_words}
            # tokenizer.word_index[tokenizer.oov_token] = max_words + 1
        sequences = tokenizer.texts_to_sequences(texts)
        word_index = tokenizer.word_index
        print('Found %s unique tokens.' % len(word_index))
        X_text = pad_sequences(sequences, maxlen=maxlen, padding='post')
    else:
        raise ValueError('Unknown text processing mode: {}'.format(mode))

    return X_text, tokenizer  ### need to save embedding_matrix as well

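# Usage sketch for encode_textdata, assuming a single-column text DataFrame;
# the column name and size limits are illustrative assumptions. Passing the
# fitted tokenizer back in keeps train and test indices consistent.
import pandas as pd

df_train = pd.DataFrame({'text': ['good movie', 'bad movie']})
df_test = pd.DataFrame({'text': ['great movie']})

# The first call fits a fresh tokenizer; the second reuses it unchanged,
# so the unseen word "great" maps to the '<UNK>' index.
X_train, tok = encode_textdata(df_train, None, 'glove', max_words=1000, maxlen=20)
X_test, _ = encode_textdata(df_test, tok, 'glove', max_words=1000, maxlen=20)
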
def main():
    config = Config()
    char_pred_test_start, char_pred_test_end = create_input_data(config.models)

    df_test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv').fillna('')
    df_test['selected_text'] = ''

    tokenizer = Tokenizer(num_words=None, char_level=True, oov_token='UNK',
                          lower=True)
    tokenizer.word_index = VOCAB
    len_voc = len(tokenizer.word_index) + 1

    X_test = tokenizer.texts_to_sequences(df_test['text'].values)

    test_dataset = TweetCharDataset(df_test, X_test,
                                    char_pred_test_start, char_pred_test_end,
                                    max_len=config.max_len_val,
                                    train=False,
                                    n_models=config.n_models)

    pred_tests = k_fold_inference(config, test_dataset, len_voc, seed=42)
    np.save(f"preds_char_test_{config.model_name}.npy", np.array(pred_tests))

def predict(corpus_path, model_dir, embeddings):
    with open('word_index.pickle', 'rb') as fin:
        word_index = pickle.load(fin)
    with open('embedding_matrix.pickle', 'rb') as fin:
        embedding_matrix = pickle.load(fin)

    tweets, labels = get_data(corpus_path)
    tokenizer = Tokenizer()
    tokenizer.word_index = word_index
    sequences = tokenizer.texts_to_sequences(tweets)
    sequences = pad_sequences(sequences, maxlen=50)

    adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                                 epsilon=10e-9, decay=0.0, amsgrad=False)
    model = get_model(model_dir, corpus_path, embedding_matrix)

    # Derive the weights-file search key from the corpus file name.
    corpus_file = os.path.basename(corpus_path)
    weights_paths = os.listdir(model_dir)
    search = re.sub('2018-', '', corpus_file)
    search = re.sub(r'\.txt', '', search)
    search = re.sub('-dev', '', search)
    #print(search)
    weights_file = [match for match in weights_paths if search in match][0]
    weights_path = os.path.join(model_dir, weights_file)
    #weights_path = os.path.join(model_dir, corpus_file.split('.')[0] + '.hdf5')
    model.load_weights(weights_path)
    model.compile(loss='mean_squared_error', optimizer=adam)

    predictions = model.predict(sequences)
    #print(pearsonr(labels.reshape(-1, 1), predictions))
    return predictions

def dataset_preparation(data, num_words=None):
    """Prepare the corpus text for training.

    Expects end tokens, start tokens, and return tokens to have
    already been added.

    data -- corpus of text
    num_words -- maximum number of words for the model to keep
    """
    # The tokenizer needs an explicit OOV token so the vocabulary-capping
    # step below has a valid key to assign (the default oov_token is None).
    tokenizer = Tokenizer(oov_token='<unk>')

    # Want the system to have a way to end a poem, so add a second end token.
    corpus = data.lower().replace('<endtoken>',
                                  '<endtoken2><endtoken>').split("<endtoken>")
    tokenizer.fit_on_texts(corpus)

    ### Generate list of words:
    # If a max number of words is given, find and use just those words:
    if num_words is not None:
        tokenizer.word_index = {
            e: i for e, i in tokenizer.word_index.items() if i <= num_words
        }
        # Make sure to still have an out-of-vocabulary token:
        tokenizer.word_index[tokenizer.oov_token] = num_words + 1
    total_words = len(tokenizer.word_index) + 1

    ### Generate list of input sequences (n-gram prefixes of each line)
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            input_sequences.append(n_gram_sequence)

    max_sequence_len = max(len(x) for x in input_sequences)
    input_sequences = np.array(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=total_words)

    return tokenizer, predictors, label, max_sequence_len, total_words

def __prepareFeatures(self, dataset: str, importIndexes=False):
    sources, languages = self.extractSources(dataset)

    # configs
    max_features: int = self.config['max_features']
    max_len_sequences: int = self.config['max_len_sequences']

    tokenizer = Tokenizer(num_words=max_features,
                          filters=TOKENIZER_CONFIG['filter'],
                          oov_token='UNKNOWN')

    # tokenization
    if not importIndexes:
        tokenizer.fit_on_texts(sources)
        # export vocabulary
        self.exportVocabulary(tokenizer.word_index)
    else:
        # import vocabulary
        tokenizer.word_index = self.importVocabulary()

    # X + Y
    X = tokenizer.texts_to_sequences(sources)
    X = pad_sequences(X, maxlen=max_len_sequences)
    Y = languages

    return np.array(X), np.array(Y)

def main():
    print('==================================================================')
    print('Read test data and categories.')
    test_text = read_test(argv[1])
    categories = np.load('categories.npy')

    print('==================================================================')
    print('Predict')
    model_list = read_model_list('model_list.txt')
    nb_models = len(model_list)
    print('Total models: %d' % nb_models)

    sum_result = np.zeros([len(test_text), len(categories)])
    sum_weight = 0
    for (weight, name) in model_list:
        print('model: ' + name + ', weight: %f' % weight)
        tokenizer = Tokenizer()
        tokenizer.word_index = np.load(name[:-3] + '_word_index.npy').item()
        test_sequences = tokenizer.texts_to_sequences(test_text)
        test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LEN)
        model = load_model(name, custom_objects={'f1_score': f1_score})
        # Weight each model's prediction before accumulating, so the
        # division by sum_weight below yields a weighted average.
        sum_result += weight * model.predict(test_data, verbose=0)
        sum_weight += weight
    sum_result /= sum_weight

    print('==================================================================')
    print('Output result. threshold: %f' % THRESHOLD)
    output_result(argv[2], sum_result, categories, THRESHOLD)

def get_dict(sentences, filters, max_num_words, max_vocab_size):
    sentences = [' '.join(word_tokenize(s)[:max_num_words]) for s in sentences]

    tokenizer = Tokenizer(filters=filters)
    tokenizer.fit_on_texts(sentences)

    # Reserve indices 0-3 for the special tokens, then shift the rest.
    word_index = dict()
    word_index['PAD'] = 0
    word_index['UNK'] = 1
    word_index['GO'] = 2
    word_index['EOS'] = 3
    for i, word in enumerate(dict(tokenizer.word_index).keys()):
        word_index[word] = i + 4
    tokenizer.word_index = word_index

    x = tokenizer.texts_to_sequences(list(sentences))
    for i, seq in enumerate(x):
        if any(t >= max_vocab_size for t in seq):
            seq = [t if t < max_vocab_size else word_index['UNK'] for t in seq]
        seq.append(word_index['EOS'])
        x[i] = seq
    x = pad_sequences(x, padding='post', truncating='post',
                      maxlen=max_num_words, value=word_index['PAD'])

    # Cap the vocabulary before returning it.
    word_index = {k: v for k, v in word_index.items() if v < max_vocab_size}
    return word_index

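# get_dict builds the vocabulary that tokenize_sequence (defined earlier)
# consumes; a combined sketch with illustrative sentences and size limits.
# It relies on the keras_preprocessing behavior where a lookup miss under an
# unrecognized oov_token yields None entries, which tokenize_sequence then
# remaps to UNK.
sentences = ["hello world", "hello there world"]
filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

# Build a capped vocabulary with PAD/UNK/GO/EOS reserved at indices 0-3...
word_index = get_dict(sentences, filters, max_num_words=10, max_vocab_size=20)

# ...then encode new text against it: "unseen" becomes UNK, and every row is
# EOS-terminated and PAD-padded to max_num_words.
x = tokenize_sequence(["hello unseen world"], filters, 10, word_index)
print(x)
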
def tokenizer_from_json(json_string):
    """Parses a JSON tokenizer configuration file and
    returns a tokenizer instance.

    # Arguments
        json_string: JSON string encoding a tokenizer configuration.

    # Returns
        A Keras Tokenizer instance
    """
    tokenizer_config = json.loads(json_string)
    config = tokenizer_config.get('config')

    word_counts = json.loads(config.pop('word_counts'))
    word_docs = json.loads(config.pop('word_docs'))
    index_docs = json.loads(config.pop('index_docs'))
    # Integer indexing gets converted to strings with json.dumps()
    index_docs = {int(k): v for k, v in index_docs.items()}
    index_word = json.loads(config.pop('index_word'))
    index_word = {int(k): v for k, v in index_word.items()}
    word_index = json.loads(config.pop('word_index'))

    tokenizer = Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs = word_docs
    tokenizer.index_docs = index_docs
    tokenizer.word_index = word_index
    tokenizer.index_word = index_word

    return tokenizer

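# tokenizer_from_json mirrors Tokenizer.to_json() from keras_preprocessing,
# so a round trip should reproduce the tokenizer's state (assumes a Keras
# version that provides to_json; the sample text is illustrative).
from keras.preprocessing.text import Tokenizer

original = Tokenizer(num_words=100, oov_token='<unk>')
original.fit_on_texts(["to be or not to be"])

restored = tokenizer_from_json(original.to_json())
assert restored.word_index == original.word_index
assert (restored.texts_to_sequences(["not to be"]) ==
        original.texts_to_sequences(["not to be"]))
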
def main():
    (_, X_test, _) = read_data(sys.argv[1], False)

    tokenizer = Tokenizer()
    tokenizer.word_index = pickle.load(open('bow_word_index.pickle', 'rb'))
    test_bag = tokenizer.texts_to_matrix(X_test, 'count')

    model = Sequential()
    model.add(Dense(512, activation='relu', input_dim=51867))
    model.add(Dropout(0.5))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.6))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.8))
    model.add(Dense(38, activation='sigmoid'))
    model.load_weights('bow.hdf5')

    Y_pred = model.predict(test_bag)
    thresh = 0.4
    tag_list = pickle.load(open('label_mapping.pickle', 'rb'))
    with open(sys.argv[2], 'w') as output:
        print('"id","tags"', file=output)
        Y_pred_thresh = (Y_pred > thresh).astype('int')
        for index, labels in enumerate(Y_pred_thresh):
            labels = [tag_list[i] for i, value in enumerate(labels) if value == 1]
            labels_original = ' '.join(labels)
            print('"%d","%s"' % (index, labels_original), file=output)

def generate_sequences_from_texts(texts, indices_list, textgen, batch_size=128):
    """Generates sequences from the given texts based on
    the selected configuration."""
    is_words = textgen.config['word_level']
    is_single = textgen.config['single_text']
    max_length = textgen.config['max_length']
    meta_token = textgen.META_TOKEN

    if is_words:
        new_tokenizer = Tokenizer(filters='', char_level=True)
        new_tokenizer.word_index = textgen.vocab
    else:
        new_tokenizer = textgen.tokenizer

    while True:
        np.random.shuffle(indices_list)

        X_batch = []
        Y_batch = []
        count_batch = 0

        for row in range(indices_list.shape[0]):
            text_index = indices_list[row, 0]
            end_index = indices_list[row, 1]

            text = texts[text_index]

            if not is_single:
                text = [meta_token] + list(text) + [meta_token]

            if end_index > max_length:
                x = text[end_index - max_length:end_index + 1]
            else:
                x = text[0:end_index + 1]
            y = text[end_index + 1]

            if y in textgen.vocab:
                x = process_sequence([x], textgen, new_tokenizer)
                y = text_generation_encode_cat([y], textgen.vocab)

                X_batch.append(x)
                Y_batch.append(y)

                count_batch += 1
                if count_batch % batch_size == 0:
                    X_batch = np.squeeze(np.array(X_batch))
                    Y_batch = np.squeeze(np.array(Y_batch))

                    yield (X_batch, Y_batch)
                    X_batch = []
                    Y_batch = []
                    count_batch = 0

def predict_batch(self, sentences):
    self.__setup_model()

    tokenizer = Tokenizer()
    self.word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/word_index.npy')
    self.word_index = self.word_index.item()
    tokenizer.word_index = self.word_index
    self.num_words = self.params['MAX_WORDS'] + 3
    tokenizer.num_words = self.num_words

    # The special tokens must be present in the loaded word index.
    try:
        self.word_index[self.START_TOKEN]
        self.word_index[self.END_TOKEN]
        self.word_index[self.UNK_TOKEN]
    except KeyError as e:
        print("special token missing from word_index:", e)
        exit()

    sentences = tokenizer.texts_to_sequences(sentences)
    mod_sentences = []
    for sentence in sentences:
        mod_sentences.append([self.word_index[self.START_TOKEN]] + sentence +
                             [self.word_index[self.END_TOKEN]])
    sentences = pad_sequences(mod_sentences,
                              maxlen=self.params['max_seq_length'],
                              padding='post')
    sentences = sentences.reshape(sentences.shape[0], sentences.shape[1])

    batch_size = sentences.shape[0]
    if batch_size > 10:
        batch_size = 10
    reverse_word_index = dict((i, word) for word, i in self.word_index.items())

    predicted_sentences = []
    from_idx = 0
    to_idx = batch_size
    while True:
        print("from_idx, to_idx, hm_sentences", from_idx, to_idx,
              sentences.shape[0])
        current_batch = sentences[from_idx:to_idx]
        prediction = self.M.predict(current_batch, batch_size=batch_size)

        for sentence in prediction:
            predicted_sent = ""
            for token in sentence:
                max_idx = np.argmax(token)
                if max_idx == 0:
                    print("id of max token = 0")
                    print("second best prediction is ",
                          reverse_word_index[np.argmax(np.delete(token, max_idx))])
                else:
                    next_word = reverse_word_index[max_idx]
                    if next_word == self.END_TOKEN:
                        break
                    elif next_word == self.START_TOKEN:
                        continue
                    predicted_sent += next_word + " "
            predicted_sentences.append(predicted_sent)

        from_idx += batch_size
        to_idx += batch_size
        if to_idx > sentences.shape[0]:
            # TODO: also predict the remainder when the sentence count
            # is not a multiple of batch_size
            break
    return predicted_sentences

def get_padded_dataset(dataset):
    labels = [x['label'] for x in dataset]
    data = [x['sentence'] for x in dataset]

    # Preprocessing text
    tokenizer = Tokenizer()
    tokenizer.word_index = word_index
    data_seqs = tokenizer.texts_to_sequences(data)
    data_seqs_padded = pad_sequences(data_seqs, maxlen=MAX_SEQUENCE_LENGTH)
    labels = np.array(labels)

    return (data_seqs_padded, labels)

def prepare_tokenizer():
    # Char-level tokenizer with an explicit OOV token (the default
    # oov_token is None, which would break the assignment below).
    tk = Tokenizer(char_level=True, oov_token='UNK')
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    char_dict = {}
    for i, char in enumerate(alphabet):
        char_dict[char] = i + 1

    tk.word_index = char_dict.copy()
    # Map the OOV token to one index past the alphabet.
    tk.word_index[tk.oov_token] = max(char_dict.values()) + 1
    return tk

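# Quick check of the character tokenizer above; the expected indices were
# read off the alphabet string ('h'=8, 'e'=5, 'l'=12, 'o'=15, '!'=40), and
# characters outside the alphabet map to the OOV index (69).
tk = prepare_tokenizer()
print(tk.texts_to_sequences(["Hello!"]))  # [[8, 5, 12, 12, 15, 40]]
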
def tokenize(sentence, dictionary):
    num_words = len(dictionary)
    tokenizer = Tokenizer(num_words,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~ 1234567890')
    tokenizer.word_index = dictionary
    max_words = 35

    # sentence_vec = text_to_word_sequence(sentence)
    sentence_vec = tokenizer.texts_to_sequences([sentence])
    print(sentence_vec)
    sentence_vec = sequence.pad_sequences(sentence_vec, maxlen=max_words)
    return sentence_vec

def tokenize(dic, data):
    # create a tokenizer and feed in the word index
    t = Tokenizer(num_words=None, lower=True, split=' ')
    t.word_index = dic

    # convert words from each call transcription into an index array
    allWords = []
    transcriptions = data['Words']
    for text in transcriptions:
        words = convert_text_to_index_array(text, dic)
        allWords.append(words)

    # convert the index arrays into a binary matrix and return it
    return t.sequences_to_matrix(allWords, mode='binary')

def calculate_hiddenstate_after_encoder(self, sentence):
    self.__setup_model()
    self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/en_word_index.npy')
    self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/de_word_index.npy')
    en_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_EN'])
    en_tokenizer.word_index = self.en_word_index
    en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3
    de_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_DE'])
    de_tokenizer.word_index = self.de_word_index
    de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

    print(sentence)
    sentence = en_tokenizer.texts_to_sequences([sentence])
    print(sentence)
    sentence = pad_sequences(sentence, maxlen=self.params['MAX_SEQ_LEN'],
                             padding='post', truncating='post')
    sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])
    print(sentence)

    encoder_name = 'encoder'
    encoder = Model(inputs=self.M.input,
                    outputs=self.M.get_layer(encoder_name).output)
    prediction = encoder.predict(sentence, batch_size=1)
    print(prediction.shape)
    return prediction

def train(corpus_file, model_dir, embeddings, affect_lexicon, fresh_run=False,
          data_dir=None, epochs=3, batch_size=64, val_split=0.2):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    if fresh_run:
        word_index = prepare_word_index(data_dir)
        embedding_index = prepare_embedding_index(embeddings)
        embedding_matrix = prepare_embedding_matrix(word_index, embedding_index)
        affect_index = prepare_affect_index(affect_lexicon)
        affect_matrix = prepare_affect_matrix(word_index, affect_index)
    else:
        with open('word_index.pickle', 'rb') as fin:
            word_index = pickle.load(fin)
        with open('embedding_index.pickle', 'rb') as fin:
            embedding_index = pickle.load(fin)
        with open('embedding_matrix.pickle', 'rb') as fin:
            embedding_matrix = pickle.load(fin)
        with open('affect_index.pickle', 'rb') as fin:
            affect_index = pickle.load(fin)
        with open('affect_matrix.pickle', 'rb') as fin:
            affect_matrix = pickle.load(fin)

    tweets, labels = get_data(corpus_file)
    print(tweets[:5])
    print(labels[:5])

    tokenizer = Tokenizer()
    tokenizer.word_index = word_index
    sequences = tokenizer.texts_to_sequences(tweets)
    sequences = pad_sequences(sequences, maxlen=50)

    x_train, x_val, y_train, y_val = train_test_split(
        sequences, labels, test_size=val_split, random_state=42)

    scores, models = grid_search(x_train, y_train, x_val, y_val, architectures,
                                 param_grid, 5, corpus_file, embedding_matrix,
                                 model_dir)
    with open('history.pickle', 'wb') as fout:
        pickle.dump((scores, models), fout)

def kerasTokenizer(balanced_texts, max_sentence_length, topbestwords):
    global vector_dim
    vector_dim = max_sentence_length
    global top_words
    top_words = topbestwords

    tokenizer = Tokenizer(num_words=topbestwords)
    tokenizer.fit_on_texts(balanced_texts)
    sequences = tokenizer.texts_to_sequences(balanced_texts)
    data = pad_sequences(sequences, maxlen=max_sentence_length, padding='pre')
    # print(data[:2])

    # Sort the word index by index so iteration order is deterministic.
    tokenizer.word_index = OrderedDict(
        sorted(tokenizer.word_index.items(), key=lambda t: t[1]))
    return data, tokenizer.word_index

def main():
    ### read training and testing data
    tag_list = pickle.load(open(tags_path, 'rb'))
    (_, X_test, _) = read_data(test_path, False)
    all_corpus = pickle.load(open(corpus_path, 'rb'))
    print('Found %d articles.' % (len(all_corpus)))

    ### tokenizer for all data
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_corpus)
    word_index = pickle.load(open(wIndex_path, 'rb'))
    tokenizer.word_index = word_index

    ### convert word sequences to index sequences
    print('Convert to index sequences.')
    #train_sequences = tokenizer.texts_to_matrix(X_data, mode='tfidf')
    test_sequences = tokenizer.texts_to_matrix(X_test, mode='tfidf')

    ### padding to equal length
    print('Padding sequences.')
    #train_sequences = pad_sequences(train_sequences)
    #max_article_length = train_sequences.shape[1]
    test_sequences = pad_sequences(test_sequences, maxlen=51867)

    ### build model
    print('Building model.')
    model = Sequential()
    model.add(Dense(input_dim=51867, units=480, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(38, activation='sigmoid'))
    model.summary()
    model.load_weights(weight_path)

    Y_pred = model.predict(test_sequences)
    thresh = threshold
    with open(output_path, 'w') as output:
        print('"id","tags"', file=output)
        Y_pred_thresh = (Y_pred > thresh).astype('int')
        for index, labels in enumerate(Y_pred_thresh):
            labels = [tag_list[i] for i, value in enumerate(labels) if value == 1]
            labels_original = ' '.join(labels)
            print('"%d","%s"' % (index, labels_original), file=output)

def load_tokenizer_from_file(filename):
    tokenizer = Tokenizer()
    with open(filename, 'r') as infile:
        tokenizer_data = json.load(infile)
        tokenizer.word_counts = OrderedDict(tokenizer_data['word_counts'])
        tokenizer.word_docs = tokenizer_data['word_docs']
        tokenizer.word_index = tokenizer_data['word_index']
        tokenizer.document_count = tokenizer_data['document_count']
        # JSON stores integer keys as strings; convert them back.
        tokenizer.index_docs = {int(k): v
                                for k, v in tokenizer_data['index_docs'].items()}
    return tokenizer

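# The loader above implies a writer that stores the same five fields.
# A minimal sketch; the function name and on-disk layout are assumptions
# inferred from load_tokenizer_from_file, not taken from the original code.
def save_tokenizer_to_file(tokenizer, filename):
    tokenizer_data = {
        # Stored as a list of pairs so OrderedDict can restore the ordering.
        'word_counts': list(tokenizer.word_counts.items()),
        'word_docs': dict(tokenizer.word_docs),
        'word_index': tokenizer.word_index,
        'document_count': tokenizer.document_count,
        'index_docs': dict(tokenizer.index_docs),
    }
    with open(filename, 'w') as outfile:
        json.dump(tokenizer_data, outfile)
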
def structure_data(path='agnews_data'):
    texts, labels = create_dataset(path)

    tok = Tokenizer(char_level=True, split='')
    tok.fit_on_texts(texts)
    # Replace the fitted vocabulary with the predefined character index.
    tok.word_index = char_index

    sequences = tok.texts_to_sequences(texts)
    padding = pad_sequences(sequences, maxlen=1014, padding='post')
    padding = np.array(padding)
    labels = to_categorical(labels)

    print('Annotations done and data is ready to be fed to the network')
    return padding, labels

def generate_sequences_from_texts(texts, indices_list, textgenrnn,
                                  context_labels, batch_size=128):
    is_words = textgenrnn.config['word_level']
    is_single = textgenrnn.config['single_text']
    max_length = textgenrnn.config['max_length']
    meta_token = textgenrnn.META_TOKEN

    if is_words:
        new_tokenizer = Tokenizer(filters='', char_level=True)
        new_tokenizer.word_index = textgenrnn.vocab
    else:
        new_tokenizer = textgenrnn.tokenizer

    while True:
        np.random.shuffle(indices_list)

        X_batch = []
        Y_batch = []
        context_batch = []
        count_batch = 0

        for row in range(indices_list.shape[0]):
            text_index = indices_list[row, 0]
            end_index = indices_list[row, 1]

            text = texts[text_index]

            if not is_single:
                text = [meta_token] + list(text) + [meta_token]

            if end_index > max_length:
                x = text[end_index - max_length:end_index + 1]
            else:
                x = text[0:end_index + 1]
            y = text[end_index + 1]

            if y in textgenrnn.vocab:
                x = process_sequence([x], textgenrnn, new_tokenizer)
                y = textgenrnn_encode_cat([y], textgenrnn.vocab)

                X_batch.append(x)
                Y_batch.append(y)

                if context_labels is not None:
                    context_batch.append(context_labels[text_index])

                count_batch += 1
                if count_batch % batch_size == 0:
                    X_batch = np.squeeze(np.array(X_batch))
                    Y_batch = np.squeeze(np.array(Y_batch))
                    context_batch = np.squeeze(np.array(context_batch))

                    # print(X_batch.shape)

                    if context_labels is not None:
                        yield ([X_batch, context_batch], [Y_batch, Y_batch])
                    else:
                        yield (X_batch, Y_batch)
                    X_batch = []
                    Y_batch = []
                    context_batch = []
                    count_batch = 0
