def parse_camera_lines(lines):
    """Reads a camera file, returning a single ViewSequence (without images).

    Args:
        lines: [N] string tensor of camera lines

    Returns:
        The corresponding length-N sequence, as a ViewSequence.
    """
    # The first line contains the YouTube video URL.
    # Format of each subsequent line: timestamp fx fy px py k1 k2 row0 row1  row2
    # Column number:                  0         1  2  3  4  5  6  7-10 11-14 15-18
    youtube_url = lines[0]
    record_defaults = [['']] + [[0.0]] * 18
    data = tf.decode_csv(lines[1:], record_defaults, field_delim=' ')

    with tf.control_dependencies([
        # We don't accept non-zero k1 and k2 (columns 5 and 6).
        tf.assert_equal(data[5:7], 0.0)
    ]):
        timestamps = data[0]
        intrinsics = tf.stack(data[1:5], axis=1)
        poses = utils.build_matrix([data[7:11], data[11:15], data[15:19]])

    # No image data yet. Ideally we'd put "None" for image, but the dataset
    # API doesn't allow that, so we use zeros instead.
    images = tf.zeros_like(timestamps, dtype=tf.float32)

    # In camera files, the video id is the last part of the YouTube URL: it comes
    # after the "=". It seems hacky to use decode_csv, but it's easier than
    # string_split, which returns a sparse tensor.
    youtube_id = tf.decode_csv([youtube_url], [[''], ['']], field_delim='=')[1][0]
    return RealEstateViewSequence(youtube_id, timestamps, intrinsics, poses,
                                  images)
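# A minimal, hypothetical sketch of the `utils.build_matrix` helper assumed above.
# It is NOT the project's actual implementation; it only illustrates the assumed
# behaviour: stacking per-row lists of scalar [N] tensors into [N, rows, cols]
# matrices, so the three 4-element rows above become [N, 3, 4] camera poses.
import tensorflow as tf


def build_matrix_sketch(elements):
    """Stacks a list of rows (each a list of equally shaped tensors) into matrices."""
    rows = [tf.stack(row, axis=-1) for row in elements]  # each row -> [..., cols]
    return tf.stack(rows, axis=-2)                       # -> [..., rows, cols]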
def clip_embedding_matrix(embedding_file, input_files, output_dir,
                          embedding_name):
    vocab_file = os.path.join(output_dir, 'vocab.txt')
    clipped_file = os.path.join(output_dir, embedding_name)

    # Load all input files and build the vocabulary.
    all_texts = load_all_texts(input_files)
    tokenizer = Tokenizer(num_words=None, lower=False)
    tokenizer.fit_on_texts(all_texts)
    logger.info("The size of the vocabulary is {}".format(
        len(tokenizer.word_counts)))

    # Load the word vectors and build the embedding matrix.
    embeddings_index = load_embedding(embedding_file)
    embedding_matrix = build_matrix(embeddings_index, tokenizer.word_index)
    logger.info("The shape of the embedding matrix is {}".format(
        embedding_matrix.shape))

    # Save the embedding matrix.
    np.save(clipped_file, embedding_matrix)

    # Save the vocabulary.
    words = [word + '\n' for word in tokenizer.word_index.keys()]
    with open(vocab_file, 'w', encoding='utf-8') as f:
        f.writelines(words)
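# A minimal, hypothetical sketch of the `build_matrix(embeddings_index, word_index)`
# helper assumed above. It is an illustration under the assumption that the helper
# looks each vocabulary word up in the pretrained embedding dictionary and leaves
# missing words at zero; the embedding dimension of 300 is also an assumption.
import numpy as np


def build_matrix_sketch(embeddings_index, word_index, dim=300):
    matrix = np.zeros((len(word_index) + 1, dim), dtype=np.float32)
    for word, idx in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            matrix[idx] = vector  # rows for out-of-vocabulary words stay all-zero
    return matrix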
def train_model(model, input_train, topic_train, out_train,
                input_val, topic_val, out_val):
    epochs = 14
    batch_size = 521
    model.fit([input_train, topic_train], out_train,
              epochs=epochs,
              batch_size=batch_size,
              verbose=1,
              validation_data=([input_val, topic_val], out_val),
              callbacks=callbacks_list)
    model.summary()
    # model.save('inner_att_bilstm_cos.h5')


if __name__ == "__main__":
    # Input data files are available in the "input/" directory.
    input_path = 'input/'
    sentenceLength = 150

    (input_train, topic_train, out_train,
     input_val, topic_val, out_val) = read_input(input_path)
    (input_train, input_val, topic_train, topic_val,
     word_index) = text_precocess(input_train, input_val, topic_train, topic_val)

    embeddings_index = load_embeddings()
    embedding_matrix = build_matrix(word_index, embeddings_index)

    model = build_model(sentenceLength, word_index, verbose=False, compile=True)
    train_model(model, input_train, topic_train, out_train,
                input_val, topic_val, out_val)

    ### To reuse the model:
    # t_model = load_model('inner_att_bilstm_cos.h5',
    #                      custom_objects={'Attention': Attention})
def train_split_aug():
    df = pd.read_csv('new_processed_data/train_tok.csv')
    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')
    iden_aug_df = pd.read_csv('new_processed_data/train_iden_last.csv')
    toxic_aug_df = pd.read_csv('new_processed_data/train_back_toxic.csv')

    ### Texts
    a_texts = df['comment_text'].values
    i_texts = iden_aug_df['comment_text'].values
    t_texts = toxic_aug_df['comment_text'].values
    texts = np.concatenate([a_texts, i_texts, t_texts])

    ### Labels
    a_label = df['target'].values
    i_label = iden_aug_df['toxic'].values
    t_label = toxic_aug_df['toxic'].values
    labels = np.concatenate([a_label, i_label, t_label])

    ### Auxiliary labels
    a_aux = df[AUX_COLUMNS].values
    i_aux = iden_aug_df[AUX_AUG_COLUMNS].values
    t_aux = toxic_aug_df[AUX_AUG_COLUMNS].values
    aux = np.concatenate([a_aux, i_aux, t_aux])

    ### Identity columns
    val_idts = df[IDENTITY_COLUMNS].fillna(0).values
    a_idts = iden_df[IDENTITY_COLUMNS].fillna(0).values
    i_idts = iden_aug_df[IDENTITY_COLUMNS].fillna(0).values
    t_idts = toxic_aug_df[IDENTITY_COLUMNS].fillna(0).values
    idts = np.concatenate([a_idts, i_idts, t_idts])

    del df, iden_df, iden_aug_df, toxic_aug_df

    tokenizer = text.Tokenizer(filters='', lower=False)
    tokenizer.fit_on_texts(list(texts))
    texts = tokenizer.texts_to_sequences(texts)
    texts = [t[:1024] for t in texts]

    crawl_matrix, unknown_words_crawl = build_matrix(
        tokenizer.word_index, 'embedding/crawl-300d-2M.pkl')
    print('n unknown words (crawl): ', len(unknown_words_crawl))

    glove_matrix, unknown_words_glove = build_matrix(
        tokenizer.word_index, 'embedding/glove.840B.300d.pkl')
    print('n unknown words (glove): ', len(unknown_words_glove))

    max_features = len(tokenizer.word_index) + 1
    print('Vocab Size:', max_features)

    embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)
    print('Embedding shape:', embedding_matrix.shape)

    del crawl_matrix, glove_matrix
    gc.collect()

    import pickle
    pickle.dump(embedding_matrix, open('new_processed_data/aug_emb.pkl', 'wb'))
    pickle.dump(tokenizer.word_index,
                open('new_processed_data/aug_word_index.pkl', 'wb'))
    pickle.dump(texts, open('new_processed_data/aug_texts.pkl', 'wb'))

    train_ind, val_ind = train_test_split(range(len(a_texts)),
                                          random_state=59, test_size=0.055)
    train_texts = [texts[i] for i in train_ind] + texts[len(a_texts):]
    val_texts = [texts[i] for i in val_ind]

    train_labels, val_labels = np.concatenate(
        [labels[train_ind], labels[len(a_texts):]]), labels[val_ind]
    train_aux_labels = np.concatenate([aux[train_ind], aux[len(a_texts):]])
    train_iden, val_iden = np.concatenate(
        [idts[train_ind], idts[len(a_texts):]]), val_idts[val_ind]

    train_weight = get_weights_new_array(train_iden, train_labels)
    lw = 1 / np.mean(train_weight)

    train_gen = GeneralDataGenerator(
        inputs=[train_texts],
        outputs=[train_labels, train_aux_labels],
        sample_weights=[train_weight, np.ones_like(train_weight)],
        batch_size=512)
    val_gen = GeneralPredictGenerator(text=val_texts, batch_size=512)

    model = get_lstm_model(embedding_matrix, len(AUX_COLUMNS))
    opt = Adam(1e-3)
    model.compile(loss='binary_crossentropy', optimizer=opt,
                  loss_weights=[lw, 1.])
    model.summary()

    EMAer = ExponentialMovingAverage(model)
    EMAer.inject()

    logger = KFoldLogger('lstm_dp0.5_ema_aug', val_gen,
                         val_true=val_labels, val_iden=val_iden,
                         patience=10, lr_patience=5)

    model.fit_generator(train_gen.__iter__(), len(train_gen),
                        epochs=15, callbacks=[logger], verbose=1)
def get_test_data(num=100):
    points = np.random.normal(size=(num, 2))
    matrix = build_matrix(points)
    return points, matrix
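# Usage sketch for `get_test_data`, under the assumption that `build_matrix(points)`
# returns a pairwise matrix over the sampled 2-D points (e.g. a distance or affinity
# matrix). `pairwise_distances_sketch` below is a hypothetical stand-in used only to
# illustrate that assumption, not the project's implementation.
import numpy as np


def pairwise_distances_sketch(points):
    diffs = points[:, None, :] - points[None, :, :]  # (num, num, 2)
    return np.linalg.norm(diffs, axis=-1)            # (num, num)

# points, matrix = get_test_data(num=10)  # points: (10, 2); matrix as returned by build_matrix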
]].reset_index(drop=True)
train_raw = mz.pack(train_df, task)
val_raw = mz.pack(val_df, task)
train_processed = preprocessor.transform(train_raw)
val_processed = preprocessor.transform(val_raw)
train_processed.save("5fold/train_processed_{}.dp".format(i))
val_processed.save("5fold/val_processed_{}.dp".format(i))

test_recall[feature] = test_recall[feature] / norm_df
test_recall['feature'] = list(test_recall[feature].values)
test_recall = test_recall[[
    'id_left', 'text_left', 'id_right', 'text_right', 'feature'
]]
test_raw = mz.pack(test_recall, task)
test_processed = preprocessor.transform(test_raw)
# test_processed.save("test_processed.dp")
test_processed.save("final_test_processed.dp")

from gensim.models import KeyedVectors

w2v_path = "data/glove.w2v"
w2v_model = KeyedVectors.load_word2vec_format(w2v_path, binary=False)

term_index = preprocessor.context['vocab_unit'].state['term_index']
embedding_matrix = build_matrix(term_index, w2v_model)

del w2v_model, term_index
gc.collect()

np.save("data/embedding_matrix.npy", embedding_matrix)