Example #1
def parse_camera_lines(lines):
  """
  Reads a camera file, returning a single ViewSequence (without images).
  Args:
    lines: [N] string tensor of camera lines

  Returns:
    The corresponding length N sequence, as a ViewSequence.
  """
  # The first line contains the YouTube video URL.
  # Format of each subsequent line: timestamp fx fy px py k1 k2 row0 row1  row2
  # Column number:                  0         1  2  3  4  5  6  7-10 11-14 15-18
  youtube_url = lines[0]
  record_defaults = ([['']] + [[0.0]] * 18)
  data = tf.decode_csv(lines[1:], record_defaults, field_delim=' ')

  with tf.control_dependencies([
      # We don't accept non-zero k1 and k2.
      tf.assert_equal(data[5:7], 0.0)
  ]):
    timestamps = data[0]
    intrinsics = tf.stack(data[1:5], axis=1)
    poses = utils.build_matrix([data[7:11], data[11:15], data[15:19]])

  # No image data yet. Ideally we'd put "None" for image, but the dataset
  # API doesn't allow that, so we use zeros instead.
  images = tf.zeros_like(timestamps, dtype=tf.float32)

  # In camera files, the video id is the last part of the YouTube URL: it comes
  # after the '='. Using decode_csv for this seems hacky, but it's easier than
  # string_split, which returns a sparse tensor.
  youtube_id = tf.decode_csv([youtube_url], [[''], ['']], field_delim='=')[1][0]
  return RealEstateViewSequence(youtube_id, timestamps, intrinsics, poses, images)
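
A hedged usage sketch for the function above (TF1-style, matching the tf.decode_csv call it uses); the camera-file path is a placeholder, not from the original:

raw = tf.read_file('cameras/example_video.txt')          # first line is the YouTube URL
lines = tf.string_split([raw], delimiter='\n').values    # one string per camera record
sequence = parse_camera_lines(lines)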
Example #2
def clip_embedding_matrix(embedding_file, input_files, output_dir,
                          embedding_name):
    vocab_file = os.path.join(output_dir, 'vocab.txt')
    clipped_file = os.path.join(output_dir, embedding_name)

    # load all files and build the vocabulary
    all_texts = load_all_texts(input_files)
    tokenizer = Tokenizer(num_words=None, lower=False)
    tokenizer.fit_on_texts(all_texts)
    logger.info("the size of vocabulary is {}".format(
        len(tokenizer.word_counts)))

    # load word vector and build embedding matrix
    embeddings_index = load_embedding(embedding_file)
    embedding_matrix = build_matrix(embeddings_index, tokenizer.word_index)
    logger.info("the shape of embedding matrix is {}".format(
        embedding_matrix.shape))

    # save embedding matrix and vocabulary
    np.save(clipped_file, embedding_matrix)
    words = [word + '\n' for word in tokenizer.word_index.keys()]
    with open(vocab_file, 'w', encoding='utf-8') as f:
        f.writelines(words)
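
A minimal, hypothetical invocation of clip_embedding_matrix; the paths below are placeholders, and load_all_texts, load_embedding and build_matrix are assumed to be helpers from the same module:

# Hypothetical call; file names and directories are placeholders.
clip_embedding_matrix(
    embedding_file='embeddings/glove.840B.300d.txt',
    input_files=['data/train.txt', 'data/test.txt'],
    output_dir='output/',
    embedding_name='clipped_embedding.npy')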
Example #3
    epochs = 14
    batch_size = 521
    model.fit([input_train, topic_train], out_train,
              epochs=epochs, batch_size=batch_size, verbose=1,
              validation_data=([input_val, topic_val], out_val),
              callbacks=callbacks_list)
    model.summary()
    #model.save ( 'inner_att_bilstm_cos.h5' )


if __name__ == "__main__":

    # Input data files are available in the "../input/" directory.
    input_path = 'input/'
    sentenceLength = 150
    input_train, topic_train, out_train, input_val, topic_val, out_val = read_input(
        input_path)
    (input_train, input_val, topic_train, topic_val,
     word_index) = text_precocess(input_train, input_val, topic_train,
                                  topic_val)
    embeddings_index = load_embeddings()
    embedding_matrix = build_matrix(word_index, embeddings_index)

    model = build_model(sentenceLength,
                        word_index,
                        verbose=False,
                        compile=True)
    train_model(model, input_train, topic_train, out_train, input_val,
                topic_val, out_val)

### To reuse the model:
### t_model = load_model('inner_att_bilstm_cos.h5', custom_objects={'Attention': Attention})
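
callbacks_list is passed to model.fit above but is not defined in this excerpt; a plausible sketch (an assumption, not the original definition) could be:

from keras.callbacks import EarlyStopping, ModelCheckpoint

# Hypothetical callbacks_list; the original definition is not shown here.
callbacks_list = [
    EarlyStopping(monitor='val_loss', patience=3),
    ModelCheckpoint('inner_att_bilstm_cos.h5', monitor='val_loss', save_best_only=True),
]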
Example #4
def train_split_aug():
    df = pd.read_csv('new_processed_data/train_tok.csv')
    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')
    iden_aug_df = pd.read_csv('new_processed_data/train_iden_last.csv')
    toxic_aug_df = pd.read_csv('new_processed_data/train_back_toxic.csv')

    ### text
    a_texts = df['comment_text'].values
    i_texts = iden_aug_df['comment_text'].values
    t_texts = toxic_aug_df['comment_text'].values
    texts = np.concatenate([a_texts, i_texts, t_texts])

    ### label
    a_label = df['target'].values
    i_label = iden_aug_df['toxic'].values
    t_label = toxic_aug_df['toxic'].values
    labels = np.concatenate([a_label, i_label, t_label])

    ### aux label
    a_aux = df[AUX_COLUMNS].values
    i_aux = iden_aug_df[AUX_AUG_COLUMNS].values
    t_aux = toxic_aug_df[AUX_AUG_COLUMNS].values
    aux = np.concatenate([a_aux, i_aux, t_aux])

    ### idts
    val_idts = df[IDENTITY_COLUMNS].fillna(0).values
    a_idts = iden_df[IDENTITY_COLUMNS].fillna(0).values
    i_idts = iden_aug_df[IDENTITY_COLUMNS].fillna(0).values
    t_idts = toxic_aug_df[IDENTITY_COLUMNS].fillna(0).values
    idts = np.concatenate([a_idts, i_idts, t_idts])

    del df
    del iden_df
    del iden_aug_df
    del toxic_aug_df

    tokenizer = text.Tokenizer(filters='', lower=False)
    tokenizer.fit_on_texts(list(texts))
    texts = tokenizer.texts_to_sequences(texts)
    texts = [t[:1024] for t in texts]

    crawl_matrix, unknown_words_crawl = build_matrix(
        tokenizer.word_index, 'embedding/crawl-300d-2M.pkl')
    print('n unknown words (crawl): ', len(unknown_words_crawl))

    glove_matrix, unknown_words_glove = build_matrix(
        tokenizer.word_index, 'embedding/glove.840B.300d.pkl')
    print('n unknown words (glove): ', len(unknown_words_glove))

    max_features = len(tokenizer.word_index) + 1
    print('Vocab Size:', max_features)

    embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)
    print('Embedding shape:', embedding_matrix.shape)

    del crawl_matrix
    del glove_matrix
    gc.collect()

    import pickle
    pickle.dump(embedding_matrix, open('new_processed_data/aug_emb.pkl', 'wb'))
    pickle.dump(tokenizer.word_index,
                open('new_processed_data/aug_word_index.pkl', 'wb'))
    pickle.dump(texts, open('new_processed_data/aug_texts.pkl', 'wb'))

    train_ind, val_ind = train_test_split(range(len(a_texts)),
                                          random_state=59,
                                          test_size=0.055)

    train_texts = [texts[i] for i in train_ind] + texts[len(a_texts):]
    val_texts = [texts[i] for i in val_ind]

    train_labels = np.concatenate([labels[train_ind], labels[len(a_texts):]])
    val_labels = labels[val_ind]
    train_aux_labels = np.concatenate([aux[train_ind], aux[len(a_texts):]])
    train_iden = np.concatenate([idts[train_ind], idts[len(a_texts):]])
    val_iden = val_idts[val_ind]

    train_weight = get_weights_new_array(train_iden, train_labels)
    lw = 1 / np.mean(train_weight)

    train_gen = GeneralDataGenerator(
        inputs=[train_texts],
        outputs=[train_labels, train_aux_labels],
        sample_weights=[train_weight, np.ones_like(train_weight)],
        batch_size=512)
    val_gen = GeneralPredictGenerator(text=val_texts, batch_size=512)

    model = get_lstm_model(embedding_matrix, len(AUX_COLUMNS))

    opt = Adam(1e-3)

    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  loss_weights=[lw, 1.])
    model.summary()

    EMAer = ExponentialMovingAverage(model)
    EMAer.inject()

    logger = KFoldLogger('lstm_dp0.5_ema_aug',
                         val_gen,
                         val_true=val_labels,
                         val_iden=val_iden,
                         patience=10,
                         lr_patience=5)

    model.fit_generator(train_gen.__iter__(),
                        len(train_gen),
                        epochs=15,
                        callbacks=[logger],
                        verbose=1)
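
In this example, build_matrix is called with a word_index and the path to a pickled {word: vector} mapping, and returns a matrix plus the list of out-of-vocabulary words. A minimal sketch consistent with that call, offered as an assumption rather than the original helper:

import pickle

import numpy as np


def build_matrix(word_index, embedding_path, dim=300):
    # Assumed behaviour: load a pickled {word: vector} dict and copy each known
    # word's vector into the row given by its index; collect words without a
    # pretrained vector as "unknown".
    with open(embedding_path, 'rb') as f:
        embedding_index = pickle.load(f)
    matrix = np.zeros((len(word_index) + 1, dim), dtype=np.float32)
    unknown_words = []
    for word, i in word_index.items():
        vector = embedding_index.get(word)
        if vector is not None:
            matrix[i] = vector
        else:
            unknown_words.append(word)
    return matrix, unknown_words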
Example #5
def get_test_data(num=100):
    points = np.random.normal(size=(num, 2))
    matrix = build_matrix(points)

    return points, matrix
    ]].reset_index(drop=True)

    train_raw = mz.pack(train_df, task)
    val_raw = mz.pack(val_df, task)

    train_processed = preprocessor.transform(train_raw)
    val_processed = preprocessor.transform(val_raw)

    train_processed.save("5fold/train_processed_{}.dp".format(i))
    val_processed.save("5fold/val_processed_{}.dp".format(i))

test_recall[feature] = test_recall[feature] / norm_df
test_recall['feature'] = list(test_recall[feature].values)
test_recall = test_recall[[
    'id_left', 'text_left', 'id_right', 'text_right', 'feature'
]]

test_raw = mz.pack(test_recall, task)
test_processed = preprocessor.transform(test_raw)
# test_processed.save("test_processed.dp")
test_processed.save("final_test_processed.dp")

from gensim.models import KeyedVectors
w2v_path = "data/glove.w2v"
w2v_model = KeyedVectors.load_word2vec_format(w2v_path, binary=False)
term_index = preprocessor.context['vocab_unit'].state['term_index']
embedding_matrix = build_matrix(term_index, w2v_model)
del w2v_model, term_index
gc.collect()
np.save("data/embedding_matrix.npy", embedding_matrix)