def transform_data(data):
    if cfg.do_preprocess:
        data = preprocess(data)
    sentences = data["comment_text"].fillna("_NAN_").values
    # update word dict
    tokenized_sentences, _ = tc.tokenize_sentences(sentences, words_dict)
    coverage(tokenized_sentences, embedding_word_dict)
    sequences = tc.tokenized_sentences2seq(tokenized_sentences, words_dict)
    list_of_token_ids = tc.convert_tokens_to_ids(sequences, embedding_word_dict)
    X = np.array(list_of_token_ids)
    return X
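# A minimal usage sketch for transform_data. It relies on module-level state
# (cfg, tc, words_dict, embedding_word_dict) being initialized first; the CSV
# path below is illustrative, not taken from this script.
# df = pd.read_csv("assets/raw_data/test.csv")
# X_test = transform_data(df)  # array of token-id sequences, one row per comment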
def generate_caption(image_path, max_tokens=30):
    """
    Generate a caption for the image at the given path.
    The caption is limited to the given number of tokens (words).
    """
    image = preprocess(image_path)
    transfer_value = image_model_transfer.predict(image)

    # Decoder input: a single row of token ids, zero-padded to max_tokens.
    # (np.int was removed from NumPy; plain int is the portable dtype.)
    shape = (1, max_tokens)
    decoder_input_data_1 = np.zeros(shape=shape, dtype=int)

    # Start decoding from the start-of-sequence token.
    token_int = vocab_to_int['<GO>']

    # Initialize an empty output-text.
    output_text = ''
    count_tokens = 0

    # Greedy decoding: feed the previously sampled token back in until
    # <EOS> is produced or the maximum caption length is reached.
    while token_int != vocab_to_int['<EOS>'] and count_tokens < max_tokens:
        decoder_input_data_1[0, count_tokens] = token_int
        x_data = {
            'transfer_values_input': transfer_value,
            'decoder_input': decoder_input_data_1
        }
        decoder_output = decoder_model.predict(x_data)

        # Take the most likely token at the current time step.
        token_onehot = decoder_output[0, count_tokens, :]
        token_int = np.argmax(token_onehot)
        sampled_word = int_to_vocab[token_int]
        output_text += " " + sampled_word
        count_tokens += 1

    print("Predicted caption:")
    print(output_text)
    print()
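# Example invocation; the path is illustrative and assumes image_model_transfer,
# decoder_model, vocab_to_int and int_to_vocab are already loaded:
# generate_caption("images/example.jpg", max_tokens=30)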
write_config(Config)

if not os.path.exists(cfg.root + cfg.model_name):
    os.mkdir(cfg.root + cfg.model_name)

train = pd.read_csv(cfg.train_fn, index_col=0)
test = pd.read_csv(TEST_FILENAME, index_col=0)
subm = pd.read_csv(SAMPLE_SUBMISSION_FILENAME)

# 'none' flags comments that carry no positive label in any class.
train['none'] = 1 - train[LIST_CLASSES].max(axis=1)
train[COMMENT].fillna(NAN_WORD, inplace=True)
test[COMMENT].fillna(NAN_WORD, inplace=True)

if cfg.do_preprocess:
    train = preprocess(train)
    test = preprocess(test)

def pr(y_i, y, feature):
    # Laplace-smoothed mean of the feature over rows with label y_i.
    p = feature[y == y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)

fold_size = train.shape[0] // cfg.fold_count
X = train[COMMENT]
Y = train[LIST_CLASSES]
preds_test_list = []
preds_valid_list = []
list_of_y = []
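# pr() is the usual naive-Bayes count-ratio helper; a sketch of how it is
# typically combined (the NB-SVM trick), with x a TF-IDF matrix over X and
# y one column of Y (hypothetical usage, not lifted from this script):
# r = np.log(pr(1, y, x) / pr(0, y, x))
# x_nb = x.multiply(r)  # NB-reweighted features for a per-class classifier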
valid_data = pd.read_csv(VALID_DATA_FN, index_col=1)
train_data = pd.read_csv(TRAIN_DATA_FN)
Y_train = train_data[LIST_CLASSES].values

# Config values are stored as strings here, hence the == 'True' comparisons.
if cfg.do_preprocess == 'True':
    glove = cfg.glove == 'True'
    print('preprocessing test')
    test_data = preprocess(test_data, glove=glove)
    print('preprocessing train')
    train_data = preprocess(train_data, glove=glove)
    print('preprocessing valid')
    valid_data = preprocess(valid_data, glove=glove)

if cfg.level == 'word':
    sentences_test = test_data["comment_text"].fillna("_NAN_").values
    sentences_train = train_data["comment_text"].fillna("_NAN_").values
    sentences_valid = valid_data["comment_text"].fillna("_NAN_").values
    # update word dict
    tokenized_sentences_train, tokenized_sentences_valid, tokenized_sentences_test = tc.tokenize_list_of_sentences(
fold_count = 1
cfg = Config()
train_data = pd.read_csv(cfg.train_fn)
test_data = pd.read_csv(TEST_FILENAME)
tc = ToxicComments(cfg)

if tc.cfg.do_preprocess:
    if tc.cfg.add_polarity:
        train_data = preprocess(train_data, add_polarity=True)
        test_data = preprocess(test_data, add_polarity=True)
    else:
        train_data = preprocess(train_data)
        test_data = preprocess(test_data)

sentences_train = train_data["comment_text"].fillna("_NAN_").values
sentences_test = test_data["comment_text"].fillna("_NAN_").values
Y = train_data[LIST_CLASSES].values

if 'word' in tc.cfg.level:
    tokenized_sentences_train, tokenized_sentences_test = tc.tokenize_list_of_sentences(
def train_folds(fold_count=10):
    train_data = pd.read_csv('train_e0.csv')
    valid_data = pd.read_csv(VALID_SLIM_FILENAME)
    test_data = pd.read_csv(TEST_FILENAME)

    cfg = Config()
    tc = ToxicComments(cfg)

    if tc.cfg.do_preprocess:
        if tc.cfg.add_polarity:
            train_data = preprocess(train_data, add_polarity=True)
            valid_data = preprocess(valid_data, add_polarity=True)
            test_data = preprocess(test_data, add_polarity=True)
        else:
            train_data = preprocess(train_data)
            valid_data = preprocess(valid_data)
            test_data = preprocess(test_data)

    sentences_train = train_data["comment_text"].fillna("_NAN_").values
    sentences_valid = valid_data["comment_text"].fillna("_NAN_").values
    sentences_test = test_data["comment_text"].fillna("_NAN_").values
    Y = train_data[LIST_CLASSES].values

    if tc.cfg.level == 'word':
        tokenized_sentences_train, tokenized_sentences_valid, tokenized_sentences_test = \
            tc.tokenize_list_of_sentences([sentences_train, sentences_valid, sentences_test])
        tokenized_sentences_train = [tc.preprocessor.rm_hyperlinks(s) for s in tokenized_sentences_train]
        tokenized_sentences_valid = [tc.preprocessor.rm_hyperlinks(s) for s in tokenized_sentences_valid]
        tokenized_sentences_test = [tc.preprocessor.rm_hyperlinks(s) for s in tokenized_sentences_test]
        tc.create_word2id([tokenized_sentences_train, tokenized_sentences_valid, tokenized_sentences_test])
        with open(tc.cfg.fp + 'tc_words_dict.p', 'wb') as f:
            pickle.dump(tc.word2id, f)
        sequences_train = tc.tokenized_sentences2seq(tokenized_sentences_train, tc.word2id)

        if cfg.use_saved_embedding_matrix:
            with open(tc.cfg.fp + 'embedding_word_dict.p', 'rb') as f:
                embedding_word_dict = pickle.load(f)
            embedding_matrix = np.load(tc.cfg.fp + 'embedding.npy')
            id_to_embedded_word = dict((id, word) for word, id in embedding_word_dict.items())
        else:
            embedding_matrix, embedding_word_dict, id_to_embedded_word = tc.prepare_embeddings(tc.word2id)
            coverage(tokenized_sentences_train, embedding_word_dict)
            with open(tc.cfg.fp + 'embedding_word_dict.p', 'wb') as f:
                pickle.dump(embedding_word_dict, f)
            np.save(tc.cfg.fp + 'embedding.npy', embedding_matrix)

        train_list_of_token_ids = tc.convert_tokens_to_ids(sequences_train, embedding_word_dict)
        X = np.array(train_list_of_token_ids)
        X_test = None
    else:
        # Char level: build a character vocabulary and encode sentences directly.
        tc.preprocessor.min_count_chars = tc.cfg.min_count_chars
        tc.preprocessor.create_char_vocabulary(sentences_train)
        with open(tc.cfg.fp + 'char2index.p', 'wb') as f:
            pickle.dump(tc.preprocessor.char2index, f)
        X = tc.preprocessor.char2seq(sentences_train, maxlen=tc.cfg.max_seq_len)
        embedding_matrix = np.zeros((tc.preprocessor.char_vocab_size, tc.cfg.char_embedding_size))
        X_test = None

    # Was hardcoded to // 10, which breaks for any other fold_count.
    fold_size = len(X) // fold_count
    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size
        # The last fold takes any remainder (original compared against
        # fold_size - 1, which is a bug).
        if fold_id == fold_count - 1:
            fold_end = len(X)

        X_valid = X[fold_start:fold_end]
        Y_valid = Y[fold_start:fold_end]
        X_train = np.concatenate([X[:fold_start], X[fold_end:]])
        Y_train = np.concatenate([Y[:fold_start], Y[fold_end:]])

        m = Model(Config)
        m.set_graph(embedding_matrix)
        m.train(X_train, Y_train, X_valid, Y_valid, X_test, embedding_matrix, fold_id)
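# Entry-point sketch for the fold training defined above; fold_count mirrors
# the function's default:
if __name__ == '__main__':
    train_folds(fold_count=10)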
with open(fn_words_dict, 'rb') as f:
    words_dict = pickle.load(f)
with open(fn_embedding_words_dict, 'rb') as f:
    embedding_word_dict = pickle.load(f)
embedding_matrix = np.load(root + model + 'embedding.npy')
tc.id2word = dict((id, word) for word, id in words_dict.items())

test_data = None
valid_data = None
if TEST_DATA_FN is not None:
    test_data = pd.read_csv(TEST_DATA_FN)
    if cfg.do_preprocess:
        print('preprocessing test')
        test_data = preprocess(test_data)

if cfg.level == 'word':
    X_test = transform_data(test_data)
else:
    # Char level: rebuild the char vocabulary from the training comments.
    train_data = pd.read_csv(TRAIN_DATA_FN)
    if cfg.do_preprocess:
        print('preprocessing train')
        train_data = preprocess(train_data)
    preprocessor = Preprocessor(min_count_chars=10)
    sentences_train = train_data["comment_text"].fillna("_NAN_").values
    preprocessor.create_char_vocabulary(sentences_train)
    # Char embeddings are learned from scratch, so the matrix is a placeholder.
    embedding_matrix = np.zeros(
        (preprocessor.char_vocab_size, cfg.char_embedding_size))
    sentences_test = test_data["comment_text"].fillna("_NAN_").values
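# A plausible next step in the char branch (hypothetical, mirroring the
# char2seq call used elsewhere in this repo):
# X_test = preprocessor.char2seq(sentences_test, maxlen=cfg.max_seq_len)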
train_data = pd.read_csv(TRAIN_DATA_FN)

if cfg.level == 'word':
    with open(fn_words_dict, 'rb') as f:
        words_dict = pickle.load(f)
    with open(fn_embedding_words_dict, 'rb') as f:
        embedding_word_dict = pickle.load(f)
    embedding_matrix = np.load(model_fp + 'embedding.npy')
    tc.id2word = dict((id, word) for word, id in words_dict.items())
    X = transform_data(train_data)

if cfg.level == 'char':
    if cfg.do_preprocess:
        train_data = preprocess(train_data)
    sentences_train = train_data["comment_text"].fillna("_NAN_").values
    sentences_train = [tc.preprocessor.lower(text) for text in sentences_train]
    tc.preprocessor.min_count_chars = 10
    tc.preprocessor.create_char_vocabulary(sentences_train)
    X = tc.preprocessor.char2seq(sentences_train, maxlen=cfg.max_seq_len)
    embedding_matrix = np.zeros(
        (tc.preprocessor.char_vocab_size, tc.cfg.char_embedding_size))

Y = train_data[LIST_CLASSES].values

def predict(epoch, X):
    tf.reset_default_graph()
    # One extra batch catches the remainder that does not fill cfg.bsize.
    num_batches = len(X) // cfg.bsize + 1
    bsize_last_batch = len(X) % (cfg.bsize * (num_batches - 1))
def encode(image):
    image = preprocess(image)
    temp_enc = image_model_transfer.predict(image)
    # Drop the batch dimension: (1, n_features) -> (n_features,)
    temp_enc = np.reshape(temp_enc, temp_enc.shape[1])
    return temp_enc
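# Usage sketch (the path is illustrative; image_model_transfer must be loaded):
# transfer_values = encode("images/example.jpg")  # 1-D transfer-value vector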
import pickle

from utilities import loadGloveModel, coverage

unknown_word = "_UNK_"
end_word = "_END_"
nan_word = "_NAN_"
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

results = pd.DataFrame(columns=['fold_id', 'epoch', 'roc_auc_v', 'roc_auc_t', 'cost_val'])
do_light_preprocessing = True

train_data = pd.read_csv("assets/raw_data/train.csv")
test_data = pd.read_csv("assets/raw_data/test.csv")

if do_light_preprocessing:
    train_data = preprocess(train_data)
    test_data = preprocess(test_data)

sentences_train = train_data["comment_text"].fillna("_NAN_").values
sentences_test = test_data["comment_text"].fillna("_NAN_").values

class Config:
    max_sentence_len = 500
    do_augmentation_with_translate = False
    do_augmentation_with_mixup = False
    do_synthezize_embeddings = False
    mode_embeddings = 'fasttext_300d'

    if do_synthezize_embeddings: