def run(fold):
    # Keep only the rows of this fold as the validation split.
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True

    # Restore the fold-specific checkpoint and switch to evaluation mode.
    model = models.TweetModel(conf=model_config)
    model.to(device)
    model.load_state_dict(torch.load(
        f'{config.TRAINED_MODEL_PATH}/model_{fold}.bin'))
    model.eval()

    valid_dataset = dataset.TweetDataset(
        tweets=df_valid.text.values,
        sentiments=df_valid.sentiment.values,
        selected_texts=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4,
        shuffle=False)

    # Mean Jaccard score over the validation fold.
    jaccard = engine.eval_fn(valid_data_loader, model, device)
    return jaccard
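
# A minimal cross-validation driver sketch (not part of the original script),
# assuming the same config.N_FOLDS constant used by the inference code below:
# it calls run(fold) for every fold and reports the mean Jaccard score.
def evaluate_all_folds():
    scores = [run(fold) for fold in range(config.N_FOLDS)]
    print(f'per-fold jaccard: {scores}')
    print(f'mean jaccard: {sum(scores) / len(scores):.5f}')
    return scores
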
def store_tweets_to_db(keyword, tweets):
    """Persist fetched tweets and link each of them to the search keyword."""
    print_msg("Storing tweets to database for keyword %s..." % keyword)

    # Reuse the existing keyword record if there is one, otherwise create it.
    sk = models.SearchKeyword.get_keyword(keyword)
    if not sk:
        sk = models.SearchKeyword(keyword)

    for tweet in tweets:
        # Save tweets we have not seen before; for known tweets, just attach
        # the keyword.
        s_tweet = models.TweetModel.get_tweet(tweet.get('id_str'))
        if not s_tweet:
            tweetm = models.TweetModel(tweet)
            tweetm.keywords.append(sk)
            tweetm.save()
        else:
            s_tweet.update(sk)

    print_msg("Done!")
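
# A hedged usage sketch for store_tweets_to_db: the dicts below only mimic the
# minimal shape the function relies on (an 'id_str' key plus whatever fields
# models.TweetModel consumes); the other field names are illustrative
# assumptions, not the real Twitter API payload.
sample_tweets = [
    {'id_str': '1234567890', 'text': 'tweet sentiment extraction with roberta'},
    {'id_str': '1234567891', 'text': 'stochastic weight averaging works well'},
]
store_tweets_to_db('roberta', sample_tweets)
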
def run(fold):
    # Split the k-fold training file into train/validation for this fold.
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweets=df_train.text.values,
        sentiments=df_train.sentiment.values,
        selected_texts=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        shuffle=True)

    valid_dataset = dataset.TweetDataset(
        tweets=df_valid.text.values,
        sentiments=df_valid.sentiment.values,
        selected_texts=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4,
        shuffle=False)

    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True
    model = models.TweetModel(conf=model_config)
    model = model.to(device)

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    # Apply weight decay to all parameters except biases and LayerNorm weights.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': config.WEIGHT_DECAY
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    # AdamW wrapped in stochastic weight averaging (SWA), with a linear
    # warmup/decay learning-rate schedule.
    base_opt = transformers.AdamW(optimizer_parameters,
                                  lr=config.LEARNING_RATE)
    optimizer = torchcontrib.optim.SWA(
        base_opt,
        swa_start=int(num_train_steps * config.SWA_RATIO),
        swa_freq=config.SWA_FREQ,
        swa_lr=None)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=int(num_train_steps * config.WARMUP_RATIO),
        num_training_steps=num_train_steps)

    print(f'Training is starting for fold={fold}')
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device,
                        scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)

    # Swap in the averaged SWA weights before saving the fold checkpoint.
    if config.USE_SWA:
        optimizer.swap_swa_sgd()
    torch.save(model.state_dict(),
               f'{config.MODEL_SAVE_PATH}/model_{fold}.bin')

    return jaccard
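
# A hedged sketch of the per-batch step that engine.train_fn likely performs;
# the engine module itself is not shown here, so the loss (cross-entropy over
# start/end token logits) and the targets_start/targets_end batch keys are
# assumptions inferred from the dataset fields and model outputs used above
# and below, not the author's actual implementation.
import torch
import torch.nn as nn
import tqdm

def train_fn_sketch(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    loss_fct = nn.CrossEntropyLoss()
    for d in tqdm.tqdm(data_loader, total=len(data_loader)):
        ids = d['ids'].to(device, dtype=torch.long)
        mask = d['mask'].to(device, dtype=torch.long)
        token_type_ids = d['token_type_ids'].to(device, dtype=torch.long)
        targets_start = d['targets_start'].to(device, dtype=torch.long)
        targets_end = d['targets_end'].to(device, dtype=torch.long)

        model.zero_grad()
        outputs_start, outputs_end = model(ids=ids, mask=mask,
                                           token_type_ids=token_type_ids)
        loss = (loss_fct(outputs_start, targets_start)
                + loss_fct(outputs_end, targets_end))
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
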
def run():
    # The test file has no labels; fill selected_text with the full tweet so
    # the dataset class can be reused unchanged.
    df_test = pd.read_csv(config.TEST_FILE)
    df_test.loc[:, 'selected_text'] = df_test.text.values

    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True

    # Load one trained model per fold for ensembling.
    fold_models = []
    for i in range(config.N_FOLDS):
        model = models.TweetModel(conf=model_config)
        model.to(device)
        model.load_state_dict(
            torch.load(f'{config.TRAINED_MODEL_PATH}/model_{i}.bin'))
        model.eval()
        fold_models.append(model)

    test_dataset = dataset.TweetDataset(
        tweets=df_test.text.values,
        sentiments=df_test.sentiment.values,
        selected_texts=df_test.selected_text.values)
    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4)

    char_pred_test_start = []
    char_pred_test_end = []

    with torch.no_grad():
        tk0 = tqdm.tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d['ids']
            token_type_ids = d['token_type_ids']
            mask = d['mask']
            orig_tweet = d['orig_tweet']
            offsets = d['offsets']

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)

            # Average the start/end logits over the fold models.
            outputs_start_folds = []
            outputs_end_folds = []
            for i in range(config.N_FOLDS):
                outputs_start, outputs_end = fold_models[i](
                    ids=ids, mask=mask, token_type_ids=token_type_ids)
                outputs_start_folds.append(outputs_start)
                outputs_end_folds.append(outputs_end)
            outputs_start = sum(outputs_start_folds) / config.N_FOLDS
            outputs_end = sum(outputs_end_folds) / config.N_FOLDS

            outputs_start = torch.softmax(outputs_start,
                                          dim=-1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end,
                                        dim=-1).cpu().detach().numpy()

            # Project token-level probabilities onto character positions.
            for px, tweet in enumerate(orig_tweet):
                char_pred_test_start.append(
                    utils.token_level_to_char_level(tweet, offsets[px],
                                                    outputs_start[px]))
                char_pred_test_end.append(
                    utils.token_level_to_char_level(tweet, offsets[px],
                                                    outputs_end[px]))

    with open('roberta-char_pred_test_start.pkl', 'wb') as handle:
        pickle.dump(char_pred_test_start, handle)
    with open('roberta-char_pred_test_end.pkl', 'wb') as handle:
        pickle.dump(char_pred_test_end, handle)
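
# A hedged decoding sketch showing how the pickled character-level
# probabilities could be turned back into predicted substrings: take the
# argmax start and end character positions and slice the original tweet. The
# post-processing actually used downstream of these pickles is not part of
# this code, so treat this as an illustrative assumption rather than the
# real pipeline.
import pickle

import numpy as np
import pandas as pd


def decode_char_preds(test_csv, start_pkl, end_pkl):
    df = pd.read_csv(test_csv)
    with open(start_pkl, 'rb') as f:
        char_start = pickle.load(f)
    with open(end_pkl, 'rb') as f:
        char_end = pickle.load(f)

    selected = []
    for tweet, p_start, p_end in zip(df.text.values, char_start, char_end):
        start = int(np.argmax(p_start))
        end = int(np.argmax(p_end))
        if end < start:
            # Fall back to the whole tweet if the predicted span is inverted.
            start, end = 0, len(tweet) - 1
        selected.append(tweet[start:end + 1])
    return selected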