def prep_for_training(num_train_optimization_steps, _config):
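    """Build the tokenizer, the multimodal BERT classifier, and its optimizer.

    Weight decay (0.01) is applied to every parameter except biases and
    LayerNorm weights, following the standard BERT fine-tuning recipe, and
    BertAdam performs linear learning-rate warmup over ``warmup_proportion``
    of the ``t_total`` optimization steps.
    """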
    tokenizer = BertTokenizer.from_pretrained(
        _config["bert_model"], do_lower_case=_config["do_lower_case"])

    # TODO: Change model here
    model = MultimodalBertForSequenceClassification.multimodal_from_pretrained(
        _config["bert_model"],
        newly_added_config=_config,
        cache_dir=_config["cache_dir"],
        num_labels=_config["num_labels"])

    model.to(_config["device"])

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=_config["learning_rate"],
                         warmup=_config["warmup_proportion"],
                         t_total=num_train_optimization_steps)

    return model, optimizer, tokenizer
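

# Module-level script code: load the humor split data and extract sentence-level
# BERT embeddings with a plain, text-only BertModel.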
model = BertModel.from_pretrained(_config["bert_model"])
# Move the encoder to the same device the batches are sent to below.
model.to(_config["device"])


with open(os.path.join(_config["dataset_location"], 'humor_splitdata_sdk.pkl'), 'rb') as handle:
    all_data = pickle.load(handle)
    train_data = all_data["train"]
    dev_data = all_data["dev"]
    test_data = all_data["test"]


# humor_data = train_data + dev_data + test_data
humor_data = train_data

tokenizer = BertTokenizer.from_pretrained(_config["bert_model"], do_lower_case=_config["do_lower_case"])
output_mode = _config["output_mode"]
humor_dataset = get_appropriate_dataset(humor_data, tokenizer, output_mode, _config)

# batch_size=1 so that every batch holds exactly one humor example
humor_dataloader = DataLoader(humor_dataset,
                              batch_size=1,
                              shuffle=_config["shuffle"],
                              num_workers=_config["num_workers"])

humor_bert_embeddings = []

model.eval()
with torch.no_grad():
    for step, batch in enumerate(tqdm(humor_dataloader, desc="Iteration")):
        batch = tuple(t.to(_config["device"]) for t in batch)

        input_ids, visual, acoustic, input_mask, segment_ids, label_ids = batch

        encoder_output, pooled_output = model(
            input_ids.view(-1, _config["max_seq_length"]),
            segment_ids.view(-1, _config["max_seq_length"]),
            input_mask.view(-1, _config["max_seq_length"]),
            output_all_encoded_layers=False)
        # Store the pooled [CLS] embedding for this single-example batch.
        humor_bert_embeddings.append(pooled_output.squeeze(0).cpu())

def set_up_data_loader(_config):
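    """Load the pickled train/dev/test splits of all_mod_data.pickle, wrap them
    in DataLoaders, and compute the total number of optimization steps."""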

    with open(os.path.join(_config["dataset_location"], 'all_mod_data.pickle'),
              'rb') as handle:
        all_data = pickle.load(handle)
    train_data = all_data["train"]
    dev_data = all_data["dev"]
    test_data = all_data["test"]

    if _config["prototype"]:
        train_data = train_data[:100]
        dev_data = dev_data[:100]
        test_data = test_data[:100]

    tokenizer = BertTokenizer.from_pretrained(
        _config["bert_model"], do_lower_case=_config["do_lower_case"])
    output_mode = _config["output_mode"]

    train_dataset = get_appropriate_dataset(train_data, tokenizer, output_mode,
                                            _config)
    dev_dataset = get_appropriate_dataset(dev_data, tokenizer, output_mode,
                                          _config)
    test_dataset = get_appropriate_dataset(test_data, tokenizer, output_mode,
                                           _config)

    # Total optimizer updates: batches per epoch (after gradient accumulation),
    # multiplied by the number of training epochs.
    num_train_optimization_steps = int(
        len(train_dataset) / _config["train_batch_size"] /
        _config["gradient_accumulation_steps"]) * _config["num_train_epochs"]

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=_config["train_batch_size"],
                                  shuffle=_config["shuffle"],
                                  num_workers=_config["num_workers"])

    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=_config["dev_batch_size"],
                                shuffle=_config["shuffle"],
                                num_workers=_config["num_workers"])

    test_dataloader = DataLoader(test_dataset,
                                 batch_size=_config["test_batch_size"],
                                 shuffle=_config["shuffle"],
                                 num_workers=_config["num_workers"])

    return train_dataloader, dev_dataloader, test_dataloader, num_train_optimization_steps


def set_up_data_loader(_config):
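    """ETS variant of the data-loader setup: builds ETSDataset splits from the
    pickled FACET/COVAREP/GloVe/label files and returns their DataLoaders along
    with the number of optimization steps.

    Note that this definition reuses the name ``set_up_data_loader``; if both
    versions are kept in the same module, this one shadows the one above.
    """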

    dataset_id_file = os.path.join(_config["dataset_location"],
                                   "revised_id_list.pkl")
    dataset_id = load_pickle(dataset_id_file)
    train = dataset_id['train']
    dev = dataset_id['dev']
    test = dataset_id['test']
    if _config["prototype"]:
        train_num = _config["prot_train"]
        dev_num = _config["prot_dev"]
        test_num = _config["prot_test"]

        train = train[:train_num]
        dev = dev[:dev_num]
        test = test[:test_num]

    data_path = _config["dataset_location"]
    facet_file = os.path.join(data_path, 'revised_facet.pkl')
    covarep_file = os.path.join(data_path, "covarep.pkl")
    word_vec_file = os.path.join(data_path, "glove_index.pkl")
    y_labels = os.path.join(data_path, "video_labels.pkl")
    id_2_word_file = os.path.join(data_path, "ets_word_list.pkl")

    # FACET supplies the visual features and COVAREP the acoustic features;
    # the remaining pickles hold GloVe word indices, video-level labels, and
    # the id-to-word mapping.
    word_aligned_facet_sdk = load_pickle(facet_file)
    word_aligned_covarep_sdk = load_pickle(covarep_file)
    word_embedding_idx_sdk = load_pickle(word_vec_file)
    y_labels_sdk = load_pickle(y_labels)
    id_2_word = load_pickle(id_2_word_file)['data']
    all_data = (word_aligned_facet_sdk, word_aligned_covarep_sdk,
                word_embedding_idx_sdk, y_labels_sdk, id_2_word)
    tokenizer = BertTokenizer.from_pretrained(
        _config["bert_model"], do_lower_case=_config["do_lower_case"])

    training_set = ETSDataset(train, _config, all_data, tokenizer)
    dev_set = ETSDataset(dev, _config, all_data, tokenizer)
    test_set = ETSDataset(test, _config, all_data, tokenizer)

    train_dataloader = DataLoader(training_set,
                                  batch_size=_config["train_batch_size"],
                                  shuffle=_config["shuffle"],
                                  num_workers=_config["num_workers"])

    dev_dataloader = DataLoader(dev_set,
                                batch_size=_config["dev_batch_size"],
                                shuffle=_config["shuffle"],
                                num_workers=_config["num_workers"])

    test_dataloader = DataLoader(test_set,
                                 batch_size=_config["test_batch_size"],
                                 shuffle=_config["shuffle"],
                                 num_workers=_config["num_workers"])
    num_train_optimization_steps = int(
        len(training_set) / _config["train_batch_size"] /
        _config["gradient_accumulation_steps"]) * _config["num_train_epochs"]

    print("num_train_optimization_steps: {0}".format(num_train_optimization_steps))

    return train_dataloader, dev_dataloader, test_dataloader, num_train_optimization_steps
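

# Minimal usage sketch (illustrative only; the driver code that actually calls
# these helpers is not part of this file):
#
#   train_dl, dev_dl, test_dl, num_steps = set_up_data_loader(_config)
#   model, optimizer, tokenizer = prep_for_training(num_steps, _config)
#   # ...training loop iterating over train_dl with the returned optimizer...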
