# Persist the pad-token dictionary for the fine-label stage.
pickle.dump(pad_token_dict, open(fine_label_path + "/pad_token_dict.pkl", "wb"))
print("Pad token dict Dumped")

# Keep only the documents whose labels fall under the current parent p and
# treat them all as a single coarse class.
temp_df = df[df.label.isin(children)].reset_index(drop=True)
temp_coarse_lbls = [p] * len(temp_df.text.values)
temp_coarse_label_to_index = {p: 0}

# Tokenize the same documents with both the coarse and the fine tokenizer,
# then bundle everything into one TensorDataset.
coarse_input_ids, coarse_attention_masks, _ = gpt2_tokenize(
    coarse_tokenizer, temp_df.text.values, temp_coarse_lbls, pad_token_dict,
    temp_coarse_label_to_index)
fine_input_ids, fine_attention_masks = gpt2_fine_tokenize(
    fine_tokenizer, temp_df, index_to_label, pad_token_dict)
dataset = TensorDataset(coarse_input_ids, coarse_attention_masks,
                        fine_input_ids, fine_attention_masks)
train_dataloader, validation_dataloader = create_data_loaders(dataset, batch_size=1)

# Build one dataloader per child label from its exclusive documents of the current iteration.
label_to_exclusive_dataloader = {}
for ch in children:
    child_df = pickle.load(
        open(pkl_dump_dir + "exclusive_" + str(iteration) + "it/" + ch + ".pkl", "rb"))
    # for i in range(1, iteration + 1):
    #     temp_child_df = pickle.load(
    #         open(pkl_dump_dir + "exclusive_" + str(i) + "it/" + ch + ".pkl", "rb"))
    #     if i == 1:
    #         child_df = temp_child_df
    #     else:
    #         child_df = pd.concat([child_df, temp_child_df])
    temp_child_lbls = [ch] * len(child_df.text.values)
    child_exc_input_ids, child_exc_attention_masks = basic_gpt2_tokenize(
        fine_tokenizer, child_df.text.values, temp_child_lbls, pad_token_dict)
    child_exc_dataset = TensorDataset(child_exc_input_ids, child_exc_attention_masks)
    # The original call is truncated here; the completion below is an assumption:
    # shuffle the exclusive dataset and keep one dataloader per child label.
    dataloader = DataLoader(child_exc_dataset, shuffle=True, batch_size=1)
    label_to_exclusive_dataloader[ch] = dataloader
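# A minimal sketch (an assumption, not the repo's actual helper) of a
# label-conditioned GPT-2 tokenization step like the basic_gpt2_tokenize call
# above: each document is prefixed with a control token for its label and
# padded/truncated to a fixed length. The name sketch_gpt2_tokenize, the
# label_to_control_token mapping, and max_length are hypothetical; how
# pad_token_dict is actually used is not shown in this excerpt.
def sketch_gpt2_tokenize(tokenizer, texts, labels, label_to_control_token, max_length=512):
    if tokenizer.pad_token is None:
        # GPT-2 has no pad token by default; reuse EOS so padding works.
        tokenizer.pad_token = tokenizer.eos_token
    input_ids, attention_masks = [], []
    for text, label in zip(texts, labels):
        enc = tokenizer(
            label_to_control_token[label] + " " + text,
            truncation=True,
            max_length=max_length,
            padding="max_length",
            return_tensors="pt",
        )
        input_ids.append(enc["input_ids"])
        attention_masks.append(enc["attention_mask"])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)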
# Load the exclusive (weakly-supervised) documents for every child of parent p,
# tag them with the child label, and append them to the training dataframe.
for ch in parent_to_child[p]:
    temp_df = pickle.load(
        open(pkl_dump_dir + "exclusive/" + algo + "/" + str(iteration) + "it/" + ch + ".pkl",
             "rb"))
    temp_df["label"] = [ch] * len(temp_df)
    if df_weaksup is None:
        df_weaksup = temp_df
    else:
        df_weaksup = pd.concat([df_weaksup, temp_df])
df = pd.concat([df, df_weaksup])

coarse_input_ids, coarse_attention_masks = basic_gpt2_tokenize(
    tokenizer, df.text.values, df.label.values, pad_token_dict)

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(coarse_input_ids, coarse_attention_masks)

# Create a 90-10 train-validation split.
coarse_train_dataloader, coarse_validation_dataloader = create_data_loaders(
    dataset, batch_size=4)

# Train the model and run a quick generation check.
model = train(model, tokenizer, coarse_train_dataloader, coarse_validation_dataloader,
              doc_start_ind, all_labels, device, pad_token_dict)
test_generate(model, tokenizer, all_labels, pad_token_dict, device)

# Persist the tokenizer, the fine-tuned model, and the pad-token dictionary.
tokenizer.save_pretrained(tok_path)
torch.save(model, model_path + model_name)
pickle.dump(pad_token_dict, open(pkl_dump_dir + "pad_token_dict.pkl", "wb"))
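# A minimal sketch (an assumption; the real create_data_loaders is defined
# elsewhere in the repo) of the 90-10 train-validation split used above,
# built on torch.utils.data.random_split. The helper name is hypothetical.
from torch.utils.data import RandomSampler, SequentialSampler, random_split

def sketch_create_data_loaders(dataset, batch_size):
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    # Random order for training batches, sequential order for validation.
    train_dataloader = DataLoader(
        train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
    validation_dataloader = DataLoader(
        val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
    return train_dataloader, validation_dataloader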