def compute_bert_formatted_inputs(data_folder, max_seq_length=128):
    tokenizer, _, _, _ = load_bert(lower_case=True, base_version=True)

    # Generate formatted input
    for dataset_type in ['train', 'test']:
        with open(Path(data_folder,
                       f'splitted_{dataset_type}_sentences.json'), 'r') as f:
            dataset = json.load(f)

        # Convert raw examples to InputExamples
        splitted_dataset = _process_examples(dataset)

        # Convert to BERT input. examples_features is a list of lists:
        # the outer list holds examples (documents), the inner list holds
        # one feature object per sentence.
        examples_features = _convert_examples_to_features(
            examples=splitted_dataset,
            seq_length=max_seq_length,
            tokenizer=tokenizer)

        print(f'Parsed {len(examples_features)} examples (sanity check)')

        with open(Path(data_folder,
                       f'formatted_{dataset_type}_dataset.pickle'), 'wb') as w:
            pickle.dump(examples_features, w)
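# A minimal sketch of reading back the pickle written above; the helper name
# inspect_formatted_dataset is hypothetical, and only the outer/inner list
# structure documented in the comment above is assumed.
def inspect_formatted_dataset(data_folder, dataset_type='train'):
    import pickle
    from pathlib import Path

    with open(Path(data_folder,
                   f'formatted_{dataset_type}_dataset.pickle'), 'rb') as f:
        examples_features = pickle.load(f)

    # Outer list: documents; inner list: one feature object per sentence
    print(f'{len(examples_features)} documents; first document has '
          f'{len(examples_features[0])} sentences')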
def compute_bert_formatted_inputs(data_folder, max_seq_length=512):
    tokenizer, _, _, _ = load_bert(lower_case=True, base_version=True)

    # Generate formatted input
    for dataset_type in ['train', 'validation', 'test']:
        with open(Path(data_folder,
                       f'spouse_{dataset_type}_set_finetune.json'), 'r') as f:
            dataset = json.load(f)

        # Convert raw examples to InputExamples
        splitted_dataset = _process_examples(dataset)

        # Convert to BERT input
        examples_features = _convert_examples_to_features(
            examples=splitted_dataset,
            seq_length=max_seq_length,
            tokenizer=tokenizer)

        print(f'Parsed {len(examples_features)} examples (sanity check)')

        with open(Path(data_folder,
                       f'spouse_formatted_{dataset_type}_finetune_dataset.pickle'),
                  'wb') as w:
            pickle.dump(examples_features, w)
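# load_bert is imported from elsewhere; this is a minimal sketch of what it
# is assumed to do, based only on how its return tuple is unpacked in this
# repo: (tokenizer, model, device, embedding_dim). Building it on HuggingFace
# transformers is an assumption, not confirmed by this code.
def _load_bert_sketch(lower_case=True, base_version=True):
    import torch
    from transformers import BertModel, BertTokenizer

    size = 'base' if base_version else 'large'
    casing = 'uncased' if lower_case else 'cased'
    model_name = f'bert-{size}-{casing}'

    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # hidden_size is 768 for the base models, 1024 for the large ones
    embedding_dim = model.config.hidden_size
    return tokenizer, model, device, embedding_dim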
                    processed_example[key] = torch.cat(
                        (processed_example[key], val), dim=0)

                starting_token_idx += no_tokens
                processed_example['tokens'].extend(sentence_tokens)

            # Persist the processed example to disk
            store_path = processed_path
            if not os.path.exists(store_path):
                os.makedirs(store_path)
            torch.save(
                processed_example,
                Path(store_path, f'{example_filename[:-6]}_processed.torch'))

            if example_no % 1000 == 0:
                print(f'Processed {example_no} examples')


if __name__ == '__main__':
    formatted_data_folder = '../../../data/MovieReview'
    processed_data_folder = '../../../data/MovieReview_FineTune/'

    # Load BERT model
    _, bert_model, device, embedding_dim = load_bert(lower_case=True,
                                                     base_version=True)

    bert_moviereview_parser = MovieReviewBuilder(formatted_data_folder,
                                                 processed_data_folder)

    # Computes embeddings for each example
    bert_moviereview_parser.compute_moviereview_embeddings(bert_model, device)

    # Processes examples for subsequent training
    bert_moviereview_parser.process_embeddings(embedding_dim)
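# A minimal sketch of reading back one of the files saved by
# process_embeddings above; the helper name is hypothetical, and only the
# 'tokens' key is confirmed by this excerpt -- the remaining entries are the
# per-token tensors concatenated along dim 0.
def inspect_processed_example(path):
    import torch

    processed = torch.load(path)
    print(f"{len(processed['tokens'])} tokens")
    for key, val in processed.items():
        if torch.is_tensor(val):
            print(key, tuple(val.shape))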