def main():
    fdir = config.data_dir
    # Load the raw train/test splits and print the maximum sequence
    # length across both (used to pick a padding length).
    train_data, _ = data_load.load_dataset(f'{fdir}/train/in.txt', f'{fdir}/train/out.txt')
    test_data, _ = data_load.load_dataset(f'{fdir}/test/in.txt', f'{fdir}/test/out.txt')
    print(cal_total_max_len([train_data, test_data]))
def init_dataset(seq_path, tag_path, word_to_ix, max_seq_len, batch_size):
    # Index, pad, and mask the sequences, then wrap everything in a
    # shuffled DataLoader.
    seqs, tags = load_dataset(seq_path, tag_path)
    seqs, masks, tags = create_dataset(seqs, tags, word_to_ix, max_seq_len,
                                       word_to_ix['[PAD]'])
    extended_attention_mask = create_attention_mask(masks)
    dataset = TensorDataset(seqs, extended_attention_mask, tags)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
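# Hypothetical usage of init_dataset (not in the original file). The paths
# mirror the config.data_dir layout used in main() above; the max_seq_len and
# batch_size values here are illustrative, not the repo's settings.
word_to_ix = load_vocab(f'{config.data_dir}/vocabs')
train_loader = init_dataset(seq_path=f'{config.data_dir}/train/in.txt',
                            tag_path=f'{config.data_dir}/train/out.txt',
                            word_to_ix=word_to_ix,
                            max_seq_len=32,
                            batch_size=64)
for seqs, attention_mask, tags in train_loader:
    # One padded batch: token ids, extended attention mask, tag labels.
    break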
def process_data(hp):
    tokenizer = create_tokenizer_from_hub_module(hp)
    train = load_dataset('./input_data/train.csv')
    test = load_dataset('./input_data/test.csv')
    # train = train.sample(5000)
    # test = test.sample(5000)

    sfolder = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
    for kfold_num, (train_idx, eval_idx) in enumerate(
            sfolder.split(train[hp.DATA_COLUMN], train[hp.polarity])):
        # Use the InputExample class from BERT's run_classifier code to
        # create examples from the data.
        train_InputExamples = train.loc[train_idx].apply(
            lambda x: run_classifier_custom.InputExample(
                guid=None,  # Globally unique ID for bookkeeping, unused in this example
                text_a=x[hp.DATA_COLUMN],
                selected_text=x[hp.selected_text],
                text_b=x[hp.sentiment],
                sentiment=x[hp.polarity]),
            axis=1)
        eval_InputExamples = train.loc[eval_idx].apply(
            lambda x: run_classifier_custom.InputExample(
                guid=None,
                text_a=x[hp.DATA_COLUMN],
                selected_text=x[hp.selected_text],
                text_b=x[hp.sentiment],
                sentiment=x[hp.polarity]),
            axis=1)
        break  # Only the first fold is used.

    # print(tokenizer.tokenize("This here's an example of using the BERT tokenizer"))

    # Convert our train and eval examples to InputFeatures that BERT understands.
    train_features = run_classifier_custom.convert_examples_to_features(
        train_InputExamples, hp.MAX_SEQ_LENGTH, tokenizer, is_predicting=False)
    eval_features = run_classifier_custom.convert_examples_to_features(
        eval_InputExamples, hp.MAX_SEQ_LENGTH, tokenizer, is_predicting=False)
    return train_features, eval_features
def process_test_data(hp):
    tokenizer = create_tokenizer_from_hub_module(hp)
    test = load_dataset('./input_data/test.csv')
    test_InputExamples = test.apply(
        lambda x: run_classifier_custom.InputExample(
            guid=None,
            text_a=x[hp.DATA_COLUMN],
            selected_text=None,  # No labeled span at prediction time.
            text_b=x[hp.sentiment],
            sentiment=x[hp.polarity]),
        axis=1)
    test_features = run_classifier_custom.convert_examples_to_features(
        test_InputExamples, hp.MAX_SEQ_LENGTH, tokenizer, is_predicting=True)
    return test_features
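# Hypothetical driver (not in the original file): both functions above consume
# an `hp` hyperparameter object. The attribute names are the ones referenced
# above; the column values assigned here are assumptions for the sketch.
class HP:
    DATA_COLUMN = 'text'             # column holding the raw input text
    selected_text = 'selected_text'  # column holding the labeled span
    sentiment = 'sentiment'          # column fed to BERT as text_b
    polarity = 'sentiment'           # column used as the stratification label
    MAX_SEQ_LENGTH = 128

hp = HP()
train_features, eval_features = process_data(hp)
test_features = process_test_data(hp)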
"""Convert mask to attention mask. """ extended_attention_mask = raw_mask.unsqueeze(1).unsqueeze(2) extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask.float() def create_transformer_attention_mask(raw_mask: torch.Tensor) -> torch.Tensor: """Convert mask to transformer attention mask. """ return (1 - raw_mask).bool() if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--dir", default='tensor_dataset', type=str) parser.add_argument("--max_len", default=32, type=int) args = parser.parse_args() seq_path = f'{config.data_dir}/train/in.txt' tag_path = f'{config.data_dir}/train/out.txt' vocab_path = f'{config.data_dir}/vocabs' max_seq_len = args.max_len word_to_ix = load_vocab(vocab_path) vocab_size = len(word_to_ix) seqs, tags = load_dataset(seq_path, tag_path) seqs, masks, tags = create_dataset(seqs, tags, word_to_ix, max_seq_len, word_to_ix['[PAD]']) save_dataset(seqs, masks, tags, args.dir)