def read_train_data(args, tokenizer, logger):
    """Load the ABSA training split and wrap it in a shuffling DataLoader.

    Returns a tuple ``(train_dataloader, num_train_steps)``; the step count
    is derived from batch size, gradient accumulation and epoch count.
    """
    if args.debug:
        # Small batches keep debug runs fast and memory-light.
        args.train_batch_size = 8
    train_path = os.path.join(args.data_dir, args.train_file)
    train_set = read_absa_data(train_path)
    train_examples = convert_absa_data(dataset=train_set,
                                       verbose_logging=args.verbose_logging)
    train_features = convert_examples_to_features(train_examples, tokenizer,
                                                  args.max_seq_length,
                                                  args.verbose_logging, logger)

    # Total optimizer steps across all epochs, accounting for accumulation.
    num_train_steps = int(
        len(train_features) / args.train_batch_size
        / args.gradient_accumulation_steps * args.num_train_epochs)

    logger.info("Num orig examples = %d", len(train_examples))
    logger.info("Num split features = %d", len(train_features))
    logger.info("Batch size = %d", args.train_batch_size)
    logger.info("Num steps = %d", num_train_steps)

    def _column(attr):
        # Stack one feature attribute across the dataset as a long tensor.
        return torch.tensor([getattr(f, attr) for f in train_features],
                            dtype=torch.long)

    train_data = TensorDataset(
        _column('input_ids'), _column('input_mask'), _column('segment_ids'),
        _column('start_indexes'), _column('end_indexes'),
        _column('polarity_labels'), _column('label_masks'))

    # local_rank == -1 means single-process training: shuffle randomly;
    # otherwise shard the dataset across workers with a distributed sampler.
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    return train_dataloader, num_train_steps
def read_eval_data(args, tokenizer, logger):
    """Load the ABSA evaluation split and wrap it in a sequential DataLoader.

    Returns ``(eval_examples, eval_features, eval_dataloader)``; each batch
    carries an example index so predictions can be mapped back to features.
    """
    if args.debug:
        # Small batches keep debug runs fast and memory-light.
        args.predict_batch_size = 8
    eval_path = os.path.join(args.data_dir, args.predict_file)
    eval_set = read_absa_data(eval_path)
    eval_examples = convert_absa_data(dataset=eval_set,
                                      verbose_logging=args.verbose_logging)
    eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                 args.max_seq_length,
                                                 args.verbose_logging, logger)

    logger.info("Num orig examples = %d", len(eval_examples))
    logger.info("Num split features = %d", len(eval_features))
    logger.info("Batch size = %d", args.predict_batch_size)

    def _column(attr):
        # Stack one feature attribute across the dataset as a long tensor.
        return torch.tensor([getattr(f, attr) for f in eval_features],
                            dtype=torch.long)

    input_ids = _column('input_ids')
    # Row index back-reference into eval_features for result assembly.
    example_index = torch.arange(input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(
        input_ids, _column('input_mask'), _column('segment_ids'),
        _column('start_indexes'), _column('end_indexes'),
        _column('label_masks'), example_index)

    # Evaluation preserves order locally; distributed runs shard instead.
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=args.predict_batch_size)
    return eval_examples, eval_features, eval_dataloader
def pipeline_eval_data(args, tokenizer, logger):
    """Load the ABSA evaluation split for pipeline mode, where aspect spans
    come from a previously saved extraction stage rather than gold labels.

    Requires ``args.extraction_file`` (a pickle of per-feature extraction
    predictions keyed by ``unique_id``). Returns
    ``(eval_examples, eval_features, eval_dataloader)``; span starts/ends and
    span masks in each batch are taken from the extraction predictions.
    """
    if args.debug:
        # Small batches keep debug runs fast and memory-light.
        args.predict_batch_size = 8
    eval_path = os.path.join(args.data_dir, args.predict_file)
    eval_set = read_absa_data(eval_path)
    eval_examples = convert_absa_data(dataset=eval_set,
                                      verbose_logging=args.verbose_logging)
    eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                 args.max_seq_length,
                                                 args.verbose_logging, logger)

    assert args.extraction_file is not None
    # Fix: open via a context manager so the file handle is always closed
    # (the original `pickle.load(open(...))` leaked the handle).
    # NOTE: pickle is only safe on trusted files; extraction_file is expected
    # to be produced by our own extraction stage, not untrusted input.
    with open(args.extraction_file, 'rb') as extraction_fh:
        extract_predictions = pickle.load(extraction_fh)
    extract_dict = {pred.unique_id: pred for pred in extract_predictions}
    # Align extraction predictions with features by unique_id. A KeyError
    # here means the extraction file does not cover this eval set.
    eval_extract_preds = [extract_dict[f.unique_id] for f in eval_features]
    assert len(eval_extract_preds) == len(eval_features)

    logger.info("Num orig examples = %d", len(eval_examples))
    logger.info("Num split features = %d", len(eval_features))
    logger.info("Batch size = %d", args.predict_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    # Span tensors come from the extraction predictions, not the features.
    all_span_starts = torch.tensor(
        [f.start_indexes for f in eval_extract_preds], dtype=torch.long)
    all_span_ends = torch.tensor([f.end_indexes for f in eval_extract_preds],
                                 dtype=torch.long)
    all_label_masks = torch.tensor([f.span_masks for f in eval_extract_preds],
                                   dtype=torch.long)
    # Row index back-reference into eval_features for result assembly.
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_span_starts, all_span_ends, all_label_masks,
                              all_example_index)

    # Evaluation preserves order locally; distributed runs shard instead.
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=args.predict_batch_size)
    return eval_examples, eval_features, eval_dataloader