def _get_data(self, label_identifier=None):
    """Read the tab-separated label file and group fragments per article.

    Original label_identifier is 'semeval2020', translate to 'semeval2021'.
    If label_identifier is set, the translation happens here in _get_data,
    so it occurs only when parsing the original dataset.

    Args:
        label_identifier: name of the label set to load; anything other
            than 'semeval2020' has its raw labels expanded through
            normalize_label (one raw label may map to several).

    Returns:
        (labels, data) where data is a list of dicts with keys
        'id', 'article' and 'fragments' (fragments sorted by start offset).
    """
    def make_fragments(p_type, start, end):
        # FIXME: hardcoded exceptions for label_identifiers.
        # Non-semeval2020 labels are normalized (possibly 1-to-many);
        # semeval2020 labels are kept as-is.
        if label_identifier != 'semeval2020':
            return [Fragment(int(start), int(end), n_p_type)
                    for n_p_type in normalize_label(p_type)]
        return [Fragment(int(start), int(end), p_type)]

    self.labels = read_labels_from_file(label_identifier)

    # File containing labelled data: one "article_id\tp_type\tstart\tend" per line.
    label_file = os.path.join(self.task_config['dir'],
                              self.task_config['label_file'])
    with open(label_file, 'r') as f:
        lines = f.readlines()

    self.data = []
    prev_article_id = -1
    fragments = []

    def flush_article():
        # Append the accumulated article; sort on the first tuple
        # element, that is, 'start'.
        fragments.sort()
        self.data.append({
            'id': prev_article_id,
            'article': self.read_article(prev_article_id),
            'fragments': fragments,
        })

    for line in lines:
        article_id, p_type, start, end = line.strip().split('\t')
        article_id = int(article_id)
        if article_id != prev_article_id:
            if prev_article_id != -1:
                flush_article()
            # Prepare the new article.
            prev_article_id = article_id
            fragments = []
        fragments.extend(make_fragments(p_type, start, end))

    if fragments:
        # FIX: the original appended the last article without sorting its
        # fragments; flush_article() sorts consistently for every article.
        flush_article()

    print("Total fragments = ", sum(len(d['fragments']) for d in self.data))
    return self.labels, self.data
def __init__(self, label_identifier=None):
    """Base initializer: set up empty data/label containers, optionally
    pre-load a label set, and attach the shared pipeline configuration.

    Args:
        label_identifier: if given, labels are loaded immediately via
            read_labels_from_file; otherwise self.labels stays empty.
    """
    # NOTE(review): `dir` here is the Python *builtin* function — no `dir`
    # parameter or visible global exists in this chunk, so self.dir is
    # bound to the builtin. Looks like a leftover from a removed
    # parameter; confirm the intended value.
    self.dir = dir
    self.data = []
    self.labels = []
    self.label_identifier = label_identifier
    if label_identifier is not None:
        self.labels = read_labels_from_file(label_identifier)
    # Shared pipeline configuration object (module-level singleton).
    self.config = pipeline_config.config
    #if task_name:
    #    self.task_config = self.config['data'][task_name]
    # Cache of already-read articles — presumably keyed by article id;
    # TODO confirm against read_article().
    self.read_articles = {}
def translate(labels: list, data: list) -> tuple:
    """Translate one set of labels to another.

    Every fragment's raw label is expanded through normalize_label (a raw
    label may map to several normalized labels), and the target label set
    is read from the pipeline configuration.

    Args:
        labels: current label set. NOTE(review): unused by this function;
            kept in the signature for backward compatibility with callers.
        data: list of dicts with keys 'id', 'article' and 'fragments',
            where each fragment is a (start, end, p_type) triple.

    Returns:
        (new_labels, new_data): the configured label set and the data with
        normalized fragment labels.
    """
    new_data = []
    for entry in data:
        # One raw label may expand to several normalized labels, so the
        # fragment list can grow.
        normalized_fragments = [
            (start, end, n_p_type)
            for start, end, p_type in entry['fragments']
            for n_p_type in normalize_label(p_type)
        ]
        new_data.append({
            'id': entry['id'],
            'article': entry['article'],
            'fragments': normalized_fragments,
        })
    new_labels = read_labels_from_file(
        pipeline_config.config.pipeline_config['data']['labels'])
    return new_labels, new_data
def _get_data(self, label_identifier="semeval2021"):
    """Load articles and labelled fragments from one or more JSON files.

    The task config's 'label_file' entry may be a comma-separated list of
    JSON files; each file holds a list of items with 'id', 'text' and
    'labels' keys.

    Args:
        label_identifier: name of the label set to load.

    Returns:
        (labels, data) where data is a list of dicts with keys
        'id', 'article' and 'fragments' (fragments sorted by start offset).
    """
    self.labels = read_labels_from_file(label_identifier)
    self.data = []
    # 'label_file' may name several files separated by commas.
    training_set_file = os.path.join(self.task_config['dir'],
                                     self.task_config['label_file'])
    file_list = [x.strip() for x in training_set_file.split(',')]
    for file_name in file_list:
        with open(file_name, 'r', encoding='utf8') as f:
            json_data = json.load(f)
        for item in json_data:
            # item['labels'] = [{start:, end:, technique:, text_fragment}, ...]
            # 'text_fragment' is redundant — derivable from text + start/end.
            fragments = []
            for frag in item['labels']:
                if not frag['text_fragment'].strip():
                    # Deal with a whitespace-only (e.g. '\n') fragment by
                    # skipping it.
                    continue
                fragments.append(
                    Fragment(frag['start'], frag['end'], frag['technique']))
            fragments.sort()
            self.data.append({
                'id': item['id'],
                'article': item['text'],
                'fragments': fragments,
            })
    return self.labels, self.data
def __init__(self, task_name='2021-task6-3'):
    """Initialize the loader for SemEval-2021 task 6 subtask 3: delegate
    to the base loader, then override labels with the subtask-3 set."""
    super().__init__(task_name=task_name)
    self.labels = read_labels_from_file("semeval2021_3")
    print("DATALOADER 2021 - 3", "+"*20)
def __init__(self):
    """Parse the pipeline configuration from dev.yaml and load the label
    set that the models are trained on."""
    with open('dev.yaml', 'r') as config_file:
        # safe_load == load(..., Loader=SafeLoader): no arbitrary objects.
        self._config = yaml.safe_load(config_file)
    self.modelled_labels = read_labels_from_file("semeval2021")
def execute(self):
    """Run every step of the configured pipeline scenario.

    For each step: load data, then (unless disabled) train, evaluate
    (with scoring for non-sequence tasks), and post-process.
    """
    def capitalize(s):
        # Upper-case only the first character; str.capitalize() would
        # also lower-case the rest, which would break class-name lookup.
        return s[0].upper() + s[1:]
    # NOTE(review): this result is unused — the loop below calls
    # get_scenario() a second time.
    scenario = self.config.get_scenario()
    print(
        f"******* starting pipeline {self.config['pipeline']['cur_scenario']} ******"
    )
    for current_step in self.config.get_scenario():
        print(
            f" =================== Current step: {current_step} ============================="
        )
        # Point the shared config at the current step, then snapshot it.
        self.config.set_current_pipeline_step(current_step)
        self.pipeline_config = self.config.pipeline_config
        logger.info('******** LOADING DATA *******')
        self.get_data_loader()
        # Training phase. Runs unless the config explicitly sets
        # train.disable to True ("not 'disable' in ..." parses as
        # "('disable' not in ...)").
        logger.info('******* TRAINING *******')
        print("get task type =========== ", self.config.get_task_type())
        if not 'disable' in self.pipeline_config['train'].keys() or \
           self.pipeline_config['train']['disable'] is not True:
            train_phase = self.pipeline_config['train']
            # Modules are resolved at runtime so the pipeline stays
            # configuration-driven.
            train_module = __import__('train')
            converter_module = __import__('converter')
            # e.g. converter 'foo' -> class FooConverter.
            converter_class = getattr(
                converter_module,
                capitalize(train_phase['converter']) + 'Converter')
            # Sequence tasks use span training; everything else is
            # classification.
            if self.config.get_task_type() == 'sequence':
                trainer_class_name = 'SpanTrainer'
            else:
                trainer_class_name = 'ClassificationTrainer'
            trainer_class = getattr(train_module, trainer_class_name)
            trainer = trainer_class(self.dataloader, converter_class())
            trainer.train()
        else:
            logger.info(">>> training disabled")
            # Still resolve the converter class: the evaluation phase
            # below needs converter_class even when training is skipped.
            train_phase = self.pipeline_config['train']
            train_module = __import__('train')
            converter_module = __import__('converter')
            converter_class = getattr(
                converter_module,
                capitalize(train_phase['converter']) + 'Converter')
        # Evaluation phase (same disable convention as training).
        logger.info('******* EVALUATION *******')
        if not 'disable' in self.pipeline_config['eval'].keys() or \
           self.pipeline_config['eval']['disable'] is not True:
            if self.config.get_task_type() == 'sequence':
                #evaluator = SequenceEvaluator(self.dataloader)
                evaluator = SpanEvaluator(self.dataloader)
            else:
                evaluator = ClassificationEvaluator(self.dataloader)
            evaluator.evaluate(converter_class())
            # Create a score file for evaluation (classification only;
            # there is no scorer for sequence tasks).
            if self.config.get_task_type() != 'sequence':
                from score import score_task1
                pred_file = self.config.get_output_data_dir(
                ) + 'predict1.txt'
                gold_file = self.config.get_output_data_dir() + 'true1.txt'
                score_task1(predict_file=pred_file, true_file=gold_file)
                # Run the scorer from the external SemEval corpus repo,
                # which must be checked out next to this project.
                sys.path.append(
                    os.path.realpath('SEMEVAL-2021-task6-corpus'))
                from scorer.task1_3 import evaluate, validate_files
                # (pred_fpath, gold_fpath, CLASSES):
                # from format_checker.task1_3 import validate_files
                CLASSES = read_labels_from_file(
                    self.pipeline_config['data']['labels'])
                # Validate file format before scoring; a malformed
                # prediction file is reported instead of crashing evaluate.
                if validate_files(pred_file, gold_file, CLASSES):
                    logger.info('Prediction file format is correct')
                    macro_f1, micro_f1 = evaluate(pred_file, gold_file,
                                                  CLASSES)
                    logger.info("macro-F1={:.5f}\tmicro-F1={:.5f}".format(
                        macro_f1, micro_f1))
                else:
                    print("Failed to validate prediction & gold files")
            else:
                print("No scoring for sequence type")
        else:
            print("Evaluation is disabled")
        # Post-processing phase. Note the asymmetry with train/eval: this
        # one only runs when a 'postprocess' section exists AND is not
        # disabled.
        if 'postprocess' in self.pipeline_config.keys() and \
           self.pipeline_config['postprocess']['disable'] is not True:
            logger.info('******* POST-PROCESSING *******')
            postprocess_phase = self.pipeline_config['postprocess']
            postprocess_module = __import__('postprocess')
            postprocess_class = getattr(
                postprocess_module, postprocess_phase['processor']['class'])
            postprocessor = postprocess_class(
                self.config.get_model_type(),
                self.config.get_output_dir(),
                **postprocess_phase[
                    'processor']  # This should return a dictionary
            )
            postprocessor.execute()
        else:
            print('No post-processing defined')