Example #1
    def _get_data(self, label_identifier=None):
        """Original label_identifier is 'semeval2020', translate to 'semeval2021'
        If label_identifier is set, we translate here in _get_data, so that happens
        only when parsing the original dataset
        """
        self.labels = read_labels_from_file(label_identifier)

        # File containing labelled data
        label_file = os.path.join(self.task_config['dir'], self.task_config['label_file'])
        with open(label_file, 'r') as f:
            lst = f.readlines()

        self.data = []
        prev_article_id = -1
        fragments = []
        for line in lst:
            article_id, p_type, start, end = line.strip().split('\t')
            article_id = int(article_id)

            if article_id != prev_article_id:
                if prev_article_id != -1:
                    # Flush the previous article
                    fragments.sort()  # Fragments sort on 'start', their first field
                    self.data.append({
                        'id': prev_article_id,
                        'article': self.read_article(prev_article_id),
                        'fragments': fragments
                    })
                # Start collecting fragments for the new article
                prev_article_id = article_id
                fragments = []

            # FIXME: hardcoded exception for the original 'semeval2020' labels
            if label_identifier != 'semeval2020':
                for n_p_type in normalize_label(p_type):
                    fragments.append(Fragment(int(start), int(end), n_p_type))
            else:
                fragments.append(Fragment(int(start), int(end), p_type))

        if fragments:
            # Flush the last article
            fragments.sort()
            self.data.append({
                'id': prev_article_id,
                'article': self.read_article(prev_article_id),
                'fragments': fragments
            })
        print("Total fragments =", sum(len(d['fragments']) for d in self.data))
        return self.labels, self.data
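
Both branches above lean on a Fragment record that sorts by its start offset and a normalize_label helper that maps one source label onto zero or more target labels. Neither is defined on this page; the following is a minimal sketch of plausible definitions, assuming a NamedTuple and a lookup table, not the project's actual code:

from typing import List, NamedTuple

class Fragment(NamedTuple):
    # NamedTuples compare field by field, so fragments.sort() orders by 'start'
    start: int
    end: int
    p_type: str

# Hypothetical mapping table: a combined source label expands into several
# normalized labels; anything without an entry passes through unchanged
LABEL_MAP = {
    'Label_A,Label_B': ['Label_A', 'Label_B'],
}

def normalize_label(p_type: str) -> List[str]:
    return LABEL_MAP.get(p_type, [p_type])
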
Example #2
    def __init__(self, dir=None, label_identifier=None):
        self.dir = dir
        self.data = []
        self.labels = []
        self.label_identifier = label_identifier
        if label_identifier is not None:
            self.labels = read_labels_from_file(label_identifier)
        self.config = pipeline_config.config
        self.read_articles = {}
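
A hedged usage sketch of this constructor (the class name DataLoader is an assumption): passing a label_identifier resolves the label set eagerly, while passing none leaves self.labels empty until a later call fills it.

loader = DataLoader()                                      # self.labels stays []
loader_2021 = DataLoader(label_identifier='semeval2021')   # labels read from file
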
Example #3
def translate(labels: list, data: list) -> tuple:
    """Translate one set of labels to another."""
    new_data = []
    for d in data:
        article_id = d['id']
        article = d['article']
        n_fragments = []
        for start, end, p_type in d['fragments']:
            # A single source label may map to several normalized labels
            for n_p_type in normalize_label(p_type):
                n_fragments.append((start, end, n_p_type))
        new_data.append({'id': article_id, 'article': article, 'fragments': n_fragments})
    new_labels = read_labels_from_file(pipeline_config.config.pipeline_config['data']['labels'])
    return new_labels, new_data
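
Because translate only re-maps labels on already-parsed data, it can run after any of the _get_data variants. A hedged usage sketch, with the data shape inferred from the other examples and the mapping sketched under Example #1:

data = [{'id': 1, 'article': '...', 'fragments': [(0, 7, 'Label_A,Label_B')]}]
new_labels, new_data = translate([], data)
# One input fragment is re-emitted once per normalized label:
# new_data[0]['fragments'] == [(0, 7, 'Label_A'), (0, 7, 'Label_B')]
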
Example #4
    def _get_data(self, label_identifier="semeval2021"):
        self.labels = read_labels_from_file(label_identifier)
        self.data = []
        training_set_file = os.path.join(self.task_config['dir'], self.task_config['label_file'])
        file_list = [x.strip() for x in training_set_file.split(',')]
        for file_name in file_list:
            with open(file_name, 'r', encoding='utf8') as f:
                json_data = json.load(f)

            for item in json_data:
                id = item['id']
                txt = item['text']
                labels = item['labels']  # labels = [{start:, end:, technique:, text_fragment}, ...]
                # We skip that 'text_fragment' as it can be derived from text and start/end
                fragments = []
                for frag in labels:
                    if frag['text_fragment'].strip() == '':
                        # Deal with a '\n' fragment by skipping it
                        continue
                    fragments.append(Fragment(frag['start'], frag['end'], frag['technique']))
                fragments.sort()
                self.data.append({'id': id, 'article': txt, 'fragments': fragments})
        return self.labels, self.data
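
The JSON layout this loader expects can be read off the keys it accesses; shown here as a Python literal with illustrative values (the real SemEval files may carry additional fields):

sample_item = {
    'id': '123',
    'text': 'Full article text ...',
    'labels': [
        {'start': 0, 'end': 4, 'technique': 'Some_Technique',
         'text_fragment': 'Full'},
    ],
}
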
Example #5
    def __init__(self, task_name='2021-task6-3'):
        super().__init__(task_name=task_name)
        self.labels = read_labels_from_file("semeval2021_3")
        print("DATALOADER 2021 - 3", "+" * 20)
Example #6
    def __init__(self):
        with open('dev.yaml', 'r') as f:
            self._config = yaml.load(f, Loader=yaml.SafeLoader)
        self.modelled_labels = read_labels_from_file("semeval2021")
Example #7
    def execute(self):
        def capitalize(s):
            return s[0].upper() + s[1:]

        scenario = self.config.get_scenario()
        print(
            f"******* starting pipeline {self.config['pipeline']['cur_scenario']} ******"
        )
        for current_step in scenario:
            print(
                f" =================== Current step: {current_step} ============================="
            )
            self.config.set_current_pipeline_step(current_step)
            self.pipeline_config = self.config.pipeline_config

            logger.info('******** LOADING DATA *******')
            self.get_data_loader()

            # Training phase
            logger.info('******* TRAINING *******')
            print("get task type =========== ", self.config.get_task_type())
            # Resolve the converter class up front: the evaluation phase
            # needs it even when training is disabled
            train_phase = self.pipeline_config['train']
            train_module = __import__('train')
            converter_module = __import__('converter')
            converter_class = getattr(
                converter_module,
                capitalize(train_phase['converter']) + 'Converter')
            if 'disable' not in train_phase or \
                    train_phase['disable'] is not True:
                if self.config.get_task_type() == 'sequence':
                    trainer_class_name = 'SpanTrainer'
                else:
                    trainer_class_name = 'ClassificationTrainer'
                trainer_class = getattr(train_module, trainer_class_name)
                trainer = trainer_class(self.dataloader, converter_class())
                trainer.train()
            else:
                logger.info(">>> training disabled")

            # Evaluation phase
            logger.info('******* EVALUATION *******')
            if 'disable' not in self.pipeline_config['eval'] or \
                    self.pipeline_config['eval']['disable'] is not True:

                if self.config.get_task_type() == 'sequence':
                    evaluator = SpanEvaluator(self.dataloader)
                else:
                    evaluator = ClassificationEvaluator(self.dataloader)

                evaluator.evaluate(converter_class())

                # Create a score file for evaluation
                if self.config.get_task_type() != 'sequence':
                    from score import score_task1
                    pred_file = self.config.get_output_data_dir() + 'predict1.txt'
                    gold_file = self.config.get_output_data_dir() + 'true1.txt'
                    score_task1(predict_file=pred_file, true_file=gold_file)

                    # Run the scorer
                    sys.path.append(
                        os.path.realpath('SEMEVAL-2021-task6-corpus'))

                    from scorer.task1_3 import evaluate, validate_files
                    CLASSES = read_labels_from_file(
                        self.pipeline_config['data']['labels'])

                    if validate_files(pred_file, gold_file, CLASSES):
                        logger.info('Prediction file format is correct')
                        macro_f1, micro_f1 = evaluate(pred_file, gold_file,
                                                      CLASSES)
                        logger.info("macro-F1={:.5f}\tmicro-F1={:.5f}".format(
                            macro_f1, micro_f1))
                    else:
                        print("Failed to validate prediction & gold files")

                else:
                    print("No scoring for sequence type")

            else:
                print("Evaluation is disabled")

            # Post-processing phase
            if 'postprocess' in self.pipeline_config and \
                    self.pipeline_config['postprocess'].get('disable') is not True:
                logger.info('******* POST-PROCESSING *******')
                postprocess_phase = self.pipeline_config['postprocess']
                postprocess_module = __import__('postprocess')
                postprocess_class = getattr(
                    postprocess_module,
                    postprocess_phase['processor']['class'])
                postprocessor = postprocess_class(
                    self.config.get_model_type(),
                    self.config.get_output_dir(),
                    **postprocess_phase['processor']  # The processor config dict is expanded as keyword arguments
                )
                postprocessor.execute()

            else:
                print('No post-processing defined')
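
execute() is driven entirely by dictionary lookups on pipeline_config. A minimal sketch of the shape it expects, with the keys inferred from the reads above and every value illustrative:

pipeline_config = {
    'train': {
        'converter': 'span',   # resolved to converter.SpanConverter via capitalize()
        'disable': False,
    },
    'eval': {'disable': False},
    'data': {'labels': 'semeval2021'},
    'postprocess': {
        'disable': False,
        # This dict is expanded as keyword arguments into the post-processor
        'processor': {'class': 'SomePostProcessor'},  # hypothetical class name
    },
}
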