def __init__(self, dataset_file_path: str, event_type_vocabulary: Vocabulary):
    """
    Initialize the ACE event dataset.

    Expands every source sentence into one instance per event type in the
    vocabulary, i.e. pair<sentence, event_type> samples.

    :param dataset_file_path: path to the dataset file
    :param event_type_vocabulary: vocabulary of event types
    """
    super().__init__()
    self._ace_dataset = ACEDataset(dataset_file_path=dataset_file_path)
    self._instances: List[Instance] = list()

    for source_instance in self._ace_dataset:
        raw_event_types = source_instance["event_types"]

        # At real prediction time "event_types" is None, so no gold set is
        # built.  For training/validation, an empty gold set falls back to
        # the unk token so that pair<sentence, unk> gets label = 1.
        gold_event_types = None
        if raw_event_types is not None:
            gold_event_types = set(raw_event_types)
            if not gold_event_types:
                gold_event_types.add(event_type_vocabulary.unk)

        # Walk every label in the vocabulary to form pair<sentence, event
        # type> samples.
        for token_index in range(event_type_vocabulary.size):
            candidate_type = event_type_vocabulary.token(token_index)

            paired_instance = Instance()
            paired_instance["sentence"] = source_instance["sentence"]
            paired_instance["entity_tag"] = source_instance["entity_tag"]
            paired_instance["event_type"] = candidate_type
            paired_instance["metadata"] = source_instance["metadata"]

            # Only training/validation instances carry a label; prediction
            # instances are left unlabeled on purpose.
            if gold_event_types is not None:
                paired_instance["label"] = 1 if candidate_type in gold_event_types else 0

            self._instances.append(paired_instance)
def __init__(self, event_type_vocabulary: Vocabulary):
    """
    Initialize the adapter with one F1 metric per event type.

    :param event_type_vocabulary: event type vocabulary
    """
    super().__init__()

    # One LabelF1Metric per concrete event type; the unk token is skipped.
    # Each metric scores the positive label 1.
    self._event_type_f1: Dict[str, LabelF1Metric] = dict()
    for token_index in range(event_type_vocabulary.size):
        event_type = event_type_vocabulary.token(token_index)
        if event_type == event_type_vocabulary.unk:
            continue
        self._event_type_f1[event_type] = LabelF1Metric(
            labels=[1], label_vocabulary=None)

    # A dedicated metric that aggregates across all event types.
    self._event_type_f1[EventF1MetricAdapter.__OVERALL] = LabelF1Metric(
        labels=[1], label_vocabulary=None)

    self._event_type_vocabulary = event_type_vocabulary