Example 1
    def __init__(self):
        self._instances: List[Instance] = list()

        # Build 100 instances in complementary pairs: x = i and x = 100 - i
        max_num = 100
        for i in range(50):
            instance1 = Instance()
            instance1["x"] = i

            instance2 = Instance()
            instance2["x"] = max_num - i

            self._instances.append(instance1)
            self._instances.append(instance2)
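
These snippets all fill an Instance container indexed by string field names. A minimal dict-backed stand-in (an assumption for illustration, not the library's actual class) is enough to run the examples on this page:

from typing import Any, Dict

class Instance:
    """Hypothetical stand-in: a mapping from field names to field values."""

    def __init__(self) -> None:
        self._fields: Dict[str, Any] = {}

    def __setitem__(self, field_name: str, value: Any) -> None:
        self._fields[field_name] = value

    def __getitem__(self, field_name: str) -> Any:
        return self._fields[field_name]
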
Example 2
    def __init__(self, dataset_file_path: str):
        """
        初始化, 会将数据集转换成 instance
        :param dataset_file_path: 数据集文件路径
        """
        self._instances: List[Instance] = list()
        self._tokenizer = ZhTokenizer()

        with open(dataset_file_path, encoding="utf-8") as dataset_file:
            for line in dataset_file:
                line = line.strip()
                if len(line) == 0:
                    continue
                seg_tags = line.split()

                sentence = list()
                sequence_label = list()

                for seg_tag in seg_tags:
                    # Each item has the form "segment/tag"
                    seg_tag_item = seg_tag.split("/")

                    assert len(seg_tag_item) == 2, \
                        f"{seg_tag} was not split into 2 parts: {line}"

                    seg, tag = seg_tag_item

                    # Extend the sentence character by character
                    sentence.extend(seg)

                    if tag == MsraDataset.NONE:
                        bio_tags = ["O"] * len(seg)
                    elif tag == MsraDataset.PER:
                        bio_tags = ["I-PER"] * len(seg)
                        bio_tags[0] = "B-PER"
                    elif tag == MsraDataset.LOC:
                        bio_tags = ["I-LOC"] * len(seg)
                        bio_tags[0] = "B-LOC"
                    elif tag == MsraDataset.ORG:
                        bio_tags = ["I-ORG"] * len(seg)
                        bio_tags[0] = "B-ORG"
                    else:
                        raise RuntimeError(
                            f"tag: {tag} is invalid; expected one of "
                            f"{MsraDataset.NONE}, {MsraDataset.PER}, "
                            f"{MsraDataset.LOC}, {MsraDataset.ORG}")
                    sequence_label.extend(bio_tags)

                sentence = "".join(sentence)
                instance = Instance()
                instance["tokens"] = self._tokenizer.tokenize(sentence)
                instance["sequence_label"] = sequence_label
                instance["metadata"] = {
                    "text": sentence,
                    "labels": sequence_label
                }

                self._instances.append(instance)
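
The per-segment branch above is the standard character-level BIO expansion. A compact sketch of the same rule with the entity type as a parameter (a hypothetical helper, not part of the original class):

from typing import List, Optional

def char_bio_labels(segment: str, entity_type: Optional[str]) -> List[str]:
    """One BIO label per character: B-X for the first char, I-X for the rest."""
    if entity_type is None:
        return ["O"] * len(segment)
    labels = [f"I-{entity_type}"] * len(segment)
    labels[0] = f"B-{entity_type}"
    return labels

print(char_bio_labels("北京", "LOC"))  # ['B-LOC', 'I-LOC']
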
Example 3
    def create_instance(self,
                        input_data: Dict) -> Union[Instance, List[Instance]]:
        instance = Instance()

        instance["context"] = "".join(input_data["context"].split())
        instance["query"] = input_data["query"]

        instance["entity_label"] = input_data.get("entity_label", None)
        instance["impossible"] = input_data.get("impossible", None)

        instance["start_positions"] = input_data.get("start_position", None)
        instance["end_positions"] = input_data.get("end_position", None)

        return instance
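
Because the context is compacted by splitting and rejoining, any start/end character positions presumably index into the whitespace-free string, not the original. A quick check of that effect:

# Whitespace is stripped from the context, so character offsets must index
# into the compacted string, not the original one.
context = "北京 天安门 广场"
compact = "".join(context.split())
print(compact)                      # 北京天安门广场
print(len(context), len(compact))  # 9 7
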
Example 4
    def __init__(self, dataset_file_path: str):
        self._instances: List[Instance] = list()

        # Read the whole file
        with open(dataset_file_path, encoding="utf-8") as f:
            content = f.read()

        # Parse the XML with BeautifulSoup
        soup = BeautifulSoup(content, "lxml")

        sentence_tags = soup.find_all('sentence')

        for sentence_tag in sentence_tags:
            # Extract the sentence text
            sentence = sentence_tag.text.strip()

            # Extract the aspect terms
            aspect_term_tags = sentence_tag.find_all('aspectterm')
            aspect_terms = []
            for aspect_term_tag in aspect_term_tags:
                term = aspect_term_tag['term'].strip()
                polarity = aspect_term_tag['polarity']
                from_index = int(aspect_term_tag['from'])
                to_index = int(aspect_term_tag['to'])
                aspect_term = {
                    "term": term,
                    "polarity": polarity,
                    "begin": from_index,
                    "end": to_index
                }

                aspect_terms.append(aspect_term)

            # Extract the aspect categories
            aspect_categories = []
            aspect_category_tags = sentence_tag.find_all('aspectcategory')

            for aspect_category_tag in aspect_category_tags:
                category = aspect_category_tag['category'].strip()
                polarity = aspect_category_tag['polarity'].strip()
                aspect_category = {"category": category, "polarity": polarity}
                aspect_categories.append(aspect_category)

            instance = Instance()
            instance["sentence"] = sentence
            instance["aspect_categories"] = aspect_categories
            instance["aspect_terms"] = aspect_terms

            self._instances.append(instance)
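
The parser expects SemEval-2014-style XML. A minimal fragment (my reconstruction, not the official file) also shows why the lowercase tag names work: BeautifulSoup's lxml HTML mode lowercases tags such as aspectTerm to aspectterm:

from bs4 import BeautifulSoup

xml = """
<sentences>
  <sentence id="1">
    <text>The food was great but the service was slow.</text>
    <aspectTerms>
      <aspectTerm term="food" polarity="positive" from="4" to="8"/>
      <aspectTerm term="service" polarity="negative" from="27" to="34"/>
    </aspectTerms>
    <aspectCategories>
      <aspectCategory category="food" polarity="positive"/>
      <aspectCategory category="service" polarity="negative"/>
    </aspectCategories>
  </sentence>
</sentences>
"""

soup = BeautifulSoup(xml, "lxml")
for sentence_tag in soup.find_all("sentence"):
    print(sentence_tag.text.strip())
    for term_tag in sentence_tag.find_all("aspectterm"):
        print(term_tag["term"], term_tag["polarity"],
              int(term_tag["from"]), int(term_tag["to"]))
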
Example 5
    def __init__(self, dataset_file_path: str):
        """
        初始化
        :param dataset_file_path: 数据集的文件路径
        """
        super().__init__()

        self._instances: List[Instance] = list()

        tokenizer = ZhTokenizer(is_remove_invalidate_char=False)

        logging.info(
            f"Begin read lattice ner demo dataset: {dataset_file_path}")

        with open(dataset_file_path, encoding="utf-8") as data_file:

            # The lines between two divider lines form one sample
            for is_divider, lines in itertools.groupby(
                    data_file, LatticeNerDemoDataset._is_divider):

                if not is_divider:

                    lines = list(lines)
                    fields = [line.strip().split() for line in lines]

                    # Transpose: column 0 is the tokens, column 1 the BMES labels
                    fields = [list(field) for field in zip(*fields)]
                    tokens_, bmes_labels = fields

                    text = "".join(tokens_)

                    # logging.debug(f"text: {text}")
                    tokens = tokenizer.tokenize(text)

                    assert len(tokens) == len(bmes_labels), \
                        f"token length {len(tokens)} does not match label length {len(bmes_labels)}"

                    bio_labels = bmes_to_bio(bmes_labels)

                    instance = Instance()
                    instance["metadata"] = {
                        "text": text,
                        "sequence_label": bio_labels
                    }
                    instance["tokens"] = tokens
                    instance["sequence_label"] = bio_labels

                    self._instances.append(instance)
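
bmes_to_bio is called here but not shown. A plausible sketch of such a conversion (the project's real implementation may differ): B and S open a span, M and E continue it.

from typing import List

def bmes_to_bio(bmes_labels: List[str]) -> List[str]:
    """B-X stays B-X, S-X becomes B-X, M-X and E-X become I-X, O stays O."""
    bio_labels = []
    for label in bmes_labels:
        if label == "O":
            bio_labels.append("O")
            continue
        prefix, entity = label.split("-", 1)
        bio_labels.append(f"B-{entity}" if prefix in ("B", "S") else f"I-{entity}")
    return bio_labels

print(bmes_to_bio(["B-LOC", "E-LOC", "S-PER", "O"]))
# ['B-LOC', 'I-LOC', 'B-PER', 'O']
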
Example 6
    def __init__(self, dataset_file_path: str,
                 event_type_vocabulary: Vocabulary):
        """
        初始化 ACE Event Dataset
        :param dataset_file_path: 数据集的文件路基
        """
        super().__init__()
        self._ace_dataset = ACEDataset(dataset_file_path=dataset_file_path)

        self._instances: List[Instance] = list()

        for ori_instance in self._ace_dataset:

            ori_event_types = ori_instance["event_types"]

            ori_event_type_set = None

            if ori_event_types is not None:  # ori_event_types is None at actual prediction time
                # Set up for training and validation: for the pair <sentence, unk>, label = 1
                ori_event_type_set = set(ori_event_types)

                if len(ori_event_type_set) == 0:
                    ori_event_type_set.add(event_type_vocabulary.unk)

            for index in range(event_type_vocabulary.size):
                # Iterate over all labels, forming <sentence, event type> pairs as samples
                event_type = event_type_vocabulary.token(index)

                instance = Instance()

                instance["sentence"] = ori_instance["sentence"]

                instance["entity_tag"] = ori_instance["entity_tag"]

                instance["event_type"] = event_type
                instance["metadata"] = ori_instance["metadata"]

                if ori_event_type_set is not None:
                    if event_type in ori_event_type_set:
                        instance["label"] = 1
                    else:
                        instance["label"] = 0
                else:
                    # At actual prediction time there is no label
                    pass

                self._instances.append(instance)
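
A tiny worked example of the pairing scheme above, with plain Python values standing in for the vocabulary and instances (the names here are illustrative):

event_type_vocabulary = ["<unk>", "Attack", "Movement:Transport"]  # hypothetical
ori_event_type_set = {"Attack"}  # gold event types of one sentence

for event_type in event_type_vocabulary:
    label = 1 if event_type in ori_event_type_set else 0
    print(event_type, label)
# <unk> 0
# Attack 1
# Movement:Transport 0
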
Example 7
    @staticmethod
    def text_to_instance(sentence: str, tokens: List[str],
                         entity_tags: List[str],
                         event_types: List[str]) -> Instance:

        instance = Instance()

        # Lowercase each token before wrapping it in a Token
        instance["sentence"] = [Token(t.lower()) for t in tokens]

        instance["entity_tag"] = entity_tags

        instance["event_types"] = event_types
        instance["metadata"] = {
            "sentence": sentence,
            "event_types": event_types
        }

        return instance
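
Token is not defined in this snippet; a minimal stand-in that wraps one word is enough to exercise the lowercasing step above (an assumption for illustration, not the library's actual type):

from typing import List

class Token:
    """Hypothetical stand-in for the library's Token type."""

    def __init__(self, text: str) -> None:
        self.text = text

    def __repr__(self) -> str:
        return f"Token({self.text!r})"

tokens: List[Token] = [Token(t.lower()) for t in ["He", "visited", "Baghdad", "."]]
print(tokens)  # [Token('he'), Token('visited'), Token('baghdad'), Token('.')]
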
Example 8
    def __init__(self, dataset_file_path: str):
        self._sem_eval_dataset = SemEvalDataset(
            dataset_file_path=dataset_file_path)

        self._instances: List[Instance] = list()

        for sem_eval_instance in self._sem_eval_dataset:

            sentence = sem_eval_instance["sentence"]
            aspect_categories = sem_eval_instance["aspect_categories"]

            # One instance per aspect category: <sentence, category> -> polarity
            for aspect_category in aspect_categories:
                instance = Instance()
                instance["sentence"] = sentence
                instance["category"] = aspect_category["category"]
                instance["label"] = aspect_category["polarity"]

                self._instances.append(instance)
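
A worked example of the flattening above: one sentence with two aspect categories yields two instances (plain dicts used for illustration):

sem_eval_instance = {
    "sentence": "The food was great but the service was slow.",
    "aspect_categories": [
        {"category": "food", "polarity": "positive"},
        {"category": "service", "polarity": "negative"},
    ],
}

instances = []
for aspect_category in sem_eval_instance["aspect_categories"]:
    instances.append({
        "sentence": sem_eval_instance["sentence"],
        "category": aspect_category["category"],
        "label": aspect_category["polarity"],
    })

print(len(instances))  # 2
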