Example #1
def analyse(text):
    text = mojimoji.han_to_zen(text)
    sentences = ssplit(text)
    sents = []
    for sentence in sentences:
        sentlen = len(sentence)
        if sentlen > 100:
            # slide a 100 char window to limit input text length
            sents.extend([
                sentence[i * 100:i * 100 + 100]
                for i in range((sentlen + 99) // 100)  # ceil division avoids an empty trailing chunk
            ])
        else:
            sents.append(sentence)
    analysed_text = model.predict(sents)
    xml = mednerj2xml("\n".join(analysed_text))
    return xml
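The windowing step above can be isolated into a small helper; a minimal sketch of the same fixed-width chunking using only plain Python strings (the helper name is illustrative, not part of the original module):

def chunk_sentence(sentence, width=100):
    """Split a sentence into fixed-width character windows (no overlap)."""
    # stepping the start index by `width` never yields an empty trailing chunk
    return [sentence[i:i + width] for i in range(0, len(sentence), width)]

# e.g. chunk_sentence("あ" * 250) returns chunks of 100, 100 and 50 characters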
Example #2
    def analyze(self,
                source: Union[Path, str],
                knp_dir: Optional[str] = None) -> Tuple[list, PASDataset]:
        if isinstance(source, Path):
            self.logger.info(f'read knp files from {source}')
            save_dir = source
        else:
            save_dir = Path(knp_dir) if knp_dir is not None else Path(
                'log') / datetime.now().strftime(r'%m%d_%H%M%S')
            save_dir.mkdir(exist_ok=True, parents=True)
            sents = [self.sanitize_string(sent) for sent in ssplit(source)]
            self.logger.info('input: ' + ''.join(sents))
            knp_out = ''
            for i, sent in enumerate(sents):
                knp_out_ = self._apply_knp(sent)
                knp_out_ = knp_out_.replace('# S-ID:1', f'# S-ID:0-{i + 1}')
                knp_out += knp_out_
            with save_dir.joinpath('0.knp').open(mode='wt') as f:
                f.write(knp_out)

        return self._analysis(save_dir)
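A minimal sketch of the raw-text branch above in isolation, with the KNP invocation and the sentence splitter passed in as plain callables (the function and parameter names here are illustrative, not part of the original class):

from pathlib import Path

def analyze_raw_text(source: str, save_dir: Path, ssplit, apply_knp):
    """Split `source` into sentences, run `apply_knp` on each, and renumber S-IDs."""
    save_dir.mkdir(exist_ok=True, parents=True)
    knp_out = ''
    for i, sent in enumerate(ssplit(source)):
        out = apply_knp(sent)
        # each single-sentence KNP result carries '# S-ID:1'; rewrite it so
        # sentences remain distinguishable inside the combined document
        knp_out += out.replace('# S-ID:1', f'# S-ID:0-{i + 1}')
    (save_dir / '0.knp').write_text(knp_out)
    return save_dir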
Example #3
    ls = [row for row in reader]

    #    r = ls[4]
    #    print(r)
    #    sentences = ssplit(ls[4][0])
    #    print('step1 : {}'.format(sentences))

    #    for i in range(len(l)):
    #        print('step1-{} : {}'.format(i,l[i][0]))
    #        sentences = ssplit(l[i][0])

    result_ls = []

    for l in ls:
        #        print('step1:[{}]'.format(l[0]))
        sentences = ssplit(l[0])

        result_strs = ''
        for sentence in sentences:
            result = jumanpp.analysis(sentence)
            #            print('step2 : {}'.format(result))

            result_str = ''
            for mrph in result.mrph_list():
                #                print('[{}]'.format(mrph.midasi))
                #                print(mrph.${attribute})
                if result_str == '':
                    #                    result_str = '{}'.format(mrph.midasi)
                    result_str = '{}'.format(mrph.midasi)
#                    print('s1[{}]'.format(result_str))
                else:
                    # (excerpt truncated here; presumably the remaining morphemes
                    # are appended, e.g. space-separated)
                    result_str = '{} {}'.format(result_str, mrph.midasi)
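For context, the jumanpp object above is a pyknp Juman wrapper; a minimal sketch of the per-sentence loop with the surface forms joined by spaces (the Juman() setup and the space separator are assumptions, not shown in the excerpt):

from pyknp import Juman

jumanpp = Juman()  # assumes Juman++/JUMAN is installed and on PATH

def wakati(sentence):
    """Return the sentence as space-separated surface forms (midasi)."""
    result = jumanpp.analysis(sentence)
    return ' '.join(mrph.midasi for mrph in result.mrph_list())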
Example #4
def test_ssplit(test_case):
    text, sentences = test_case
    assert ssplit(text) == sentences
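test_case above is presumably supplied by a fixture or parametrization; a minimal sketch of one way to feed it with pytest, assuming ssplit is already imported in the test module (the sample text and its expected split are illustrative assumptions about ssplit's behaviour, not values taken from the actual test suite):

import pytest

@pytest.mark.parametrize('test_case', [
    ('今日は晴れです。明日は雨です。', ['今日は晴れです。', '明日は雨です。']),
])
def test_ssplit(test_case):
    text, sentences = test_case
    assert ssplit(text) == sentences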
def extract_brat_from_json(json_file, brat_file, corpus,
                           rid_col, pid_col, date_col, type_col, ann_col,
                           sent_split=False):
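    """Convert annotated radiology findings in `json_file` into brat standoff files.

    Records are grouped by 表示順 (report order); for each group one .txt file
    (the raw character stream) and one .ann file (T entity lines and A
    attribute lines) are written, using `brat_file` as the output path prefix.
    """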
    import mojimoji

    with open(json_file) as json_fi:
        json_dict = json.load(json_fi)
        char_toks, tags, attrs = [], [], []
        char_offset, tag_offset, attr_offset = 0, 1, 1
        prev_delimiter_flag = None
        for line_id, instance in json_dict['読影所見'].items():

            '''
            comment line: ## line id: 1 ||| 表示順: 1 ||| 匿名ID: 3276171 ||| タイトル: S ||| 記載日: 2014-03-20
            '''
            line_id = int(line_id)
            comment_items = [f"line id: {line_id}"]

            patient_id = str(instance[pid_col]).strip()
            report_id = str(instance[rid_col])
            curr_delimiter_flag = report_id
            comment_items.append(f"表示順: {curr_delimiter_flag}")

            # print(line_id, report_id)

            if not prev_delimiter_flag:
                prev_delimiter_flag = curr_delimiter_flag

            if tags and curr_delimiter_flag != prev_delimiter_flag:

                out_rid = f"表示順{prev_delimiter_flag}"

                with open(f'{brat_file}.{out_rid}.txt', 'w') as fot:
                    fot.write('%s' % (''.join(char_toks)))

                with open(f'{brat_file}.{out_rid}.ann', 'w') as foa:
                    for tid, ttype, char_b, char_e, t in tags:
                        foa.write('%s\t%s %s %s\t%s\n' % (
                            tid,
                            tag2name[ttype],
                            char_b,
                            char_e,
                            t
                        ))

                    for aid, key, tid, value in attrs:

                        if key != 'tid':
                            foa.write('%s\t%s %s %s\n' % (
                                aid,
                                key,
                                tid,
                                value
                            ))
                print('Converted json to brat, 表示順: %s processed.' % prev_delimiter_flag)

                # reset caches
                char_toks, tags, attrs = [], [], []
                char_offset, tag_offset, attr_offset = 0, 1, 1
                prev_delimiter_flag = curr_delimiter_flag

            if ann_col not in instance:
                continue
            finding = instance[ann_col]
            finding = fix_finding_str(finding)

            comment_items.append(f"匿名ID: {patient_id}")

            if type_col in instance:
                if instance[type_col].strip() in ['I']:
                    continue
                comment_items.append("タイトル: %s" % instance['タイトル'].strip())

            comment_items.append(f"記載日: {str(instance[date_col]).split('T')[0]}")
            head_line = "## %s" % ' ||| '.join(comment_items)

            if sent_split:
                xml_str = split_sent_to_xml(finding, head_line)
            else:
                if corpus in ['ou', 'ncc']:
                    finding = '\n'.join(ssplit(mojimoji.zen_to_han(finding, kana=False)))
                xml_str = '<doc>\n' + \
                          (f'<line>{head_line}</line>\n' if corpus in ['mr'] else '') + \
                          '\n'.join([f'<line>{line.strip()}</line>' for line in finding.split('\n')]) + '\n</doc>\n'

            xml_str = fix_xml_str(xml_str)
            tmp_char_toks, tmp_tags, tmp_attrs = [], [], []
            tmp_char_offset, tmp_tag_offset, tmp_attr_offset = char_offset, tag_offset, attr_offset
            try:
                root = ET.ElementTree(ET.fromstring(xml_str)).getroot()
                for sent_node in root.iter('line'):
                    for tag in sent_node.iter():
                        if tag.text:
                            char_seg = list(tag.text)
                            tmp_char_toks += char_seg
                            if tag.tag != 'line':
                                tmp_tags.append((
                                    'T%i' % tmp_tag_offset,
                                    tag.tag,
                                    tmp_char_offset,
                                    tmp_char_offset + len(char_seg),
                                    tag.text
                                ))
                                if tag.attrib:
                                    for key, value in tag.attrib.items():
                                        tmp_attrs.append((
                                            'A%i' % tmp_attr_offset,
                                            key,
                                            'T%i' % tmp_tag_offset,
                                            value
                                        ))
                                        tmp_attr_offset += 1
                                tmp_tag_offset += 1
                            tmp_char_offset += len(char_seg)
                        if tag.tag != 'line' and tag.tail:
                            char_seg = list(tag.tail)
                            tmp_char_toks += char_seg
                            tmp_char_offset += len(char_seg)
                    if len(tmp_char_toks) > 1:
                        if not (tmp_char_toks[-1] == '\n'):
                            tmp_char_toks += ['\n']
                            tmp_char_offset += 1
                    else:
                        tmp_char_toks += ['\n']
                        tmp_char_offset += 1
                char_toks += tmp_char_toks
                tags += tmp_tags
                attrs += tmp_attrs
                char_offset, tag_offset, attr_offset = tmp_char_offset, tmp_tag_offset, tmp_attr_offset
                for tid, ttype, char_b, char_e, t in tmp_tags:
                    assert ''.join([char_toks[i] for i in range(char_b, char_e)]) == t

            except Exception as ex:
                print(f'[ERROR] line number:{line_id}, rid: {report_id}')
                print(ex)
                print(xml_str)
                print()
                print(tmp_char_toks)
                print()
                for tid, ttype, char_b, char_e, t in tmp_tags:
                    print(char_b, char_e, ''.join([char_toks[i] for i in range(char_b, char_e)]), t)
                print()

            if line_id == len(json_dict['読影所見']) and char_toks:
                out_rid = f"表示順{curr_delimiter_flag}"

                with open(f'{brat_file}.{out_rid}.txt', 'w') as fot:
                    fot.write('%s' % (''.join(char_toks)))

                with open(f'{brat_file}.{out_rid}.ann', 'w') as foa:
                    for tid, ttype, char_b, char_e, t in tags:
                        foa.write('%s\t%s %s %s\t%s\n' % (
                            tid,
                            tag2name[ttype],
                            char_b,
                            char_e,
                            t
                        ))

                    for aid, key, tid, value in attrs:
                        if key != 'tid':
                            foa.write('%s\t%s %s %s\n' % (
                                aid,
                                key,
                                tid,
                                value
                            ))
                print('Converted json to brat, 表示順: %s processed.' % prev_delimiter_flag)
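The .ann writer above emits brat standoff annotations; a minimal standalone sketch of the two record shapes it produces (the tag name, offsets and attribute value below are made up purely for illustration):

# text-bound (entity) line:  ID <TAB> type begin end <TAB> covered text
# attribute line:            ID <TAB> name entity-ID value
example_tags = [('T1', 'Disease', 0, 2, '腫瘤')]
example_attrs = [('A1', 'certainty', 'T1', 'positive')]

with open('example.ann', 'w') as foa:
    for tid, ttype, char_b, char_e, t in example_tags:
        foa.write(f'{tid}\t{ttype} {char_b} {char_e}\t{t}\n')
    for aid, key, tid, value in example_attrs:
        foa.write(f'{aid}\t{key} {tid} {value}\n')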
def convert_examples_to_features_text_only(examples, context_dict, option_num,
                                           max_seq_length):
    """
    Converts examples to features.
    Image features are not included.

    Parameters
    ----------
    examples: [InputExample]
        Input examples
    context_dict: {str: str}
        Dict of contexts
    option_num: int
        Number of options
    max_seq_length: int
        Max length of input sequence to BERT

    Returns
    ----------
    input_ids: torch.tensor
        Input ids for BERT input
    attention_mask: torch.tensor
        Attention mask for BERT input
    token_type_ids: torch.tensor
        Token type IDs for BERT input
    labels: torch.tensor
        Labels for BERT input
    """
    input_ids = torch.empty(len(examples),
                            option_num,
                            max_seq_length,
                            dtype=torch.long)
    attention_mask = torch.empty(len(examples),
                                 option_num,
                                 max_seq_length,
                                 dtype=torch.long)
    token_type_ids = torch.empty(len(examples),
                                 option_num,
                                 max_seq_length,
                                 dtype=torch.long)
    labels = torch.empty(len(examples), dtype=torch.long)

    for example_index, example in enumerate(tqdm(examples)):
        #Process every option.
        for i, ending in enumerate(example.endings):
            #Text features
            text_a = example.question + "[SEP]" + ending
            text_b = context_dict[ending]

            result_a = juman.analysis(text_a)
            tokenized_text_a = [mrph.midasi for mrph in result_a.mrph_list()]

            #Context is too long to analyze with Juman++.
            #Split the text first.
            text_b = ssplit(text_b)

            #Analyze the context.
            text_b_midasis = []
            for sentence in text_b:
                #Convert hankaku characters to zenkaku ones.
                sentence = jaconv.h2z(sentence, digit=True, ascii=True)
                result_sentence = juman.analysis(sentence)

                for mrph in result_sentence.mrph_list():
                    text_b_midasis.append(mrph.midasi)

            #Create a list for the tokenized text.
            tokenized_text_b = []
            for midasi in text_b_midasis:
                tokenized_text_b.append(midasi)
            tokenized_text_b.insert(0, "[CLS]")
            tokenized_text_b.append("[SEP]")

            #Create input ids.
            input_ids_text_a = tokenizer.convert_tokens_to_ids(
                tokenized_text_a)
            input_ids_text_a_length = len(input_ids_text_a)
            input_ids_text_a = torch.tensor(input_ids_text_a)

            input_ids_text_b = tokenizer.convert_tokens_to_ids(
                tokenized_text_b)
            input_ids_text_b_length = len(input_ids_text_b)
            input_ids_text_b = torch.tensor(input_ids_text_b)

            #Concatenate two input ids and make an input ids.
            #Truncate the input ids if it is longer than the max_seq_length.
            input_ids_tmp = torch.cat([input_ids_text_a, input_ids_text_b],
                                      dim=0)
            input_ids_tmp = input_ids_tmp[:max_seq_length]

            #Pad with zeros (assumed [PAD] id) up to max_seq_length so the
            #fixed-size assignment below always receives a full-length tensor.
            valid_length = input_ids_tmp.size(0)
            if valid_length < max_seq_length:
                padding = torch.zeros(max_seq_length - valid_length,
                                      dtype=torch.long)
                input_ids_tmp = torch.cat([input_ids_tmp, padding], dim=0)

            #Create token type ids.
            #0 for text_a and 1 for text_b.
            token_type_ids_tmp = torch.zeros(max_seq_length, dtype=torch.long)
            for j in range(input_ids_text_a_length, max_seq_length):
                token_type_ids_tmp[j] = 1

            #1 for real tokens and 0 for padding
            attention_mask_tmp = torch.zeros(max_seq_length, dtype=torch.long)
            attention_mask_tmp[:valid_length] = 1

            input_ids[example_index, i] = input_ids_tmp
            token_type_ids[example_index, i] = token_type_ids_tmp
            attention_mask[example_index, i] = attention_mask_tmp

        labels[example_index] = example.label

    return input_ids, attention_mask, token_type_ids, labels
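The per-option loop above follows the standard pair-encoding recipe (concatenate, truncate, segment ids, attention mask); a minimal standalone sketch of that bookkeeping with plain token id lists, independent of the tokenizer used above (function name and pad id are assumptions):

import torch

def encode_pair(ids_a, ids_b, max_seq_length, pad_id=0):
    """Concatenate two id lists, truncate/pad to max_seq_length, and build
    segment ids (0 for the first part, 1 afterwards) and an attention mask
    (1 on real tokens, 0 on padding)."""
    ids = (ids_a + ids_b)[:max_seq_length]
    valid = len(ids)
    input_ids = torch.full((max_seq_length,), pad_id, dtype=torch.long)
    input_ids[:valid] = torch.tensor(ids, dtype=torch.long)
    token_type_ids = torch.zeros(max_seq_length, dtype=torch.long)
    token_type_ids[min(len(ids_a), max_seq_length):] = 1
    attention_mask = torch.zeros(max_seq_length, dtype=torch.long)
    attention_mask[:valid] = 1
    return input_ids, token_type_ids, attention_mask

# e.g. encode_pair([2, 10, 11, 3], [12, 13, 3], max_seq_length=8)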