def analyse(text):
    # Normalize half-width characters to full-width before analysis.
    text = mojimoji.han_to_zen(text)
    sentences = ssplit(text)
    sents = []
    for sentence in sentences:
        sentlen = len(sentence)
        if sentlen > 100:
            # Slide a 100-character window to limit input text length;
            # ceiling division avoids appending an empty trailing chunk.
            sents.extend([
                sentence[i * 100:i * 100 + 100]
                for i in range(-(-sentlen // 100))
            ])
        else:
            sents.append(sentence)
    analysed_text = model.predict(sents)
    xml = mednerj2xml("\n".join(analysed_text))
    return xml

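# A minimal sketch (not from the original code) that isolates the fixed-width
# windowing used in analyse() above; the helper name and the default width of
# 100 characters are assumptions introduced here for illustration.
def chunk_text(sentence, width=100):
    """Split a sentence into consecutive windows of at most `width` characters."""
    return [sentence[i:i + width] for i in range(0, len(sentence), width)]

# chunk_text("あ" * 250) -> chunks of lengths 100, 100, 50
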
def analyze(self, source: Union[Path, str], knp_dir: Optional[str] = None) -> Tuple[list, PASDataset]:
    if isinstance(source, Path):
        # Input is a directory of pre-parsed KNP files.
        self.logger.info(f'read knp files from {source}')
        save_dir = source
    else:
        # Input is raw text: split it into sentences, parse each with KNP, and cache the result.
        save_dir = Path(knp_dir) if knp_dir is not None else Path(
            'log') / datetime.now().strftime(r'%m%d_%H%M%S')
        save_dir.mkdir(exist_ok=True, parents=True)
        sents = [self.sanitize_string(sent) for sent in ssplit(source)]
        self.logger.info('input: ' + ''.join(sents))
        knp_out = ''
        for i, sent in enumerate(sents):
            knp_out_ = self._apply_knp(sent)
            # KNP labels every sentence S-ID:1; renumber so IDs are unique within the document.
            knp_out_ = knp_out_.replace('# S-ID:1', f'# S-ID:0-{i + 1}')
            knp_out += knp_out_
        with save_dir.joinpath('0.knp').open(mode='wt') as f:
            f.write(knp_out)
    return self._analysis(save_dir)

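# A minimal sketch (not part of the class above) of the S-ID renumbering step:
# per-sentence KNP output is rewritten to carry a document-unique ID before
# concatenation. The function name and the doc_id argument are assumptions
# introduced here for illustration.
def renumber_sids(knp_outputs, doc_id=0):
    """Give each KNP sentence block a unique S-ID of the form '{doc_id}-{index}'."""
    return ''.join(
        out.replace('# S-ID:1', f'# S-ID:{doc_id}-{i + 1}')
        for i, out in enumerate(knp_outputs)
    )
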
ls = [row for row in reader]
# r = ls[4]
# print(r)
# sentences = ssplit(ls[4][0])
# print('step1 : {}'.format(sentences))
# for i in range(len(l)):
#     print('step1-{} : {}'.format(i, l[i][0]))
#     sentences = ssplit(l[i][0])
result_ls = []
for l in ls:
    # print('step1:[{}]'.format(l[0]))
    sentences = ssplit(l[0])
    result_strs = ''
    for sentence in sentences:
        # Morphological analysis of one sentence with Juman++.
        result = jumanpp.analysis(sentence)
        # print('step2 : {}'.format(result))
        result_str = ''
        for mrph in result.mrph_list():
            # print('[{}]'.format(mrph.midasi))
            # print(mrph.${attribute})
            if result_str == '':
                result_str = '{}'.format(mrph.midasi)
                # print('s1[{}]'.format(result_str))
            else:
                # Join subsequent surface forms with a space.
                result_str += ' {}'.format(mrph.midasi)

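# A minimal sketch (not from the original snippet) of what the inner loops above
# build: a whitespace-joined string of Juman++ surface forms (midasi) for one
# sentence. The function name is an assumption; it relies on the same pyknp
# Juman wrapper the snippet above uses as `jumanpp`.
def wakati(sentence, jumanpp):
    """Segment a sentence with Juman++ and join the surface forms with spaces."""
    return ' '.join(mrph.midasi for mrph in jumanpp.analysis(sentence).mrph_list())
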
def test_ssplit(test_case):
    text, sentences = test_case
    assert ssplit(text) == sentences

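# ssplit() itself is not defined in any of these snippets. The naive splitter below
# is a reference sketch only, an assumption rather than the implementation the
# projects above actually use; it splits on Japanese sentence-ending punctuation
# and newlines.
import re

def naive_ssplit(text):
    """Split Japanese text into sentences on 。！？ and newlines (naive heuristic)."""
    return [s for s in re.split(r'(?<=[。！？])|\n+', text) if s.strip()]
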
def extract_brat_from_json(json_file, brat_file, corpus,
                           rid_col, pid_col, date_col, type_col, ann_col,
                           sent_split=False):
    import mojimoji

    with open(json_file) as json_fi:
        json_dict = json.load(json_fi)

    char_toks, tags, attrs = [], [], []
    char_offset, tag_offset, attr_offset = 0, 1, 1
    prev_delimiter_flag = None
    for line_id, instance in json_dict['読影所見'].items():
        '''
        comment line:
        ## line id: 1 ||| 表示順: 1 ||| 匿名ID: 3276171 ||| タイトル: S ||| 記載日: 2014-03-20
        '''
        line_id = int(line_id)
        comment_items = [f"line id: {line_id}"]
        patient_id = str(instance[pid_col]).strip()
        report_id = str(instance[rid_col])
        curr_delimiter_flag = report_id
        comment_items.append(f"表示順: {curr_delimiter_flag}")
        # print(line_id, report_id)
        if not prev_delimiter_flag:
            prev_delimiter_flag = curr_delimiter_flag
        # A new report id means the previous report is complete: flush its text and annotations.
        if tags and curr_delimiter_flag != prev_delimiter_flag:
            out_rid = f"表示順{prev_delimiter_flag}"
            with open(f'{brat_file}.{out_rid}.txt', 'w') as fot:
                fot.write('%s' % (''.join(char_toks)))
            with open(f'{brat_file}.{out_rid}.ann', 'w') as foa:
                for tid, ttype, char_b, char_e, t in tags:
                    foa.write('%s\t%s %s %s\t%s\n' % (
                        tid, tag2name[ttype], char_b, char_e, t
                    ))
                for aid, key, tid, value in attrs:
                    if key != 'tid':
                        foa.write('%s\t%s %s %s\n' % (
                            aid, key, tid, value
                        ))
            print('Converted json to brat, 表示順: %s processed.' % prev_delimiter_flag)
            # reset caches
            char_toks, tags, attrs = [], [], []
            char_offset, tag_offset, attr_offset = 0, 1, 1
            prev_delimiter_flag = curr_delimiter_flag
        if ann_col not in instance:
            continue
        finding = instance[ann_col]
        finding = fix_finding_str(finding)
        comment_items.append(f"匿名ID: {patient_id}")
        if type_col in instance:
            if instance[type_col].strip() in ['I']:
                continue
            comment_items.append("タイトル: %s" % instance['タイトル'].strip())
        comment_items.append(f"記載日: {str(instance[date_col]).split('T')[0]}")
        head_line = "## %s" % ' ||| '.join(comment_items)
        if sent_split:
            xml_str = split_sent_to_xml(finding, head_line)
        else:
            if corpus in ['ou', 'ncc']:
                finding = '\n'.join(ssplit(mojimoji.zen_to_han(finding, kana=False)))
            xml_str = '<doc>\n' + \
                (f'<line>{head_line}</line>\n' if corpus in ['mr'] else '') + \
                '\n'.join([f'<line>{line.strip()}</line>' for line in finding.split('\n')]) + '\n</doc>\n'
        xml_str = fix_xml_str(xml_str)
        tmp_char_toks, tmp_tags, tmp_attrs = [], [], []
        tmp_char_offset, tmp_tag_offset, tmp_attr_offset = char_offset, tag_offset, attr_offset
        try:
            root = ET.ElementTree(ET.fromstring(xml_str)).getroot()
            for sent_node in root.iter('line'):
                for tag in sent_node.iter():
                    if tag.text:
                        char_seg = list(tag.text)
                        tmp_char_toks += char_seg
                        if tag.tag != 'line':
                            tmp_tags.append((
                                'T%i' % tmp_tag_offset,
                                tag.tag,
                                tmp_char_offset,
                                tmp_char_offset + len(char_seg),
                                tag.text
                            ))
                            if tag.attrib:
                                for key, value in tag.attrib.items():
                                    tmp_attrs.append((
                                        'A%i' % tmp_attr_offset,
                                        key,
                                        'T%i' % tmp_tag_offset,
                                        value
                                    ))
                                    tmp_attr_offset += 1
                            tmp_tag_offset += 1
                        tmp_char_offset += len(char_seg)
                    if tag.tag != 'line' and tag.tail:
                        char_seg = list(tag.tail)
                        tmp_char_toks += char_seg
                        tmp_char_offset += len(char_seg)
                # Make sure every line ends with a newline so character offsets match the .txt output.
                if len(tmp_char_toks) > 1:
                    if not (tmp_char_toks[-1] == '\n'):
                        tmp_char_toks += ['\n']
                        tmp_char_offset += 1
                else:
                    tmp_char_toks += ['\n']
                    tmp_char_offset += 1
            char_toks += tmp_char_toks
            tags += tmp_tags
            attrs += tmp_attrs
            char_offset, tag_offset, attr_offset = tmp_char_offset, tmp_tag_offset, tmp_attr_offset
            # Sanity check: every recorded span must reproduce its surface text.
            for tid, ttype, char_b, char_e, t in tmp_tags:
                assert ''.join([char_toks[i] for i in range(char_b, char_e)]) == t
        except Exception as ex:
            print(f'[ERROR] line number:{line_id}, rid: {report_id}')
            print(ex)
            # Debugging output for the failed instance.
            print(xml_str)
            print()
            print(tmp_char_toks)
            print()
            for tid, ttype, char_b, char_e, t in tmp_tags:
                print(char_b, char_e,
                      ''.join([char_toks[i] for i in range(char_b, char_e)]), t)
            print()
        # After the last line, flush whatever remains for the final report.
        if line_id == len(json_dict['読影所見']) and char_toks:
            out_rid = f"表示順{curr_delimiter_flag}"
            with open(f'{brat_file}.{out_rid}.txt', 'w') as fot:
                fot.write('%s' % (''.join(char_toks)))
            with open(f'{brat_file}.{out_rid}.ann', 'w') as foa:
                for tid, ttype, char_b, char_e, t in tags:
                    foa.write('%s\t%s %s %s\t%s\n' % (
                        tid, tag2name[ttype], char_b, char_e, t
                    ))
                for aid, key, tid, value in attrs:
                    if key != 'tid':
                        foa.write('%s\t%s %s %s\n' % (
                            aid, key, tid, value
                        ))
            print('Converted json to brat, 表示順: %s processed.' % prev_delimiter_flag)

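# A minimal sketch (not from the original function) of the brat standoff lines the
# function above writes: one "T" line per entity span and one "A" line per attribute.
# The function names and the example values are assumptions introduced for illustration.
def format_brat_entity(tid, ttype, begin, end, text):
    """One brat entity line: ID, type with character offsets, surface text."""
    return f'{tid}\t{ttype} {begin} {end}\t{text}'

def format_brat_attribute(aid, key, tid, value):
    """One brat attribute line: ID, attribute name, target entity ID, value."""
    return f'{aid}\t{key} {tid} {value}'

# format_brat_entity('T1', 'Disease', 0, 2, '腫瘤')            -> 'T1\tDisease 0 2\t腫瘤'
# format_brat_attribute('A1', 'certainty', 'T1', 'positive')  -> 'A1\tcertainty T1 positive'
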
def convert_examples_to_features_text_only(examples, context_dict, option_num, max_seq_length):
    """
    Converts examples to features. Image features are not included.

    Parameters
    ----------
    examples: [InputExample]
        Input examples
    context_dict: {str: str}
        Dict of contexts
    option_num: int
        Number of options
    max_seq_length: int
        Max length of input sequence to BERT

    Returns
    ----------
    input_ids: torch.tensor
        Input ids for BERT input
    attention_mask: torch.tensor
        Attention mask for BERT input
    token_type_ids: torch.tensor
        Token type IDs for BERT input
    labels: torch.tensor
        Labels for BERT input
    """
    input_ids = torch.empty(len(examples), option_num, max_seq_length, dtype=torch.long)
    attention_mask = torch.empty(len(examples), option_num, max_seq_length, dtype=torch.long)
    token_type_ids = torch.empty(len(examples), option_num, max_seq_length, dtype=torch.long)
    labels = torch.empty(len(examples), dtype=torch.long)

    for example_index, example in enumerate(tqdm(examples)):
        # Process every option.
        for i, ending in enumerate(example.endings):
            # Text features
            text_a = example.question + "[SEP]" + ending
            text_b = context_dict[ending]

            result_a = juman.analysis(text_a)
            tokenized_text_a = [mrph.midasi for mrph in result_a.mrph_list()]

            # Context is too long to analyze with Juman++.
            # Split the text first.
            text_b = ssplit(text_b)

            # Analyze the context.
            text_b_midasis = []
            for sentence in text_b:
                # Convert hankaku characters to zenkaku ones.
                sentence = jaconv.h2z(sentence, digit=True, ascii=True)
                result_sentence = juman.analysis(sentence)
                for mrph in result_sentence.mrph_list():
                    text_b_midasis.append(mrph.midasi)

            # Create a list for the tokenized text.
            tokenized_text_b = []
            for midasi in text_b_midasis:
                tokenized_text_b.append(midasi)
            tokenized_text_b.insert(0, "[CLS]")
            tokenized_text_b.append("[SEP]")

            # Create input ids.
            input_ids_text_a = tokenizer.convert_tokens_to_ids(tokenized_text_a)
            input_ids_text_a_length = len(input_ids_text_a)
            input_ids_text_a = torch.tensor(input_ids_text_a)

            input_ids_text_b = tokenizer.convert_tokens_to_ids(tokenized_text_b)
            input_ids_text_b_length = len(input_ids_text_b)
            input_ids_text_b = torch.tensor(input_ids_text_b)

            # Concatenate the two id sequences into a single input.
            # Truncate if longer than max_seq_length, and pad with 0 if shorter
            # so the fixed-size assignment below succeeds.
            input_ids_tmp = torch.cat([input_ids_text_a, input_ids_text_b], dim=0)
            input_ids_tmp = input_ids_tmp[:max_seq_length]
            if input_ids_tmp.size(0) < max_seq_length:
                padding = torch.zeros(max_seq_length - input_ids_tmp.size(0), dtype=torch.long)
                input_ids_tmp = torch.cat([input_ids_tmp, padding], dim=0)

            # Create token type ids: 0 for text_a and 1 for text_b.
            token_type_ids_tmp = torch.zeros(max_seq_length, dtype=torch.long)
            token_type_ids_tmp[input_ids_text_a_length:] = 1

            # 1 for real tokens and 0 for padding
            attention_mask_tmp = torch.ones(max_seq_length, dtype=torch.long)

            input_ids[example_index, i] = input_ids_tmp
            token_type_ids[example_index, i] = token_type_ids_tmp
            attention_mask[example_index, i] = attention_mask_tmp

        labels[example_index] = example.label

    return input_ids, attention_mask, token_type_ids, labels

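# A minimal sketch (not from the original code) of the pair-encoding step above,
# isolated from BERT/Juman++ so it can be checked on plain integer ids. The function
# name and the use of 0 as the padding id are assumptions introduced for illustration.
import torch

def encode_pair(ids_a, ids_b, max_seq_length):
    """Concatenate two id sequences, truncate/pad to max_seq_length, build segment ids."""
    ids = (ids_a + ids_b)[:max_seq_length]
    ids = ids + [0] * (max_seq_length - len(ids))  # pad with 0 (assumed [PAD] id)
    token_type_ids = [0] * min(len(ids_a), max_seq_length) + \
        [1] * max(max_seq_length - len(ids_a), 0)
    return torch.tensor(ids), torch.tensor(token_type_ids)

# encode_pair([2, 5, 3], [7, 8, 9, 3], 6) -> ids [2, 5, 3, 7, 8, 9], segments [0, 0, 0, 1, 1, 1]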