def update_tag_scheme(sentences, tag_scheme, removeTag=None):  #{{{
    """
    Check and update sentences tagging scheme to IOB2.
    Only IOB1 and IOB2 schemes are accepted.

    The last column of every token is overwritten in place.

    :param sentences: iterable of sentences; each sentence is a list of
        mutable token rows (lists of strings) whose last item is the tag.
    :param tag_scheme: 'iob' to store the normalised IOB tags, 'iobes' to
        store the output of ``iob_iobes``.
    :param removeTag: optional collection of entity type names (the part of
        the tag after the two-character ``B-``/``I-`` prefix). Tokens of
        these types are stored as 'O'. Only consulted for the 'iob' scheme.
    :raises Exception: if ``iob2`` rejects a sentence, or if ``tag_scheme``
        is neither 'iob' nor 'iobes'.
    """
    for i, s in enumerate(sentences):
        tags = [w[-1] for w in s]
        # Check that tags are given in the IOB format.
        # NOTE(review): iob2 is defined elsewhere; it presumably normalises
        # `tags` in place, which is why they are written back below.
        if not iob2(tags):
            s_str = '\n'.join(' '.join(w) for w in s)
            raise Exception('Sentences should be given in IOB format! ' +
                            'Please check sentence %i:\n%s' % (i, s_str))
        if tag_scheme == 'iob':
            # If format was IOB1, we convert to IOB2.
            for word, new_tag in zip(s, tags):
                # Always write a tag back: the normalised tag by default, or
                # 'O' when the entity type is filtered out. Previously the
                # write-back was nested under the removeTag test, so one of
                # the two paths silently left the original tag untouched.
                if removeTag is not None and new_tag[2:] in removeTag:
                    word[-1] = 'O'
                else:
                    word[-1] = new_tag
        elif tag_scheme == 'iobes':
            new_tags = iob_iobes(tags)
            for word, new_tag in zip(s, new_tags):
                word[-1] = new_tag
        else:
            raise Exception('Unknown tagging scheme!')
def update_tag_scheme(sentences, tag_scheme):
    """
    Validate the tags of every sentence and rewrite them in place.

    The tag is the last item of each token row. With the 'generic' scheme
    the function returns immediately and nothing is checked or modified.

    :param sentences: iterable of sentences, each a list of mutable token
        rows (lists of strings).
    :param tag_scheme: 'generic' (no-op), 'iob' (store the tags as left by
        ``iob2``) or 'iobes' (store the result of ``iob_iobes``).
    :raises Exception: when ``iob2`` rejects a sentence or the scheme name
        is not recognised.
    """
    if tag_scheme == 'generic':
        return

    for index, sentence in enumerate(sentences):
        tags = [token[-1] for token in sentence]

        # Reject anything that is not IOB-formatted, showing the sentence.
        if not iob2(tags):
            rendered = '\n'.join(' '.join(token) for token in sentence)
            raise Exception('Sentences should be given in IOB format! ' +
                            'Please check sentence %i:\n%s' % (index, rendered))

        # Pick the tag sequence to store for the requested scheme.
        if tag_scheme == 'iob':
            converted = tags
        elif tag_scheme == 'iobes':
            converted = iob_iobes(tags)
        else:
            raise Exception('Unknown tagging scheme!')

        for token, tag in zip(sentence, converted):
            token[-1] = tag
def _relabel_sentence(rows):
    """Replace column 1 of every token row with its IOBES tag; return rows."""
    new_labels = iob_iobes([row[1] for row in rows])
    for row, label in zip(rows, new_labels):
        row[1] = label
    return rows


def iob2obes(fn, outfn):
    """
    Rewrite a token-per-line file, converting its tag column with iob_iobes.

    Each non-blank input line is split on whitespace and its second field
    (index 1) is treated as the tag. Blank lines separate sentences. The
    output repeats every token line with fields joined by single spaces and
    writes one blank line after each sentence.

    :param fn: path of the input file.
    :param outfn: path of the output file (opened for writing).
    :raises IndexError: if a non-blank line has fewer than two fields.
    """
    sentences = []
    rows = []

    # Read everything before opening the output, so that `outfn` may name
    # the same file as `fn`.
    with open(fn) as src:
        for line in src:
            fields = line.rstrip().split()
            if not fields:
                # Blank line: close the current sentence (even if empty, so
                # repeated blank lines are mirrored in the output).
                sentences.append(_relabel_sentence(rows))
                rows = []
                continue
            rows.append(fields)

    # A file that does not end with a blank line still has a pending
    # sentence here; it used to be dropped silently.
    if rows:
        sentences.append(_relabel_sentence(rows))

    with open(outfn, 'w') as dst:
        for sentence in sentences:
            for fields in sentence:
                dst.write(' '.join(fields) + '\n')
            dst.write('\n')
def test_CRFtag_to_SCRFtag(self):
    """
    Smoke test: run a tag sequence through utils.iob_iobes, pass the result
    (wrapped in a one-element list) to utils.CRFtag_to_SCRFtag and print it.

    Nothing is asserted; the output is meant for manual inspection.
    """
    tags = [
        'O', 'O', 'I-LOC', 'I-LOC', 'O', 'O',
        'I-PER', 'I-PER', 'I-PER', 'O', 'I-PER',
    ]
    result = utils.iob_iobes(tags)
    result = utils.CRFtag_to_SCRFtag([result])
    # Single-argument parenthesised print: same output on Python 2, and
    # valid syntax on Python 3 (the bare `print result` statement is not).
    print(result)
def test_iob_iobes(self):
    """
    Smoke test: print a tag sequence before and after utils.iob_iobes.

    Nothing is asserted; the output is meant for manual inspection.
    """
    # Each print below takes exactly one string so that it produces the same
    # text under the Python 2 print statement and the Python 3 function.
    print('\n\niob_iobes:')
    tags = (
        ['O'] * 22
        + [
            u'B-ORGANIZATION:CORPORATION',
            u'I-ORGANIZATION:CORPORATION',
            u'I-ORGANIZATION:CORPORATION',
            u'E-ORGANIZATION:CORPORATION',
        ]
        + ['O'] * 8
    )
    print('original:\n%s' % (tags,))
    result = utils.iob_iobes(tags)
    print('new:\n%s' % (result,))
def update_tag_scheme(sentences, tag_scheme):
    """
    Validate every sentence with utils.iob2 and rewrite its tags in place.

    :param sentences: iterable of sentences, each a list of mutable token
        rows (lists of strings) whose last item is the tag.
    :param tag_scheme: 'iob' or 'iobes', compared case-insensitively.
    :raises Exception: when utils.iob2 rejects a sentence or the scheme
        name is not recognised.
    """
    for idx, sent in enumerate(sentences):
        tags = [tok[-1] for tok in sent]

        # utils.iob2 is described by the original author as both checking
        # the IOB format and correcting mis-tagged entries.
        if not utils.iob2(tags):
            dump = '\n'.join(' '.join(tok) for tok in sent)
            raise Exception('Sentences should be given in IOB format! ' +
                            'Please check sentence %i:\n%s' % (idx, dump))

        scheme = tag_scheme.lower()
        if scheme == 'iob':
            # If the format was IOB1, the tags are now stored as IOB2.
            final_tags = tags
        elif scheme == 'iobes':
            final_tags = utils.iob_iobes(tags)
        else:
            raise Exception('Unknown tagging scheme!')

        for tok, tag in zip(sent, final_tags):
            tok[-1] = tag
def update_tag_scheme(sentences, tag_scheme, file_format="conll"):
    """
    Check and update sentences tagging scheme to IOB2.
    Only IOB1 and IOB2 schemes are accepted.

    Tags are read from, and written back to, a location that depends on
    ``file_format``:

    * "conll": the last item of each token row;
    * "conllu": the "NER_TAG" entry of the MISC column (index 9). Sentences
      whose first token has no "NER_TAG" golden label are skipped.

    :param sentences: iterable of sentences, each a list of mutable token rows.
    :param tag_scheme: 'iob' or 'iobes'.
    :param file_format: "conll" (default) or "conllu".
    :raises Exception: when ``iob2`` rejects a sentence or the scheme name
        is not recognised.
    """

    def _store_tag(token, tag):
        # Write `tag` wherever this file format keeps the NER label.
        if file_format == "conll":
            token[-1] = tag
        elif file_format == "conllu":
            misc = load_MISC_column_contents(token[9])
            misc["NER_TAG"] = tag
            token[9] = compile_MISC_column_contents(misc)

    for index, sentence in enumerate(sentences):
        tags = []
        if file_format == "conll":
            tags = [token[-1] for token in sentence]
        elif file_format == "conllu":
            if not contains_golden_label(sentence[0], "NER_TAG"):
                continue
            tags = [extract_correct_ner_tag_from_conllu(token)
                    for token in sentence]

        # Check that tags are given in the IOB format.
        if not iob2(tags):
            rendered = '\n'.join(' '.join(token) for token in sentence)
            print(rendered.encode("utf8"))
            raise Exception('Sentences should be given in IOB format! ' +
                            'Please check sentence %i:\n%s' % (index, rendered))

        if tag_scheme == 'iob':
            # If format was IOB1, the tags are now stored as IOB2.
            converted = tags
        elif tag_scheme == 'iobes':
            converted = iob_iobes(tags)
        else:
            raise Exception('Unknown tagging scheme!')

        for token, tag in zip(sentence, converted):
            _store_tag(token, tag)
def update_tag_scheme(sentences, tag_scheme):
    """
    Check and update sentences tagging scheme to IOB2.
    Only IOB1 and IOB2 schemes are accepted.

    Unlike an in-place update, this builds and returns a new list; the
    input token rows are not assigned to. Each output token is a two-item
    list ``[word[0], tag]``, so only the first column of the input row is
    carried over.

    :param sentences: iterable of sentences, each a list of token rows
        whose last item is the tag.
    :param tag_scheme: 'iob' to keep the tags returned by ``iob2``, 'iobes'
        to convert them with ``iob_iobes``.
    :return: list of sentences, each a list of ``[word, tag]`` pairs.
    :raises Exception: when ``iob2`` returns a falsy value for a sentence,
        or when ``tag_scheme`` is not recognised.
    """
    new_sentences = []
    for i, sentence in enumerate(sentences):
        tags = [word[-1] for word in sentence]
        # Check whether the tagging scheme is IOB format or not.
        # NOTE(review): iob2 is defined elsewhere; here its return value is
        # used as the normalised tag list, with a falsy value meaning failure.
        new_tags = iob2(tags)
        if not new_tags:
            error_str = '\n'.join([' '.join(word) for word in sentence])
            # The %-formatting must be applied to the message string, inside
            # the Exception(...) call. Applying it to the Exception instance
            # raised a TypeError instead of this error.
            raise Exception("Sentence should be given in IOB format! "
                            "Please check sentence %i \n %s"
                            % (i + 1, error_str))
        # Convert tagging scheme.
        if tag_scheme == 'iob':
            pass
        elif tag_scheme == 'iobes':
            new_tags = iob_iobes(new_tags)
        else:
            raise Exception('Unknown tag scheme!')
        new_sentences.append(
            [[word[0], tag] for word, tag in zip(sentence, new_tags)])
    return new_sentences
def update_tag_scheme(sentences, tag_scheme):
    """
    Check and update sentences tagging scheme to IOB2.
    Only IOB1 and IOB2 schemes are accepted.

    Sentences rejected by ``iob2`` are reported on stdout and skipped
    instead of aborting the whole run; all other sentences have the last
    item of each token row overwritten in place.

    :param sentences: iterable of sentences, each a list of mutable token
        rows (lists of strings) whose last item is the tag.
    :param tag_scheme: 'iob' or 'iobes'.
    :raises Exception: when a valid sentence is processed with an
        unrecognised scheme name.
    """
    for position, sentence in enumerate(sentences):
        tags = [token[-1] for token in sentence]

        if not iob2(tags):
            text = '\n'.join(' '.join(token) for token in sentence)
            # NOTE(review): despite the wording, the sentence is only left
            # untouched here; it is not deleted from `sentences`.
            print('Removing Problematic sentence: %i:\n%s' % (position, text))
            continue

        if tag_scheme not in ('iob', 'iobes'):
            raise Exception('Unknown tagging scheme!')

        # 'iob': store the tags as left by iob2 (IOB1 input becomes IOB2);
        # 'iobes': store the converted sequence.
        replacement = tags if tag_scheme == 'iob' else iob_iobes(tags)
        for token, tag in zip(sentence, replacement):
            token[-1] = tag
def batch_yield(data, batch_size, vocab, tag2label, shuffle=False, iob2iobes=True):
    """
    Yield ``(seqs, labels)`` mini-batches built from tagged sentences.

    :param data: list of ``(tokens, tags)`` tuples, e.g.
        ``(['19421', '21215', '14459'], ['B-b', 'I-b', 'O'])``.
    :param batch_size: number of sentences collected before a batch is
        yielded; the final batch may be smaller.
    :param vocab: word2id dictionary, passed to ``sentence2id``.
    :param tag2label: mapping from tag string to label id (tags as
        described for ``data``, after the optional IOBES conversion).
    :param shuffle: when true, ``data`` is shuffled in place first.
    :param iob2iobes: when true, each tag sequence is converted with
        ``iob_iobes`` before being mapped to label ids.
    :return: generator of ``(seqs, labels)`` where ``seqs`` is a list of
        id sequences, e.g. ``[[4437, 4437, ...], ...]``, and ``labels`` is
        the parallel list of label-id sequences, e.g. ``[[0, 0, ...], ...]``.
    """
    if shuffle:
        random.shuffle(data)

    batch_seqs = []
    batch_labels = []
    for tokens, tags in data:
        ids = sentence2id(tokens, vocab)

        # Diagnostic output for sequences that already carry this tag.
        if 'E-PER.NOM' in tags:
            print(tags)

        if iob2iobes:
            tags = iob_iobes(tags)
        label_ids = [tag2label[name] for name in tags]

        # Emit the batch once it is full, before adding the current item.
        if len(batch_seqs) == batch_size:
            yield batch_seqs, batch_labels
            batch_seqs, batch_labels = [], []

        batch_seqs.append(ids)
        batch_labels.append(label_ids)

    # Emit whatever is left over as a final, possibly smaller, batch.
    if batch_seqs:
        yield batch_seqs, batch_labels