Example #1
0
def update_tag_scheme(sentences, tag_scheme, removeTag=None):
    """
    Validate the tags of every sentence and rewrite them in place.

    Each sentence is a sequence of words, and each word is a mutable
    sequence whose last item is its tag.

    :param sentences: sentences to check and update in place.
    :param tag_scheme: 'iob' to store the tags as left by ``iob2``, or
        'iobes' to store the result of ``iob_iobes``.
    :param removeTag: optional container of entity types. In 'iob' mode a
        tag whose text after the first two characters is found in it is
        replaced by 'O'. It is not consulted in 'iobes' mode.
    :raises Exception: if ``iob2`` rejects a sentence, or if ``tag_scheme``
        is neither 'iob' nor 'iobes'.
    """
    for i, s in enumerate(sentences):
        tags = [w[-1] for w in s]
        # Check that tags are given in the IOB format.
        # NOTE(review): presumably iob2 also normalises `tags` in place
        # (IOB1 -> IOB2) -- verify against its definition.
        if not iob2(tags):
            s_str = '\n'.join(' '.join(w) for w in s)
            raise Exception('Sentences should be given in IOB format! ' +
                            'Please check sentence %i:\n%s' % (i, s_str))
        if tag_scheme == 'iob':
            for word, new_tag in zip(s, tags):
                # Previously the write-back only happened when removeTag was
                # given, so the default call silently left the tags untouched.
                if removeTag is not None and new_tag[2:] in removeTag:
                    word[-1] = 'O'
                else:
                    word[-1] = new_tag
        elif tag_scheme == 'iobes':
            new_tags = iob_iobes(tags)
            for word, new_tag in zip(s, new_tags):
                word[-1] = new_tag
        else:
            raise Exception('Unknown tagging scheme!')
Example #2
0
def update_tag_scheme(sentences, tag_scheme):
    """
    Validate sentence tags and rewrite them in place for ``tag_scheme``.

    Each word is a mutable sequence whose last item is the tag.  With
    ``tag_scheme == 'generic'`` the function returns immediately and the
    sentences are neither checked nor modified.  Otherwise every sentence
    must pass ``iob2``; its tags are then stored back as-is ('iob') or
    after conversion through ``iob_iobes`` ('iobes').

    Raises Exception when ``iob2`` rejects a sentence or when the scheme
    is not one of 'generic', 'iob', 'iobes'.
    """
    if tag_scheme == 'generic':
        return
    for index, sentence in enumerate(sentences):
        tags = [token[-1] for token in sentence]
        # NOTE(review): presumably iob2 normalises `tags` in place
        # (IOB1 -> IOB2) -- verify against its definition.
        if not iob2(tags):
            dump = '\n'.join(' '.join(token) for token in sentence)
            raise Exception(
                'Sentences should be given in IOB format! '
                'Please check sentence %i:\n%s' % (index, dump))
        if tag_scheme == 'iob':
            converted = tags
        elif tag_scheme == 'iobes':
            converted = iob_iobes(tags)
        else:
            raise Exception('Unknown tagging scheme!')
        # Store the final tag in the last field of every word.
        for token, tag in zip(sentence, converted):
            token[-1] = tag
Example #3
0
def iob2obes(fn, outfn):
    """
    Rewrite the label column of a whitespace-separated token file.

    The input is read line by line.  A line that is empty after stripping
    ends the current sentence; any other line is split on whitespace and
    its second field is taken as the label.  The labels of each sentence
    are passed through ``iob_iobes`` and written back into the second
    field.  Every sentence is then written to ``outfn`` with its fields
    joined by single spaces and followed by one empty line.

    :param fn: path of the input file.
    :param outfn: path of the output file (overwritten).
    :raises IndexError: if a non-blank line has fewer than two fields.
    """
    def relabel(rows):
        # Convert the labels of one sentence and store them back in place.
        new_labels = iob_iobes([row[1] for row in rows])
        for row, label in zip(rows, new_labels):
            row[1] = label
        return rows

    data_tokens = []
    sen_tokens = []
    with open(fn) as infile:
        for line in infile:
            tokens = line.split()
            if tokens:
                sen_tokens.append(tokens)
                continue
            # Blank line: close the current (possibly empty) sentence.
            data_tokens.append(relabel(sen_tokens))
            sen_tokens = []
    # A file that does not end with a blank line still holds a pending
    # sentence here; previously it was silently dropped.
    if sen_tokens:
        data_tokens.append(relabel(sen_tokens))

    with open(outfn, 'w') as outfile:
        for rows in data_tokens:
            for tokens in rows:
                outfile.write(' '.join(tokens) + '\n')
            outfile.write('\n')
Example #4
0
 def test_CRFtag_to_SCRFtag(self):
     """Print the output of utils.CRFtag_to_SCRFtag for one IOB sequence.

     The tags are first converted with utils.iob_iobes and the result is
     passed, wrapped in a one-element list, to utils.CRFtag_to_SCRFtag.
     Nothing is asserted; the result is only printed for inspection.
     """
     tags = [
         'O', 'O', 'I-LOC', 'I-LOC', 'O', 'O', 'I-PER', 'I-PER', 'I-PER',
         'O', 'I-PER'
     ]
     result = utils.iob_iobes(tags)
     result = utils.CRFtag_to_SCRFtag([result])
     # Function-call form prints the same on Python 2 and also parses on
     # Python 3 (the bare `print result` statement does not).
     print(result)
Example #5
0
 def test_iob_iobes(self):
     """Print a tag sequence before and after utils.iob_iobes.

     Nothing is asserted; both lists are only printed for inspection.
     """
     print('\n\niob_iobes:')
     # NOTE(review): the input already contains an 'E-' tag, which is not
     # an IOB prefix -- confirm that iob_iobes is meant to accept it.
     tags = [
         'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
         'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
         u'B-ORGANIZATION:CORPORATION', u'I-ORGANIZATION:CORPORATION',
         u'I-ORGANIZATION:CORPORATION', u'E-ORGANIZATION:CORPORATION', 'O',
         'O', 'O', 'O', 'O', 'O', 'O', 'O'
     ]
     # Single-argument print calls behave the same on Python 2 and 3; the
     # list is formatted into the string so the layout matches the old
     # `print 'original:\n', tags` statement (no space after the newline).
     print('original:\n%s' % (tags,))
     result = utils.iob_iobes(tags)
     print('new:\n%s' % (result,))
Example #6
0
def update_tag_scheme(sentences, tag_scheme):
    """
    Validate sentence tags and rewrite them in place for ``tag_scheme``.

    ``tag_scheme`` is compared case-insensitively and must be 'iob' or
    'iobes'.  Each word is a mutable sequence whose last item is the tag.

    Raises Exception when ``utils.iob2`` rejects a sentence or when the
    scheme is unknown.
    """
    for index, sentence in enumerate(sentences):
        tags = [token[-1] for token in sentence]
        # Per the original note, utils.iob2 both checks the IOB format and
        # corrects wrongly tagged items.
        # NOTE(review): corrections presumably happen in place on `tags`.
        if not utils.iob2(tags):
            dump = '\n'.join(' '.join(token) for token in sentence)
            raise Exception(
                'Sentences should be given in IOB format! '
                'Please check sentence %i:\n%s' % (index, dump))
        scheme = tag_scheme.lower()
        if scheme == 'iob':
            converted = tags
        elif scheme == 'iobes':
            converted = utils.iob_iobes(tags)
        else:
            raise Exception('Unknown tagging scheme!')
        # Store the final tag in the last field of every word.
        for token, tag in zip(sentence, converted):
            token[-1] = tag
Example #7
0
def update_tag_scheme(sentences, tag_scheme, file_format="conll"):
    """
    Validate sentence tags and rewrite them in place for ``tag_scheme``.

    With ``file_format == "conll"`` the tag is the last field of each
    word.  With ``file_format == "conllu"`` the tag is read through
    ``extract_correct_ner_tag_from_conllu`` and written to the "NER_TAG"
    entry of the column at index 9; sentences whose first word has no
    "NER_TAG" golden label are skipped.  Any other ``file_format`` yields
    an empty tag list and nothing is written.

    Raises Exception when ``iob2`` rejects a sentence or when
    ``tag_scheme`` is neither 'iob' nor 'iobes'.
    """
    for index, sentence in enumerate(sentences):
        if file_format == "conll":
            tags = [token[-1] for token in sentence]
        elif file_format == "conllu":
            if not contains_golden_label(sentence[0], "NER_TAG"):
                continue
            tags = [extract_correct_ner_tag_from_conllu(token)
                    for token in sentence]
        else:
            tags = []
        # NOTE(review): presumably iob2 normalises `tags` in place
        # (IOB1 -> IOB2) -- verify against its definition.
        if not iob2(tags):
            dump = '\n'.join(' '.join(token) for token in sentence)
            print(dump.encode("utf8"))
            raise Exception(
                'Sentences should be given in IOB format! '
                'Please check sentence %i:\n%s' % (index, dump))
        if tag_scheme == 'iob':
            converted = tags
        elif tag_scheme == 'iobes':
            converted = iob_iobes(tags)
        else:
            raise Exception('Unknown tagging scheme!')
        # One write-back loop serves both schemes; only the storage
        # location depends on the file format.
        for token, tag in zip(sentence, converted):
            if file_format == "conll":
                token[-1] = tag
            elif file_format == "conllu":
                misc = load_MISC_column_contents(token[9])
                misc["NER_TAG"] = tag
                token[9] = compile_MISC_column_contents(misc)
Example #8
0
def update_tag_scheme(sentences,tag_scheme):
    """
    Validate sentence tags and return new sentences in ``tag_scheme``.

    Unlike an in-place update, this builds and returns a new list: each
    output word is ``[word[0], tag]``, so only the first field of the
    input word and the converted tag are kept.

    :param sentences: sequence of sentences; the last field of each word
        is its tag.
    :param tag_scheme: 'iob' to keep the tags returned by ``iob2``, or
        'iobes' to convert them with ``iob_iobes``.
    :return: list of sentences, each a list of ``[word, tag]`` pairs.
    :raises Exception: if ``iob2`` returns a falsy value for a sentence,
        or if ``tag_scheme`` is unknown.
    """
    new_sentences=[]
    for i,sentence in enumerate(sentences):
        tags=[word[-1] for word in sentence]
        # check whether tagging scheme is IOB format or not; here iob2 is
        # used for its return value (the tags, or a falsy value on error)
        new_tags = iob2(tags)
        if not new_tags:
            error_str='\n'.join([' '.join(word) for word in sentence])
            # The % must be applied to the message string; applying it to
            # the Exception object raised TypeError instead of this error.
            raise Exception("Sentence should be given in IOB format! "
                            "Please check sentence %i \n %s" % (i+1,error_str))
        # convert tagging scheme
        if tag_scheme=='iob':
            pass
        elif tag_scheme=='iobes':
            new_tags=iob_iobes(new_tags)
        else:
            raise Exception('Unknown tag scheme!')
        new_sentences.append([[word[0],tag] for word,tag in zip(sentence,new_tags)])
    return new_sentences
Example #9
0
def update_tag_scheme(sentences, tag_scheme):
    """
    Rewrite sentence tags in place for ``tag_scheme``, skipping bad ones.

    Each word is a mutable sequence whose last item is the tag.  A
    sentence rejected by ``iob2`` is reported on stdout and left as it
    is; it is not removed from ``sentences``.  Valid sentences get their
    tags stored back as-is ('iob') or converted by ``iob_iobes``
    ('iobes').

    Raises Exception when ``tag_scheme`` is neither 'iob' nor 'iobes'
    (only reached for a sentence that passed the ``iob2`` check).
    """
    for index, sentence in enumerate(sentences):
        tags = [token[-1] for token in sentence]
        # NOTE(review): presumably iob2 normalises `tags` in place
        # (IOB1 -> IOB2) -- verify against its definition.
        if not iob2(tags):
            dump = '\n'.join(' '.join(token) for token in sentence)
            print('Removing Problematic sentence: %i:\n%s' % (index, dump))
            continue
        if tag_scheme == 'iob':
            converted = tags
        elif tag_scheme == 'iobes':
            converted = iob_iobes(tags)
        else:
            raise Exception('Unknown tagging scheme!')
        # Store the final tag in the last field of every word.
        for token, tag in zip(sentence, converted):
            token[-1] = tag
Example #10
0
def batch_yield(data,
                batch_size,
                vocab,
                tag2label,
                shuffle=False,
                iob2iobes=True):
    """
    Yield (sequences, labels) batches built from ``data``.

    :param data: list of (tokens, tags) pairs, e.g.
        (['19421', '21215', '14459'], ['B-b', 'I-b', 'O']).
    :param batch_size: number of sentences collected before a batch is
        yielded; the last batch may be shorter.
    :param vocab: word2id dictionary, passed to ``sentence2id``.
    :param tag2label: mapping from tag string to label value (the tags are
        those shown for ``data``, after the optional conversion).
    :param shuffle: when true, ``data`` is shuffled in place first.
    :param iob2iobes: when true, each tag list goes through ``iob_iobes``
        before being mapped to labels.
    :return: generator of (seqs, labels) where ``seqs`` is a list of
        ``sentence2id`` results and ``labels`` a list of label lists,
        e.g. ([[4437, 4437, ...], ...], [[0, 0, ...], ...]).
    """
    if shuffle:
        random.shuffle(data)
    batch_seqs = []
    batch_labels = []
    for words, tags in data:
        word_ids = sentence2id(words, vocab)
        # Diagnostic output for inputs that already carry this tag.
        if 'E-PER.NOM' in tags:
            print(tags)
        scheme_tags = iob_iobes(tags) if iob2iobes else tags
        encoded = [tag2label[name] for name in scheme_tags]

        # Emit the batch once it is full, before adding the current item.
        if len(batch_seqs) == batch_size:
            yield batch_seqs, batch_labels
            batch_seqs = []
            batch_labels = []

        batch_seqs.append(word_ids)
        batch_labels.append(encoded)

    # Flush whatever is left as a final, possibly shorter, batch.
    if batch_seqs:
        yield batch_seqs, batch_labels