Exemple #1
0
def generateFullTagFile(full_tag_file, boundary4training, class4training,
                        sen_ent4developing, not4train, ebao_dic, datatype,
                        tag_strategy, texttype):
    """Write boundary and class feature files for a fully tagged corpus.

    Every line of ``full_tag_file`` is parsed with ``generateSenEntities``,
    passed through ``symbolProcess`` and converted into boundary features
    (``generateFeature.boundaryFeatureGeneration``) and class features
    (``generateFeature.classFeatureGeneration``).  Lines that fail to parse,
    or that have no entities left after ``symbolProcess``, are skipped.

    Args:
        full_tag_file: path of the tagged input file, one sentence per line.
        boundary4training: output path for the boundary features.
        class4training: output path for the class features.
        sen_ent4developing: output path for the second value returned by
            ``classFeatureGeneration``; only opened when ``not4train == '1'``.
        not4train: the string ``'1'`` enables the extra output file and the
            return value.
        ebao_dic: forwarded unchanged to both feature generators.
        datatype: when ``'train'``, a sentence whose single entity covers the
            whole sentence and has type ``'specifications'`` is dropped.
        tag_strategy: forwarded to ``boundaryFeatureGeneration``.
        texttype: forwarded to ``generateSenEntities`` and
            ``classFeatureGeneration``.

    Returns:
        ``(sentences, sen_tags)`` when ``not4train == '1'``, otherwise
        ``None``.
    """
    with open(full_tag_file, 'r') as f:
        lines = f.readlines()

    keep_dev_output = not4train == '1'
    sentences = []
    sen_tags = []
    fw_b = fw_c = fw_e = None
    try:
        fw_b = open(boundary4training, 'w')
        fw_c = open(class4training, 'w')
        if keep_dev_output:
            fw_e = open(sen_ent4developing, 'w')

        for line in lines:
            # Strip only the line terminators so that leading spaces of the
            # original sentence are preserved.
            line = line.replace('\n', '').replace('\r', '')
            # Replace full-width spaces (U+3000, written here as its UTF-8
            # bytes because this is a Python 2 byte string) with ASCII ones.
            line = line.replace('\xe3\x80\x80', ' ')
            try:
                sentence, entities_in_sentence = generateSenEntities(
                    line, texttype)
            except Exception as e:
                # `sentence` is not (re)bound when parsing fails, so report
                # the raw line and the error instead of a stale value.
                print(line)
                print(repr(e))
                continue

            # Training data only: drop examples where the whole sentence is
            # tagged as one 'specifications' entity.
            if datatype == 'train' and len(entities_in_sentence) == 1:
                only_entity = entities_in_sentence[0]
                if (only_entity.content == sentence
                        and only_entity.type == 'specifications'):
                    continue

            # Symbol replacement and space handling.  The original author
            # notes this duplicates work done during segmentation but is
            # needed for the dictionary features.
            new_sentence, new_entities = symbolProcess(sentence,
                                                       entities_in_sentence)
            if len(new_entities) == 0:
                continue

            feature_b, tags_in_sen = generateFeature.boundaryFeatureGeneration(
                new_sentence, new_entities, ebao_dic, 'full', tag_strategy)
            fw_b.write(feature_b)

            feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
                new_sentence, new_entities, ebao_dic, texttype)
            fw_c.write(feature_c)

            if keep_dev_output:
                fw_e.write(sen_ent4error)
                sentences.append(new_sentence.replace('\r', ''))
                sen_tags.append(tags_in_sen)
    finally:
        # Close whatever was opened, even if feature generation raised.
        for handle in (fw_b, fw_c, fw_e):
            if handle is not None:
                handle.close()

    if keep_dev_output:
        print(sen_ent4developing + ' generated!')
    print(boundary4training + ' generated!')
    print(class4training + ' generated!')
    if keep_dev_output:
        return sentences, sen_tags
def selectActiveData(unselected, selected, model, num):
    """Score each unselected sample by the summed entropy of its tag marginals.

    For every line the boundary features are generated in ``'demo'`` mode,
    decoded with the CRF model at ``model``, and the entropy of the marginal
    distribution over the five boundary tags is summed over all positions.

    NOTE(review): as written, the function only fills a local list of
    ``(line, entropy)`` pairs and returns ``None``; ``selected`` and ``num``
    are never read.  Presumably a ranking/selection step is missing --
    confirm against the callers.

    Args:
        unselected: iterable of tagged sample lines.
        selected: unused in the visible code.
        model: path of the CRF boundary model to open.
        num: unused in the visible code.
    """
    tagger_bp = crfsuite.Tagger()
    tagger_bp.open(model)
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']
    entropy = []
    for line in unselected:
        sentence, entities_in_sentence = processing.generateSenEntities(
            line, '')
        new_sentence, _new_entities = processing.symbolProcess(
            sentence, entities_in_sentence)
        feature_string, tags = generateFeature.boundaryFeatureGeneration(
            new_sentence, [], ebao_dic, 'demo', '0')
        try:
            instances = feature_string.strip().split('\n')
        except AttributeError:
            # No usable feature string for this line: skip it rather than
            # fall through with an unbound or stale `instances`.
            print('feature_string:%s.' % line)
            continue

        xseq = crfsuite.ItemSequence()
        for instance in instances:
            item = crfsuite.Item()
            # Attributes start at the third tab-separated column.
            for field in instance.split('\t')[2:]:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
        tagger_bp.set(xseq)

        yseq_b = tagger_bp.viterbi()

        ie_entity = 0.0
        for j in range(len(yseq_b)):
            ie = 0.0  # entropy at position j
            for ent_tag in bieso:
                try:
                    tag_prob = tagger_bp.marginal(ent_tag, j)
                except Exception:
                    # Report the offending sample and propagate the error;
                    # the previous exit(0) ended the process with a success
                    # status.
                    print(line)
                    raise
                # A zero-probability tag contributes nothing to the entropy
                # (and 1 / 0.0 would raise ZeroDivisionError).
                if tag_prob > 0:
                    ie += tag_prob * math.log(1 / tag_prob, 2)
            ie_entity += ie
        entropy.append((line, ie_entity))
Exemple #3
0
def generatePartialTagFile(partial_tag_file, ebao_dic, tag_strategy, sen_num):
    """Return boundary features of partially tagged sentences, best first.

    Lines of ``partial_tag_file`` are parsed with ``generateSenEntities``;
    sentences with at least one entity get boundary features in
    ``'partial'`` mode and a score from ``predictValue``.  The stripped
    feature strings are returned ordered by descending score, and each score
    and sentence is printed along the way.

    Args:
        partial_tag_file: path of the partially tagged input file.
        ebao_dic: forwarded to ``boundaryFeatureGeneration``.
        tag_strategy: forwarded to ``boundaryFeatureGeneration``.
        sen_num: limit on the number of input lines read (see the note in the
            loop).

    Returns:
        List of feature strings, one per distinct (sentence, features) pair.
    """
    with open(partial_tag_file, 'r') as f:
        lines = f.readlines()

    # Keyed by (sentence, features) so the two parts never have to be
    # re-split from a joined string, which truncated the features whenever
    # they contained the '\t\t\t' separator.
    predict_value_dic = {}
    sentence_num = 0
    for line in lines:
        # NOTE(review): with `>` this admits sen_num + 1 lines -- possibly an
        # off-by-one; kept as is because callers may rely on it.
        if sentence_num > sen_num:
            break
        sentence_num += 1
        # Strip only the line terminators so leading spaces are preserved.
        line = line.replace('\n', '').replace('\r', '')
        # Replace full-width spaces (U+3000 as UTF-8 bytes, Python 2 byte
        # string) with ASCII spaces.
        line = line.replace('\xe3\x80\x80', ' ')

        sentence = None
        try:
            sentence, entities_in_sentence = generateSenEntities(line)

            # Nothing to learn from a sentence without entities.
            if len(entities_in_sentence) == 0:
                continue

            # Feature rows have the form: S3tag \t S1tag \t features
            feature_string, tags_in_sen = (
                generateFeature.boundaryFeatureGeneration(
                    sentence, entities_in_sentence, ebao_dic, 'partial',
                    tag_strategy))
            # Check for a missing feature string before it is used.
            if feature_string is None:
                print('None: %s' % line)
                continue
            predict_value = predictValue(feature_string)
        except Exception as e:
            print(e)
            # `sentence` is only bound once parsing succeeded; fall back to
            # the raw line otherwise.
            print(line if sentence is None else sentence)
            continue
        predict_value_dic[(sentence, feature_string.strip())] = predict_value

    # Highest predicted value first.
    sorted_predict_values = sorted(predict_value_dic.items(),
                                   key=lambda d: d[1],
                                   reverse=True)

    sample_feature_list = []
    for (sentence, features), value in sorted_predict_values:
        print(str(value))
        print(sentence + '\n')
        sample_feature_list.append(features)

    return sample_feature_list
Exemple #4
0
def generateBoundaryTagFile(source, target_file, ebao_dic, not4train,
                            sen_ent4developing):
    """Write the boundary feature file for a tagged corpus.

    Each line is parsed with ``generateSenEntities``, passed through
    ``symbolProcess`` and converted with
    ``generateFeature.boundaryFeatureGeneration`` in ``'full'`` mode.  Lines
    that fail to parse, or that have no entities left, are skipped.

    Args:
        source: either a list of lines or the path of a file to read them
            from.
        target_file: output path for the boundary features.
        ebao_dic: forwarded unchanged to the feature generators.
        not4train: the string ``'1'`` additionally writes the second value
            returned by ``classFeatureGeneration`` to ``sen_ent4developing``
            and enables the return value.
        sen_ent4developing: output path used only when ``not4train == '1'``.

    Returns:
        ``(sentences, sen_tags)`` when ``not4train == '1'``, otherwise
        ``None``.
    """
    if isinstance(source, list):
        lines = source
    else:
        with open(source, 'r') as f:
            lines = f.readlines()

    keep_dev_output = not4train == '1'
    sentences = []
    sen_tags = []
    fw_b = fw_e = None
    try:
        fw_b = open(target_file, 'w')
        if keep_dev_output:
            fw_e = open(sen_ent4developing, 'w')

        for line in lines:
            # Strip only the line terminators so leading spaces are
            # preserved.
            line = line.replace('\n', '').replace('\r', '')
            # Replace full-width spaces (U+3000 as UTF-8 bytes, Python 2
            # byte string) with ASCII spaces.
            line = line.replace('\xe3\x80\x80', ' ')
            try:
                sentence, entities_in_sentence = generateSenEntities(line, '')
            except Exception as e:
                # `sentence` is not (re)bound when parsing fails, so report
                # the raw line and the error instead of a stale value.
                print(line)
                print(repr(e))
                continue

            new_sentence, new_entities = symbolProcess(sentence,
                                                       entities_in_sentence)
            if len(new_entities) == 0:
                continue

            feature_b, tags_in_sen = generateFeature.boundaryFeatureGeneration(
                new_sentence, new_entities, ebao_dic, 'full', '')
            fw_b.write(feature_b)

            if keep_dev_output:
                feature_c, sen_ent4error = (
                    generateFeature.classFeatureGeneration(
                        new_sentence, new_entities, ebao_dic, ''))
                fw_e.write(sen_ent4error)
                sentences.append(new_sentence.replace('\r', ''))
                sen_tags.append(tags_in_sen)
    finally:
        # Close whatever was opened, even if feature generation raised.
        for handle in (fw_b, fw_e):
            if handle is not None:
                handle.close()

    print(target_file + ' generated!')
    if keep_dev_output:
        print(sen_ent4developing + ' generated!')
        return sentences, sen_tags
Exemple #5
0
def mainfunction(sen, postProcess, texttype, index):
    """Tag one sentence with the two-layer (boundary, then class) CRF models.

    The boundary and class models are opened from
    ``<root>/models/boundarymodel-<index>`` and ``.../classmodel-<index>``.
    The boundary tagger proposes entity spans, the class tagger assigns a
    type to each span, and the result is expressed as a per-character
    B/I/E/S/O tag sequence.

    Args:
        sen: input sentence (UTF-8 byte string); surrounding whitespace is
            stripped.
        postProcess: the string ``'1'`` runs ``postProcessing.twoProcessings``
            on the tag sequence.
        texttype: forwarded to ``classFeatureGeneration`` and
            ``twoProcessings``.
        index: suffix of the model file names.

    Returns:
        ``(entity_list, new_yseq)`` -- one ``content[type]`` line per entity,
        and the final per-character tag sequence.
    """
    boundary_model_path = os.path.join(root, './models/boundarymodel-' + index)
    class_model_path = os.path.join(root, './models/classmodel-' + index)

    boundary_tagger = crfsuite.Tagger()
    boundary_tagger.open(boundary_model_path)
    class_tagger = crfsuite.Tagger()
    class_tagger.open(class_model_path)

    text = sen.strip()

    # ---- layer 1: boundary detection ----
    rows = []
    feature_string, tags = generateFeature.boundaryFeatureGeneration(
        text, [], ebao_dic, 'demo', '0')
    try:
        rows = feature_string.strip().split('\n')
    except AttributeError:
        # Continue with an empty item sequence.
        print('feature_string:%s.' % feature_string)

    boundary_items = crfsuite.ItemSequence()
    for row in rows:
        item = crfsuite.Item()
        # Attributes start at the third tab-separated column.
        for attribute in row.split('\t')[2:]:
            item.append(crfsuite.Attribute(attribute))
        boundary_items.append(item)
    boundary_tagger.set(boundary_items)
    yseq_b = boundary_tagger.viterbi()

    text_unicode = text.decode('utf-8')

    # ---- layer 2: entity classification ----
    boundary_ents, _boundary_spans = evaluation.generateEntList([yseq_b])

    # This list may be empty when the boundary layer found nothing.
    entities = []
    for ent in boundary_ents[0]:
        begin = ent[0]
        stop = ent[1]
        label = ent[2]
        surface = text_unicode[begin:stop].encode('utf-8')
        entities.append(Entity(surface, begin, stop, label))

    feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
        text, entities, ebao_dic, texttype)
    ents_type = []
    for block in feature_c.strip().split('\n\n'):
        single_item_seq = crfsuite.ItemSequence()
        item = crfsuite.Item()
        # Attributes start at the second tab-separated column.
        for attribute in block.split('\t')[1:]:
            item.append(crfsuite.Attribute(attribute))
        single_item_seq.append(item)
        class_tagger.set(single_item_seq)
        ents_type.append(class_tagger.viterbi()[0])

    # ---- merge both layers into one per-character tag sequence ----
    new_yseq = ['O'] * len(text_unicode)
    for position, entity in enumerate(entities):
        first = entity.start_pos
        past_last = entity.end_pos
        enttype = ents_type[position]
        if first + 1 == past_last:
            new_yseq[first] = 'S-' + enttype
        else:
            new_yseq[first] = 'B-' + enttype
            for inner in range(first + 1, past_last - 1):
                new_yseq[inner] = 'I-' + enttype
            new_yseq[past_last - 1] = 'E-' + enttype

    # Original author's note: the start/end list used in evaluation is not
    # adjusted by this step.
    if postProcess == '1':
        new_yseq = postProcessing.twoProcessings(text_unicode, new_yseq,
                                                 ebao_dic, texttype)

    final_ents, _final_spans = evaluation.generateEntList([new_yseq])

    entity_list = ''
    for ent in final_ents[0]:
        content = text_unicode[ent[0]:ent[1]]
        enttype = ent[2]
        if enttype == '':
            # Debug output for entities that ended up without a type.
            print('%s %s' % (text_unicode.encode('utf8'),
                             content.encode('utf8')))
        entity_list += content.encode(
            'utf8') + '[' + en_cn_dic[enttype] + ']\n'
    return entity_list, new_yseq
Exemple #6
0
def mainfunction(inputstring, taggerb, taggerc):
    """Run two-layer NER over a multi-line text and build HTML output.

    The text is normalised with ``tools.uniformSignal``, split on newlines
    and then into sentences with ``tools.sentence_split``.  Each sentence is
    tagged by the boundary tagger, the proposed spans are typed by the class
    tagger, and ``generateNerInSentence`` renders the result.

    NOTE(review): the empty-input branch returns a single string while the
    normal path returns a 2-tuple -- confirm callers handle both shapes.
    NOTE(review): ``texttype`` is not a parameter here; it is presumably a
    module-level name -- verify it is defined.

    Args:
        inputstring: raw input text (UTF-8 byte string).
        taggerb: opened boundary tagger.
        taggerc: opened class tagger.

    Returns:
        ``(ner_lines, new_term_list)``, or a prompt string when
        ``inputstring`` is empty.
    """
    if inputstring == '':
        return '请输入句子'

    # Sentence pre-processing.
    normalized = tools.uniformSignal(inputstring)

    ner_lines = ''
    new_term_list = ''
    model_chosen = '2layer'

    for paragraph in normalized.split('\n'):
        ner_line = ''
        term_list = ''
        for raw_sentence in tools.sentence_split(paragraph):
            line = raw_sentence.strip()
            if line == '':
                continue
            # Skip markup: anything starting with '<' and ending with '>'.
            if line[0] == '<' and line[-1] == '>':
                continue

            # ---- layer 1: boundary detection ----
            rows = []
            feature_string, tags = generateFeature.boundaryFeatureGeneration(
                line, [], ebao_dic, 'demo', '0')
            try:
                rows = feature_string.strip().split('\n')
            except AttributeError:
                # Continue with an empty item sequence.
                print('feature_string:%s.' % feature_string)

            boundary_items = crfsuite.ItemSequence()
            for row in rows:
                item = crfsuite.Item()
                # Attributes start at the third tab-separated column.
                for attribute in row.split('\t')[2:]:
                    item.append(crfsuite.Attribute(attribute))
                boundary_items.append(item)
            taggerb.set(boundary_items)

            yseq_b = taggerb.viterbi()
            # The sequence probability is requested but its value is unused.
            taggerb.probability(yseq_b)
            line_unicode = line.decode('utf-8')

            # ---- layer 2: entity classification ----
            boundary_ents, _boundary_spans = evaluation.generateEntList(
                [yseq_b])

            # This list may be empty when the boundary layer found nothing.
            entities = []
            for ent in boundary_ents[0]:
                begin = ent[0]
                stop = ent[1]
                label = ent[2]
                surface = line_unicode[begin:stop].encode('utf-8')
                entities.append(Entity(surface, begin, stop, label))

            feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
                line, entities, ebao_dic, texttype)
            ents_type = []
            for block in feature_c.strip().split('\n\n'):
                single_item_seq = crfsuite.ItemSequence()
                item = crfsuite.Item()
                # Attributes start at the second tab-separated column.
                for attribute in block.split('\t')[1:]:
                    item.append(crfsuite.Attribute(attribute))
                single_item_seq.append(item)
                taggerc.set(single_item_seq)
                ents_type.append(taggerc.viterbi()[0])

            # ---- merge both layers into a per-character tag sequence ----
            new_yseq = ['O'] * len(line_unicode)
            for position, entity in enumerate(entities):
                first = entity.start_pos
                past_last = entity.end_pos
                if first + 1 == past_last:
                    new_yseq[first] = 'S-' + ents_type[position]
                else:
                    new_yseq[first] = 'B-' + ents_type[position]
                    for inner in range(first + 1, past_last - 1):
                        new_yseq[inner] = 'I-' + ents_type[position]
                    new_yseq[past_last - 1] = 'E-' + ents_type[position]

            sen_ent_colored, ent_list = generateNerInSentence(
                line_unicode, new_yseq, model_chosen, ebao_dic)

            new_term_list += ent_list

            # Fall back to the plain sentence when nothing was rendered.
            if sen_ent_colored == '':
                sen_ent_colored = line
            ner_line += sen_ent_colored
            term_list += ent_list

        # One paragraph for the rendered sentences, one for their terms.
        ner_lines += '<p>' + ner_line + '</p>'
        ner_lines += '<p>' + term_list + '</p>'
        ner_lines += '<br/>'

    return ner_lines, new_term_list
Exemple #7
0
def getNerResult(inputstring, tagger_b, tagger_c, bieso):
    """Run two-layer NER over a text and concatenate the per-sentence output.

    The text is split with ``tools.sentence_split``; every non-empty
    sentence is tagged by the boundary tagger, the proposed spans are typed
    by the class tagger, and the value returned by ``generateNerInSentence``
    is appended to the result.

    Args:
        inputstring: input text (UTF-8 byte string).
        tagger_b: opened boundary tagger.
        tagger_c: opened class tagger.
        bieso: not read by this function.

    Returns:
        Concatenation of the ``generateNerInSentence`` results.
    """
    model_chosen = '2layer'
    ent_list = ''

    for raw_sentence in tools.sentence_split(inputstring):
        line = raw_sentence.strip()
        # Only empty sentences are skipped; markup-like lines are kept.
        if line == '':
            continue

        # ---- layer 1: boundary detection ----
        rows = []
        feature_string, tags = generateFeature.boundaryFeatureGeneration(
            line, [], ebao_dic, 'demo', '0')
        try:
            rows = feature_string.strip().split('\n')
        except AttributeError:
            # Continue with an empty item sequence.
            print('feature_string:%s.' % feature_string)

        boundary_items = crfsuite.ItemSequence()
        for row in rows:
            item = crfsuite.Item()
            # Attributes start at the third tab-separated column.
            for attribute in row.split('\t')[2:]:
                item.append(crfsuite.Attribute(attribute))
            boundary_items.append(item)
        tagger_b.set(boundary_items)

        yseq_b = tagger_b.viterbi()
        # The sequence probability is requested but its value is unused.
        tagger_b.probability(yseq_b)
        line_unicode = line.decode('utf-8')

        # ---- layer 2: entity classification ----
        boundary_ents, _boundary_spans = evaluation.generateEntList([yseq_b])

        # This list may be empty when the boundary layer found nothing.
        entities = []
        for ent in boundary_ents[0]:
            begin = ent[0]
            stop = ent[1]
            label = ent[2]
            surface = line_unicode[begin:stop].encode('utf-8')
            entities.append(Entity(surface, begin, stop, label))

        feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
            line, entities, ebao_dic, '')
        ents_type = []
        for block in feature_c.strip().split('\n\n'):
            single_item_seq = crfsuite.ItemSequence()
            item = crfsuite.Item()
            # Attributes start at the second tab-separated column.
            for attribute in block.split('\t')[1:]:
                item.append(crfsuite.Attribute(attribute))
            single_item_seq.append(item)
            tagger_c.set(single_item_seq)
            ents_type.append(tagger_c.viterbi()[0])

        # ---- merge both layers into a per-character tag sequence ----
        new_yseq = ['O'] * len(line_unicode)
        for position, entity in enumerate(entities):
            first = entity.start_pos
            past_last = entity.end_pos
            if first + 1 == past_last:
                new_yseq[first] = 'S-' + ents_type[position]
            else:
                new_yseq[first] = 'B-' + ents_type[position]
                for inner in range(first + 1, past_last - 1):
                    new_yseq[inner] = 'I-' + ents_type[position]
                new_yseq[past_last - 1] = 'E-' + ents_type[position]

        ent_list += generateNerInSentence(line_unicode, new_yseq,
                                          model_chosen, ebao_dic)
    return ent_list
Exemple #8
0
def semiSupervisedProcessing(model_previous, fsamples, ie_value, ebao_dic):
    """Select uncertain samples and emit partially tagged training features.

    Each sample is decoded with the previous boundary model.  For every
    predicted entity span, the entropy of the tag marginals is summed over
    the span; spans whose entropy exceeds ``ie_value`` and that lie inside an
    entity of the sample's own annotation are kept, using the annotated
    span.  Samples with at least one kept span are converted to feature rows
    via ``generateFeature.getCharEntityFPTag`` and
    ``generateFeature.getCharEntityPartialTag``.

    The dictionary-based fallback that used to follow an unconditional
    ``continue`` was unreachable and has been removed; selection behaviour
    is unchanged.

    Args:
        model_previous: path of the CRF boundary model to open.
        fsamples: iterable of tagged sample lines.
        ie_value: entropy threshold a predicted span must exceed.
        ebao_dic: forwarded to ``boundaryFeatureGeneration``.

    Returns:
        ``(cdd4training_semi, cdd4training_semi_number)`` -- the feature
        strings of the selected samples and how many there are.
    """
    tagger_bp = crfsuite.Tagger()
    tagger_bp.open(model_previous)
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']
    cdd4training_semi = []
    for line in fsamples:
        sentence, entities_in_sentence = generateSenEntities(line)
        new_sentence, new_entities = symbolProcess(sentence,
                                                   entities_in_sentence)
        sentence_unicode = new_sentence.decode('utf-8')
        tag_seq = generateTagSeq(sentence_unicode, new_entities)
        feature_string, tags = generateFeature.boundaryFeatureGeneration(
            new_sentence, [], ebao_dic, 'demo', '0')
        try:
            instances = feature_string.strip().split('\n')
        except AttributeError:
            # No usable feature string for this line: skip it rather than
            # fall through with an unbound or stale `instances`.
            print('feature_string:%s.' % line)
            continue

        xseq = crfsuite.ItemSequence()
        features = []
        for instance in instances:
            # Attributes start at the third tab-separated column.
            attributes = instance.split('\t')[2:]
            features.append(attributes)
            item = crfsuite.Item()
            for field in attributes:
                item.append(crfsuite.Attribute(field))
            xseq.append(item)
        tagger_bp.set(xseq)

        yseq_b = tagger_bp.viterbi()
        length = len(yseq_b)

        # Spans predicted by the model (1) and spans from the sample's own
        # annotation (2).
        sen_ent_list1, start_end_list1 = evaluation.generateEntList([yseq_b])
        sen_ent_list2, start_end_list2 = evaluation.generateEntList([tag_seq])
        predicted_spans = start_end_list1[0]
        if len(predicted_spans) == 0:
            continue

        ents = []
        selected_entity = 0
        ent_index = 0  # end of the last annotated span already selected
        for span in predicted_spans:
            ent_start = span[0]
            if ent_start < ent_index:
                continue
            ent_end = span[1]

            ie_entity = 0.0
            for j in range(ent_start, ent_end):
                ie = 0.0  # entropy at position j
                for ent_tag in bieso:
                    tag_prob = tagger_bp.marginal(ent_tag, j)
                    # A zero-probability tag contributes nothing to the
                    # entropy (and 1 / 0.0 would raise ZeroDivisionError).
                    if tag_prob > 0:
                        ie += tag_prob * math.log(1 / tag_prob, 2)
                ie_entity += ie

            if not ie_entity > ie_value:
                continue

            # Keep the first annotated span that fully covers the predicted
            # one; predicted spans without such cover are dropped.
            for annotated in start_end_list2[0]:
                start_m = annotated[0]
                end_m = annotated[1]
                if ent_start >= start_m and ent_end <= end_m:
                    ents.append(
                        Entity(
                            sentence_unicode[start_m:end_m].encode('utf-8'),
                            int(start_m), int(end_m), 'entity'))
                    ent_index = end_m
                    selected_entity += 1
                    break

        if selected_entity == 0:
            continue

        char_entity_tag_list = generateFeature.getCharEntityFPTag(
            sentence_unicode, ents, '1')
        char_entity_tag_list = generateFeature.getCharEntityPartialTag(
            char_entity_tag_list)

        # One row per character: tag, then the attributes generated above.
        new_feature_str = ''.join(
            '%s\t%s\n' % (char_entity_tag_list[j][1][0],
                          '\t'.join(features[j]))
            for j in range(length))

        cdd4training_semi.append(new_feature_str.strip())
    return cdd4training_semi, len(cdd4training_semi)